From: sdong Date: Thu, 17 Apr 2014 22:14:04 +0000 (-0700) Subject: Fix bugs introduced by D17961 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=034b494774c66738b9c21402d95bb6f4f09b3cd5;p=rocksdb.git Fix bugs introduced by D17961 Summary: D17961 has two bugs: (1) two level iterator fails to populate FileMetaData.table_reader, causing performance regression. (2) table cache handle the !status.ok() case in the wrong place, causing seg fault which shouldn't happen. Test Plan: make all check Reviewers: ljin, igor, haobo Reviewed By: ljin CC: yhchiang, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D17991 Conflicts: db/version_set.cc --- 034b494774c66738b9c21402d95bb6f4f09b3cd5 diff --git a/.arcconfig b/.arcconfig new file mode 100644 index 00000000..85ca38f2 --- /dev/null +++ b/.arcconfig @@ -0,0 +1,10 @@ +{ + "project_id" : "rocksdb", + "conduit_uri" : "https://reviews.facebook.net/", + "copyright_holder" : "Facebook", + "load" : [ + "linters" + ], + "lint.engine" : "FacebookFbcodeLintEngine", + "lint.engine.single.linter" : "FbcodeCppLinter" +} diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..7c279811 --- /dev/null +++ b/.clang-format @@ -0,0 +1,5 @@ +# Complete list of style options can be found at: +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +BasedOnStyle: Google +... 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..a3a70ee3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +build_config.mk + +*.a +*.arc +*.d +*.dylib* +*.gcda +*.gcno +*.o +*.so +*.so.* +*_test +*_bench +*_stress +*.out +*.class +*.jar +*.*jnilib* +*.d-e + +ldb +manifest_dump +sst_dump +util/build_version.cc +build_tools/VALGRIND_LOGS/ +coverage/COVERAGE_REPORT +.gdbhistory +.phutil_module_cache +tags +java/*.log +java/include/org_rocksdb_*.h diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..7a94c7f6 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,20 @@ +# Contributing to RocksDB + +## Contributor License Agreement ("CLA") + +In order to accept your pull request, we need you to submit a CLA. You +only need to do this once, so if you've done this for another Facebook +open source project, you're good to go. If you are submitting a pull +request for the first time, just let us know that you have completed +the CLA and we can cross-check with your GitHub username. + +Complete your CLA here: + +If you don't have a Facebook account, we can send you a PDF that you can +sign offline. Send us an e-mail or create a new github issue to +request the CLA in PDF format. + +## License + +By contributing to RocksDB, you agree that your contributions will be +licensed under the [BSD License](LICENSE). diff --git a/HISTORY.md b/HISTORY.md new file mode 100644 index 00000000..779cd379 --- /dev/null +++ b/HISTORY.md @@ -0,0 +1,72 @@ +# Rocksdb Change Log + +## Unreleased + +### Public API changes + +## 2.8.0 (04/04/2014) + +* Removed arena.h from public header files. 
+* By default, checksums are verified on every read from database +* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false +* Added is_manual_compaction to CompactionFilter::Context +* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op. +* Removed BackupEngine::DeleteBackupsNewerThan() function +* Added new option -- verify_checksums_in_compaction +* Changed Options.prefix_extractor from raw pointer to shared_ptr (take ownership) + Changed HashSkipListRepFactory and HashLinkListRepFactory constructor to not take SliceTransform object (use Options.prefix_extractor implicitly) +* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools +* Added a command "checkconsistency" in ldb tool, which checks + if file system state matches DB state (file existence and file sizes) +* Separate options related to block based table to a new struct BlockBasedTableOptions +* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy +* Add more counters to perf context. +* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table. + +### New Features +* If we find one truncated record at the end of the MANIFEST or WAL files, + we will ignore it. We assume that writers of these records were interrupted + and that we can safely ignore it. +* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory(). +* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory(). 
+* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands. +* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. +* Geo-spatial support for locations and radial-search. + +## 2.7.0 (01/28/2014) + +### Public API changes + +* Renamed `StackableDB::GetRawDB()` to `StackableDB::GetBaseDB()`. +* Renamed `WriteBatch::Data()` `const std::string& Data() const`. +* Renamed class `TableStats` to `TableProperties`. +* Deleted class `PrefixHashRepFactory`. Please use `NewHashSkipListRepFactory()` instead. +* Supported multi-threaded `EnableFileDeletions()` and `DisableFileDeletions()`. +* Added `DB::GetOptions()`. +* Added `DB::GetDbIdentity()`. + +### New Features + +* Added [BackupableDB](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F) +* Implemented [TailingIterator](https://github.com/facebook/rocksdb/wiki/Tailing-Iterator), a special type of iterator that + doesn't create a snapshot (can be used to read newly inserted data) + and is optimized for doing sequential reads. +* Added property block for table, which allows (1) a table to store + its metadata and (2) end user to collect and store properties they + are interested in. +* Enabled caching index and filter block in block cache (turned off by default). +* Supported error report when doing manual compaction. +* Supported additional Linux platform flavors and Mac OS. +* Put with `SliceParts` - Variant of `Put()` that gathers output like `writev(2)` +* Bug fixes and code refactor for compatibility with upcoming Column + Family feature. + +### Performance Improvements + +* Huge benchmark performance improvements by multiple efforts. 
For example, increase in readonly QPS from about 530k in 2.6 release to 1.1 million in 2.7 [1] +* Speeding up a way RocksDB deleted obsolete files - no longer listing the whole directory under a lock -- decrease in p99 +* Use raw pointer instead of shared pointer for statistics: [5b825d](https://github.com/facebook/rocksdb/commit/5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd) -- huge increase in performance -- shared pointers are slow +* Optimized locking for `Get()` -- [1fdb3f](https://github.com/facebook/rocksdb/commit/1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c) -- 1.5x QPS increase for some workloads +* Cache speedup - [e8d40c3](https://github.com/facebook/rocksdb/commit/e8d40c31b3cca0c3e1ae9abe9b9003b1288026a9) +* Implemented autovector, which allocates first N elements on stack. Most of vectors in RocksDB are small. Also, we never want to allocate heap objects while holding a mutex. -- [c01676e4](https://github.com/facebook/rocksdb/commit/c01676e46d3be08c3c140361ef1f5884f47d3b3c) +* Lots of efforts to move malloc, memcpy and IO outside of locks diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 00000000..2a91be69 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,80 @@ +## Dependencies + +RocksDB is developed on Linux (CentOS release 5.2), with gcc 4.8.1. +It depends on gcc with C++11 support. + +* RocksDB depends on the following libraries: + - [zlib](http://www.zlib.net/) - a library for data compression. + - [bzip2](http://www.bzip.org/) - a library for data compression. + - [snappy](https://code.google.com/p/snappy/) - a library for fast + data compression. + - [gflags](https://code.google.com/p/gflags/) - a library that handles + command line flags processing. + +RocksDB will successfully compile without the compression libraries included, +but some things may fail. We do not support releases without the compression +libraries. You are on your own. 
+ +## Supported platforms + +* **Linux - Ubuntu** + * Upgrade your gcc to version at least 4.7 to get C++11 support. + * Install gflags. First, try: `sudo apt-get install libgflags-dev` + If this doesn't work and you're using Ubuntu, here's a nice tutorial: + (http://askubuntu.com/questions/312173/installing-gflags-12-04) + * Install snappy. This is usually as easy as: + `sudo apt-get install libsnappy-dev`. + * Install zlib. Try: `sudo apt-get install zlib1g-dev`. + * Install bzip2: `sudo apt-get install libbz2-dev`. +* **Linux - CentOS** + * Upgrade your gcc to version at least 4.7 to get C++11 support: + `yum install gcc47-c++` + * Install gflags: + + wget https://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz + tar -xzvf gflags-2.0-no-svn-files.tar.gz + cd gflags-2.0 + ./configure && make && sudo make install + + * Install snappy: + + wget https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz + tar -xzvf snappy-1.1.1.tar.gz + cd snappy-1.1.1 + ./configure && make && sudo make install + + * Install zlib: + + sudo yum install zlib + sudo yum install zlib-devel + + * Install bzip2: + + sudo yum install bzip2 + sudo yum install bzip2-devel + +* **OS X**: + * Install latest C++ compiler that supports C++ 11: + * Update XCode: run `xcode-select --install` (or install it from XCode App's settting). + * Install via [homebrew](http://brew.sh/). + * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. + * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). + * Install zlib, bzip2 and snappy libraries for compression. + * Install gflags. We have included a script + `build_tools/mac-install-gflags.sh`, which should automatically install it. + If you installed gflags by other means (for example, `brew install gflags`), + please set `LIBRARY_PATH` and `CPATH` accordingly. + * Please note that some of the optimizations/features are disabled in OSX. 
+ We did not run any production workloads on it. + +* **iOS**: + * Run: `TARGET_OS=IOS make static_lib` + +## Compilation +`make clean; make` will compile librocksdb.a (RocksDB static library) and all +the unit tests. You can run all unit tests with `make check`. + +For shared library builds, exec `make shared_lib` instead. + +If you followed the above steps and your compile or unit tests fail, +please submit an issue: (https://github.com/facebook/rocksdb/issues) diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..b1329018 --- /dev/null +++ b/LICENSE @@ -0,0 +1,35 @@ +BSD License + +For rocksdb software + +Copyright (c) 2014, Facebook, Inc. +All rights reserved. +--------------------------------------------------------------------- + +Copyright (c) 2011 The LevelDB Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..258b951a --- /dev/null +++ b/Makefile @@ -0,0 +1,507 @@ +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. + +# Inherit some settings from environment variables, if available +INSTALL_PATH ?= $(CURDIR) + +#----------------------------------------------- + +ifneq ($(MAKECMDGOALS),dbg) +OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +else +# intentionally left blank +endif + +ifeq ($(MAKECMDGOALS),shared_lib) +PLATFORM_SHARED_LDFLAGS=-fPIC +endif +#----------------------------------------------- + +# detect what platform we're building on +$(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platform $(CURDIR)/build_config.mk)) +# this file is generated by the previous line to set build flags and sources +include build_config.mk + +ifneq ($(PLATFORM), IOS) +CFLAGS += -g +CXXFLAGS += -g +else +# no debug info for IOS, that will make our library big +OPT += -DNDEBUG +endif + +# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. 
+ifdef COMPILE_WITH_ASAN + # ASAN compile flags + EXEC_LDFLAGS += -fsanitize=address + PLATFORM_CCFLAGS += -fsanitize=address + PLATFORM_CXXFLAGS += -fsanitize=address +else + # if we're not compiling with ASAN, use jemalloc + EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) + PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC + PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC +endif + +WARNING_FLAGS = -Wall -Werror -Wno-sign-compare +CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) +CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual + +LDFLAGS += $(PLATFORM_LDFLAGS) + +LIBOBJECTS = $(SOURCES:.cc=.o) +LIBOBJECTS += $(SOURCESCPP:.cpp=.o) +MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) + +TESTUTIL = ./util/testutil.o +TESTHARNESS = ./util/testharness.o $(TESTUTIL) +VALGRIND_ERROR = 2 +VALGRIND_DIR = build_tools/VALGRIND_LOGS +VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) +VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full + +TESTS = \ + db_test \ + block_hash_index_test \ + autovector_test \ + table_properties_collector_test \ + arena_test \ + auto_roll_logger_test \ + block_test \ + bloom_test \ + dynamic_bloom_test \ + c_test \ + cache_test \ + coding_test \ + corruption_test \ + crc32c_test \ + dbformat_test \ + env_test \ + blob_store_test \ + filelock_test \ + filename_test \ + filter_block_test \ + histogram_test \ + log_test \ + manual_compaction_test \ + memenv_test \ + merge_test \ + redis_test \ + reduce_levels_test \ + plain_table_db_test \ + prefix_test \ + simple_table_db_test \ + skiplist_test \ + stringappend_test \ + ttl_test \ + backupable_db_test \ + version_edit_test \ + version_set_test \ + write_batch_test\ + deletefile_test \ + table_test \ + thread_local_test \ + geodb_test + +TOOLS = \ + sst_dump \ + db_sanity_test \ + db_stress \ + ldb \ + db_repl_stress \ + blob_store_bench + + +PROGRAMS = db_bench signal_test table_reader_bench $(TESTS) $(TOOLS) 
+BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench + +# The library name is configurable since we are maintaining libraries of both +# debug/release mode. +ifeq ($(LIBNAME),) + LIBNAME=librocksdb +endif +LIBRARY = ${LIBNAME}.a +MEMENVLIBRARY = libmemenv.a + +default: all + +#----------------------------------------------- +# Create platform independent shared libraries. +#----------------------------------------------- +ifneq ($(PLATFORM_SHARED_EXT),) + +ifneq ($(PLATFORM_SHARED_VERSIONED),true) +SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1) +SHARED3 = $(SHARED1) +SHARED = $(SHARED1) +else +# Update db.h if you change these. +SHARED_MAJOR = 2 +SHARED_MINOR = 0 +SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) +SHARED2 = $(SHARED1).$(SHARED_MAJOR) +SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) +SHARED = $(SHARED1) $(SHARED2) $(SHARED3) +$(SHARED1): $(SHARED3) + ln -fs $(SHARED3) $(SHARED1) +$(SHARED2): $(SHARED3) + ln -fs $(SHARED3) $(SHARED2) +endif + +$(SHARED3): + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES) -o $@ + +endif # PLATFORM_SHARED_EXT + +.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ + release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ + dbg + +all: $(LIBRARY) $(PROGRAMS) + +static_lib: $(LIBRARY) + +shared_lib: $(SHARED) + +dbg: $(LIBRARY) $(PROGRAMS) + +# Will also generate shared libraries. +release: + $(MAKE) clean + OPT="-DNDEBUG -O2" $(MAKE) all -j32 + +coverage: + $(MAKE) clean + COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32 + (cd coverage; ./coverage_test.sh) + # Delete intermediate files + find . 
-type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + +check: $(PROGRAMS) $(TESTS) $(TOOLS) + for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done + python tools/ldb_test.py + +ldb_tests: all $(PROGRAMS) $(TOOLS) + python tools/ldb_test.py + +crash_test: blackbox_crash_test whitebox_crash_test + +blackbox_crash_test: db_stress + python -u tools/db_crashtest.py + +whitebox_crash_test: db_stress + python -u tools/db_crashtest2.py + +asan_check: + $(MAKE) clean + COMPILE_WITH_ASAN=1 $(MAKE) check -j32 + $(MAKE) clean + +asan_crash_test: + $(MAKE) clean + COMPILE_WITH_ASAN=1 $(MAKE) crash_test -j32 + $(MAKE) clean + +valgrind_check: all $(PROGRAMS) $(TESTS) + mkdir -p $(VALGRIND_DIR) + echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \ + echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \ + for t in $(filter-out skiplist_test,$(TESTS)); do \ + stime=`date '+%s'`; \ + $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ + if [ $$? -eq $(VALGRIND_ERROR) ] ; then \ + echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \ + fi; \ + etime=`date '+%s'`; \ + echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \ + done + +clean: + -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk + -rm -rf ios-x86/* ios-arm/* + -find . -name "*.[od]" -exec rm {} \; + -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; +tags: + ctags * -R + cscope -b `find . -name '*.cc'` `find . 
-name '*.h'` + +format: + build_tools/format-diff.sh + +# --------------------------------------------------------------------------- +# Unit tests and tools +# --------------------------------------------------------------------------- +$(LIBRARY): $(LIBOBJECTS) + rm -f $@ + $(AR) -rs $@ $(LIBOBJECTS) + +db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lsqlite3 $(COVERAGEFLAGS) + +db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lkyotocabinet $(COVERAGEFLAGS) + +signal_test: util/signal_test.o $(LIBOBJECTS) + $(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/arena_test.o 
$(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL) + $(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS) + +stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + 
+histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS) + +thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg + +plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg + +perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + +prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + +backupable_db_test: 
utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +version_set_test: db/version_set_test.o $(LIBOBJECTS) 
$(TESTHARNESS) + $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + +geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +$(MEMENVLIBRARY) : $(MEMENVOBJECTS) + rm -f $@ + $(AR) -rs $@ $(MEMENVOBJECTS) + +memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS) + $(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +DBClientProxy_test: tools/shell/test/DBClientProxyTest.o 
tools/shell/DBClientProxy.o $(LIBRARY) + $(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +sst_dump: tools/sst_dump.o $(LIBOBJECTS) + $(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +ldb: tools/ldb.o $(LIBOBJECTS) + $(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +# --------------------------------------------------------------------------- +# Jni stuff +# --------------------------------------------------------------------------- +JNI_NATIVE_SOURCES = ./java/rocksjni/rocksjni.cc ./java/rocksjni/options.cc ./java/rocksjni/write_batch.cc + +JAVA_INCLUDE = -I/usr/lib/jvm/java-openjdk/include/ -I/usr/lib/jvm/java-openjdk/include/linux +ROCKSDBJNILIB = ./java/librocksdbjni.so + +ifeq ($(PLATFORM), OS_MACOSX) +ROCKSDBJNILIB = ./java/librocksdbjni.jnilib +JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ +endif + +jni: clean + OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 + cd java;$(MAKE) java; + rm -f $(ROCKSDBJNILIB) + $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o $(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(LDFLAGS) $(COVERAGEFLAGS) + +jclean: + cd java;$(MAKE) clean; + rm -f $(ROCKSDBJNILIB) + +jtest: + cd java;$(MAKE) sample;$(MAKE) test; + +# --------------------------------------------------------------------------- +# Platform-specific compilation +# --------------------------------------------------------------------------- + +ifeq ($(PLATFORM), IOS) +# For iOS, create universal object files to be used on both the simulator and +# a device. +PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms +SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer +DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer +IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString) + +.cc.o: + mkdir -p ios-x86/$(dir $@) + $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@ + mkdir -p ios-arm/$(dir $@) + xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@ + lipo ios-x86/$@ ios-arm/$@ -create -output $@ + +.c.o: + mkdir -p ios-x86/$(dir $@) + $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@ + mkdir -p ios-arm/$(dir $@) + xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@ + lipo ios-x86/$@ ios-arm/$@ -create -output $@ + +else +.cc.o: + $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) + +.c.o: + $(CC) $(CFLAGS) -c $< -o $@ +endif + +# --------------------------------------------------------------------------- +# Source files dependencies detection +# --------------------------------------------------------------------------- + +# Add proper dependency support so 
changing a .h file forces a .cc file to +# rebuild. + +# The .d file indicates .cc file's dependencies on .h files. We generate such +# dependency by g++'s -MM option, whose output is a make dependency rule. +# The sed command makes sure the "target" file in the generated .d file has +# the correct path prefix. +%.d: %.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@ +ifeq ($(PLATFORM), OS_MACOSX) + @sed -i '' -e 's,.*:,$*.o:,' $@ +else + @sed -i -e 's,.*:,$*.o:,' $@ +endif + +DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) + +depend: $(DEPFILES) + +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly +# working solution. +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) +ifneq ($(MAKECMDGOALS),jclean) +ifneq ($(MAKECMDGOALS),jtest) +-include $(DEPFILES) +endif +endif +endif +endif diff --git a/PATENTS b/PATENTS new file mode 100644 index 00000000..8a6fca4d --- /dev/null +++ b/PATENTS @@ -0,0 +1,23 @@ +Additional Grant of Patent Rights + +“Software” means the rocksdb software distributed by Facebook, Inc. + +Facebook hereby grants you a perpetual, worldwide, royalty-free, +non-exclusive, irrevocable (subject to the termination provision below) +license under any rights in any patent claims owned by Facebook, to make, +have made, use, sell, offer to sell, import, and otherwise transfer the +Software. For avoidance of doubt, no license is granted under Facebook’s +rights in any patent claims that are infringed by (i) modifications to the +Software made by you or a third party, or (ii) the Software in combination +with any software or other technology provided by you or a third party. 
+ +The license granted hereunder will terminate, automatically and without +notice, for anyone that makes any claim (including by filing any lawsuit, +assertion or other action) alleging (a) direct, indirect, or contributory +infringement or inducement to infringe any patent: (i) by Facebook or any +of its subsidiaries or affiliates, whether or not such claim is related +to the Software, (ii) by any party if such claim arises in whole or in +part from any software, product or service of Facebook or any of its +subsidiaries or affiliates, whether or not such claim is related to the +Software, or (iii) by any party relating to the Software; or (b) that +any right in any patent claim of Facebook is invalid or unenforceable. diff --git a/README b/README new file mode 100644 index 00000000..473e4145 --- /dev/null +++ b/README @@ -0,0 +1,82 @@ +rocksdb: A persistent key-value store for flash storage +Authors: * The Facebook Database Engineering Team + * Build on earlier work on leveldb by Sanjay Ghemawat + (sanjay@google.com) and Jeff Dean (jeff@google.com) + +This code is a library that forms the core building block for a fast +key value server, especially suited for storing data on flash drives. +It has an Log-Structured-Merge-Database (LSM) design with flexible tradeoffs +between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF) +and Space-Amplification-Factor(SAF). It has multi-threaded compactions, +making it specially suitable for storing multiple terabytes of data in a +single database. + +The core of this code has been derived from open-source leveldb. + +The code under this directory implements a system for maintaining a +persistent key/value store. + +See doc/index.html and github wiki (https://github.com/facebook/rocksdb/wiki) +for more explanation. + +The public interface is in include/*. Callers should not include or +rely on the details of any other header files in this package. Those +internal APIs may be changed without warning. 
+ +Guide to header files: + +include/rocksdb/db.h + Main interface to the DB: Start here + +include/rocksdb/options.h + Control over the behavior of an entire database, and also + control over the behavior of individual reads and writes. + +include/rocksdb/comparator.h + Abstraction for user-specified comparison function. If you want + just bytewise comparison of keys, you can use the default comparator, + but clients can write their own comparator implementations if they + want custom ordering (e.g. to handle different character + encodings, etc.) + +include/rocksdb/iterator.h + Interface for iterating over data. You can get an iterator + from a DB object. + +include/rocksdb/write_batch.h + Interface for atomically applying multiple updates to a database. + +include/rocksdb/slice.h + A simple module for maintaining a pointer and a length into some + other byte array. + +include/rocksdb/status.h + Status is returned from many of the public interfaces and is used + to report success and various kinds of errors. + +include/rocksdb/env.h + Abstraction of the OS environment. A posix implementation of + this interface is in util/env_posix.cc + +include/rocksdb/table_builder.h + Lower-level modules that most clients probably won't use directly + +include/rocksdb/cache.h + An API for the block cache. + +include/rocksdb/compaction_filter.h + An API for a application filter invoked on every compaction. + +include/rocksdb/filter_policy.h + An API for configuring a bloom filter. + +include/rocksdb/memtablerep.h + An API for implementing a memtable. + +include/rocksdb/statistics.h + An API to retrieve various database statistics. + +include/rocksdb/transaction_log.h + An API to retrieve transaction logs from a database. 
+ +Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ diff --git a/README.fb b/README.fb new file mode 100644 index 00000000..d3cc4110 --- /dev/null +++ b/README.fb @@ -0,0 +1,3 @@ +* Detailed instructions on how to compile using fbcode and jemalloc + +* Latest release is 2.7.fb diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform new file mode 100755 index 00000000..94aafd62 --- /dev/null +++ b/build_tools/build_detect_platform @@ -0,0 +1,311 @@ +#!/bin/sh +# +# Detects OS we're compiling on and outputs a file specified by the first +# argument, which in turn gets read while processing Makefile. +# +# The output will set the following variables: +# CC C Compiler path +# CXX C++ Compiler path +# PLATFORM_LDFLAGS Linker flags +# PLATFORM_SHARED_EXT Extension for shared libraries +# PLATFORM_SHARED_LDFLAGS Flags for building shared library +# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library +# PLATFORM_CCFLAGS C compiler flags +# PLATFORM_CXXFLAGS C++ compiler flags. Will contain: +# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned +# shared libraries, empty otherwise. +# +# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: +# +# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present +# -DLEVELDB_PLATFORM_NOATOMIC if it is not +# -DSNAPPY if the Snappy library is present +# -DLZ4 if the LZ4 library is present +# +# Using gflags in rocksdb: +# Our project depends on gflags, which requires users to take some extra steps +# before they can compile the whole repository: +# 1. Install gflags. You may download it from here: +# https://code.google.com/p/gflags/ +# 2. Once install, add the include path/lib path for gflags to CPATH and +# LIBRARY_PATH respectively. 
If installed with default mode, the +# lib and include path will be /usr/local/lib and /usr/local/include +# Mac user can do this by running build_tools/mac-install-gflags.sh + +OUTPUT=$1 +if test -z "$OUTPUT"; then + echo "usage: $0 " >&2 + exit 1 +fi + +# we depend on C++11 +PLATFORM_CXXFLAGS="-std=c++11" +# we currently depend on POSIX platform +COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" + +# Default to fbcode gcc on internal fb machines +if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then + FBCODE_BUILD="true" + if [ -z "$USE_CLANG" ]; then + CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ + $(rpm -q --whatprovides redhat-release)` + if [ "$CENTOS_VERSION" = "6" ]; then + source $PWD/build_tools/fbcode.gcc481.sh + else + source $PWD/build_tools/fbcode.gcc471.sh + fi + else + source $PWD/build_tools/fbcode.clang31.sh + fi +fi + +# Delete existing output, if it exists +rm -f $OUTPUT +touch $OUTPUT + +if test -z "$CC"; then + CC=cc +fi + +if test -z "$CXX"; then + CXX=g++ +fi + +# Detect OS +if test -z "$TARGET_OS"; then + TARGET_OS=`uname -s` +fi + +COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" +CROSS_COMPILE= +PLATFORM_CCFLAGS= +PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" +PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" +PLATFORM_SHARED_EXT="so" +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," +PLATFORM_SHARED_CFLAGS="-fPIC" +PLATFORM_SHARED_VERSIONED=false + +# generic port files (working on all platform by #ifdef) go directly in /port +GENERIC_PORT_FILES=`cd $ROCKSDB_ROOT; find port -name '*.cc' | tr "\n" " "` + +# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp +case "$TARGET_OS" in + Darwin) + PLATFORM=OS_MACOSX + COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX" + PLATFORM_SHARED_EXT=dylib + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " + # PORT_FILES=port/darwin/darwin_specific.cc + ;; + IOS) + PLATFORM=IOS + COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE" + PLATFORM_SHARED_EXT=dylib + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " 
+ CROSS_COMPILE=true + ;; + Linux) + PLATFORM=OS_LINUX + COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX" + if [ -z "$USE_CLANG" ]; then + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" + fi + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/linux/linux_specific.cc + ;; + SunOS) + PLATFORM=OS_SOLARIS + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/sunos/sunos_specific.cc + ;; + FreeBSD) + PLATFORM=OS_FREEBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" + # PORT_FILES=port/freebsd/freebsd_specific.cc + ;; + NetBSD) + PLATFORM=OS_NETBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s" + # PORT_FILES=port/netbsd/netbsd_specific.cc + ;; + OpenBSD) + PLATFORM=OS_OPENBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread" + # PORT_FILES=port/openbsd/openbsd_specific.cc + ;; + DragonFly) + PLATFORM=OS_DRAGONFLYBSD + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" + # PORT_FILES=port/dragonfly/dragonfly_specific.cc + ;; + OS_ANDROID_CROSSCOMPILE) + PLATFORM=OS_ANDROID + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library + # PORT_FILES=port/android/android.cc + CROSS_COMPILE=true + ;; + *) + echo "Unknown platform!" >&2 + exit 1 +esac + +$PWD/build_tools/build_detect_version + +# We want to make a list of all cc files within util, db, table, and helpers +# except for the test and benchmark files. 
By default, find will output a list +# of all files matching either rule, so we need to append -print to make the +# prune take effect. +DIRS="util db table utilities" + +set -f # temporarily disable globbing so that our patterns arent expanded +PRUNE_TEST="-name *test*.cc -prune" +PRUNE_BENCH="-name *_bench.cc -prune" +PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` +PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "` +set +f # re-enable globbing + +# The sources consist of the portable files, plus the platform-specific port +# file. +echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT +echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT +echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT + +if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then + # Cross-compiling; do not try any compilation tests. + # Also don't need any compilation tests if compiling on fbcode + true +else + # do fPIC on 64 bit in non-fbcode environment + case "$TARGET_OS" in + x86_64) + PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -fPIC" + esac + + # If -std=c++0x works, use . Otherwise use port_posix.h. + $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" + fi + + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, 0, 0, 1024); + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi + + # Test whether Snappy library is installed + # http://code.google.com/p/snappy/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy" + fi + + + # Test whether gflags library is installed + # http://code.google.com/p/gflags/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + fi + + # Test whether zlib library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DZLIB" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz" + fi + + # Test whether bzip library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DBZIP2" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2" + fi + + # Test whether lz4 library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + #include + int main() {} +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DLZ4" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4" + fi + + # Test whether tcmalloc is available + $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <> $OUTPUT +echo "CXX=$CXX" >> $OUTPUT +echo "PLATFORM=$PLATFORM" >> $OUTPUT +echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT +echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT +echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT +echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT +echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT +echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT +echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT +echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> $OUTPUT +echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> $OUTPUT diff --git a/build_tools/build_detect_version b/build_tools/build_detect_version new file mode 100755 index 00000000..f7d711f0 --- /dev/null +++ b/build_tools/build_detect_version @@ -0,0 +1,22 @@ +#!/bin/sh +# +# Record the version of the source that we are compiling. +# We keep a record of the git revision in util/version.cc. This source file +# is then built as a regular source file as part of the compilation process. +# One can run "strings executable_filename | grep _build_" to find the version of +# the source that we used to build the executable file. + +OUTFILE="$PWD/util/build_version.cc" + +GIT_SHA="" +if command -v git >/dev/null 2>&1; then + GIT_SHA=$(git rev-parse HEAD 2>/dev/null) +fi + +cat > "${OUTFILE}" < /dev/null +then + echo "You didn't have clang-format-diff.py available in your computer!" + echo "You can download it by running: " + echo " curl http://goo.gl/iUW1u2" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" 
!= 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) following work is not complete since we still need to figure +# out how to add the modified files done pre-commit hook to git's commit index. +# +# Check if this script has already been added to pre-commit hook. +# Will suggest user to add this script to pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi +set -e + +uncommitted_code=`git diff HEAD` + +# If there's no uncommitted changes, we assume user are doing post-commit +# format check, in which case we'll check the modified lines from latest commit. +# Otherwise, we'll check format of the uncommitted code only. +if [ -z "$uncommitted_code" ] +then + # Check the format of last commit + diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) +else + # Check the format of uncommitted lines, + diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) +fi + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertion/deletion from the clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detect lines that doesn't follow the format rules:\r" +# Add the color to the diff. 
lines added will be green; lines removed will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure under any mode, we can read user input. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 +echo "Files reformatted!" + +# Amend to last commit if user do the post-commit format check +if [ -z "$uncommitted_code" ]; then + echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c" + read to_amend + + if [ "$to_amend" == "y" ] + then + git commit -a --amend --reuse-message HEAD + echo "Amended to last commit" + fi +fi diff --git a/build_tools/mac-install-gflags.sh b/build_tools/mac-install-gflags.sh new file mode 100755 index 00000000..ef0339c3 --- /dev/null +++ b/build_tools/mac-install-gflags.sh @@ -0,0 +1,25 @@ +#!/bin/sh +# Install gflags for mac developers. + +set -e + +DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX` + +cd $DIR +wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz +tar xvfz gflags-2.0.tar.gz +cd gflags-2.0 + +./configure +make +make install + +# Add include/lib path for g++ +echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile +echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile + +echo "" +echo "-----------------------------------------------------------------------------" +echo "| Installation Completed |" +echo "-----------------------------------------------------------------------------" +echo "Please run `. 
~/bash_profile` to be able to compile with gflags" diff --git a/build_tools/make_new_version.sh b/build_tools/make_new_version.sh new file mode 100755 index 00000000..1440a4fd --- /dev/null +++ b/build_tools/make_new_version.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright (c) 2013, Facebook, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. + +set -e +if [ -z "$GIT" ] +then + GIT="git" +fi + +# Print out the colored progress info so that it can be brainlessly +# distinguished by users. +function title() { + echo -e "\033[1;32m$*\033[0m" +} + +usage="Create new RocksDB version and prepare it for the release process\n" +usage+="USAGE: ./make_new_version.sh " + +# -- Pre-check +if [[ $# < 1 ]]; then + echo -e $usage + exit 1 +fi + +ROCKSDB_VERSION=$1 + +GIT_BRANCH=`git rev-parse --abbrev-ref HEAD` +if [ $GIT_BRANCH != "master" ]; then + echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch." +fi + +title "Adding new tag for this release ..." +TAG="$ROCKSDB_VERSION.fb" +$GIT tag -a "$TAG" -m "RocksDB $ROCKSDB_VERSION" + +# Setting up the proxy for remote repo access +title "Pushing new tag to remote repo ..." 
+$GIT push origin --tags + +title "Tag $TAG is pushed to github; if you want to delete it, please run" +title "git tags -d $TAG && git push origin :refs/tags/$TAG" diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh new file mode 100755 index 00000000..58766f5d --- /dev/null +++ b/build_tools/regression_build_test.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +set -e + +NUM=10000000 + +if [ $# -eq 1 ];then + DATA_DIR=$1 +elif [ $# -eq 2 ];then + DATA_DIR=$1 + STAT_FILE=$2 +fi + +# On the production build servers, set data and stat +# files/directories not in /tmp or else the tempdir cleaning +# scripts will make you very unhappy. +DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)} +STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)} + +function cleanup { + rm -rf $DATA_DIR + rm -f $STAT_FILE.fillseq + rm -f $STAT_FILE.readrandom + rm -f $STAT_FILE.overwrite + rm -f $STAT_FILE.memtablefillreadrandom +} + +trap cleanup EXIT + +if [ -z $GIT_BRANCH ]; then + git_br=`git rev-parse --abbrev-ref HEAD` +else + git_br=$(basename $GIT_BRANCH) +fi + +if [ $git_br == "master" ]; then + git_br="" +else + git_br="."$git_br +fi + +make release + +# measure fillseq + fill up the DB for overwrite benchmark +./db_bench \ + --benchmarks=fillseq \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 > ${STAT_FILE}.fillseq + +# measure overwrite performance +./db_bench \ + --benchmarks=overwrite \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$((NUM / 10)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ 
+ --threads=8 > ${STAT_FILE}.overwrite + +# fill up the db for readrandom benchmark (1GB total size) +./db_bench \ + --benchmarks=fillseq \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$NUM \ + --writes=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# measure readrandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom + +# measure readrandom with 6GB block cache and tailing iterator +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --use_tailing_iterator=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandomtailing + +# measure readrandom with 100MB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=104857600 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandomsmallblockcache + +# measure readrandom with 8k data in 
memtable +./db_bench \ + --benchmarks=overwrite,readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --writes=512 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_mem_sst + + +# fill up the db for readrandom benchmark with filluniquerandom (1GB total size) +./db_bench \ + --benchmarks=filluniquerandom \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --writes=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=1 > /dev/null + +# dummy test just to compact the data +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 1000)) \ + --reads=$((NUM / 1000)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > /dev/null + +# measure readrandom after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --disable_auto_compactions=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom + +# measure 
readwhilewriting after load with filluniquerandom with 6GB block cache +./db_bench \ + --benchmarks=readwhilewriting \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$((NUM / 4)) \ + --reads=$((NUM / 4)) \ + --writes_per_second=1000 \ + --write_buffer_size=100000000 \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readwhilewriting + +# measure memtable performance -- none of the data gets flushed to disk +./db_bench \ + --benchmarks=fillrandom,readrandom, \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$((NUM / 10)) \ + --reads=$NUM \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --write_buffer_size=1000000000 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --value_size=10 \ + --threads=16 > ${STAT_FILE}.memtablefillreadrandom + +# send data to ods +function send_to_ods { + key="$1" + value="$2" + + if [ -z $JENKINS_HOME ]; then + # running on devbox, just print out the values + echo $1 $2 + return + fi + + if [ -z "$value" ];then + echo >&2 "ERROR: Key $key doesn't have a value." 
+ return + fi + curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \ + --connect-timeout 60 +} + +function send_benchmark_to_ods { + bench="$1" + bench_key="$2" + file="$3" + + QPS=$(grep $bench $file | awk '{print $5}') + P50_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $3}' ) + P75_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $5}' ) + P99_MICROS=$(grep $bench $file -A 4 | tail -n1 | awk '{print $7}' ) + + send_to_ods rocksdb.build.$bench_key.qps $QPS + send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS + send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS + send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS +} + +send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite +send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq +send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom +send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing +send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache +send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst +send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom +send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom +send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting diff --git a/build_tools/valgrind_test.sh b/build_tools/valgrind_test.sh new file mode 100755 index 00000000..8c7e5213 --- /dev/null +++ b/build_tools/valgrind_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#A shell script for Jenknis to run valgrind on rocksdb tests +#Returns 0 on success when there are no failed tests + +VALGRIND_DIR=build_tools/VALGRIND_LOGS +make clean +make -j$(nproc) valgrind_check +NUM_FAILED_TESTS=$((`wc -l 
$VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1)) +if [ $NUM_FAILED_TESTS -lt 1 ]; then + echo No tests have valgrind errors + exit 0 +else + cat $VALGRIND_DIR/valgrind_failed_tests + exit 1 +fi diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh new file mode 100755 index 00000000..08dbd05a --- /dev/null +++ b/coverage/coverage_test.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# Exit on error. +set -e + +if [ -n "$USE_CLANG" ]; then + echo "Error: Coverage test is supported only for gcc." + exit 1 +fi + +ROOT=".." +# Fetch right version of gcov +if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then + source $ROOT/build_tools/fbcode.gcc471.sh + GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov +else + GCOV=$(which gcov) +fi + +COVERAGE_DIR="$PWD/COVERAGE_REPORT" +mkdir -p $COVERAGE_DIR + +# Find all gcno files to generate the coverage report + +GCNO_FILES=`find $ROOT -name "*.gcno"` +$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | + # Parse the raw gcov report to more human readable form. + python $ROOT/coverage/parse_gcov_output.py | + # Write the output to both stdout and report file. + tee $COVERAGE_DIR/coverage_report_all.txt && +echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n" + +# TODO: we also need to get the files of the latest commits. +# Get the most recently committed files. 
+LATEST_FILES=` + git show --pretty="format:" --name-only HEAD | + grep -v "^$" | + paste -s -d,` +RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt + +echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT +$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | + python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES | + tee -a $RECENT_REPORT && +echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n" + +# Unless otherwise specified, we'll not generate html report by default +if [ -z "$HTML" ]; then + exit 0 +fi + +# Generate the html report. If we cannot find lcov in this machine, we'll simply +# skip this step. +echo "Generating the html coverage report..." + +LCOV=$(which lcov || true 2>/dev/null) +if [ -z $LCOV ] +then + echo "Skip: Cannot find lcov to generate the html report." + exit 0 +fi + +LCOV_VERSION=$(lcov -v | grep 1.1 || true) +if [ $LCOV_VERSION ] +then + echo "Not supported lcov version. Expect lcov 1.1." + exit 0 +fi + +(cd $ROOT; lcov --no-external \ + --capture \ + --directory $PWD \ + --gcov-tool $GCOV \ + --output-file $COVERAGE_DIR/coverage.info) + +genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR + +echo "HTML Coverage report is generated in $COVERAGE_DIR" diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py new file mode 100644 index 00000000..72e8b072 --- /dev/null +++ b/coverage/parse_gcov_output.py @@ -0,0 +1,118 @@ +import optparse +import re +import sys + +from optparse import OptionParser + +# the gcov report follows certain pattern. Each file will have two lines +# of report, from which we can extract the file name, total lines and coverage +# percentage. +def parse_gcov_report(gcov_input): + per_file_coverage = {} + total_coverage = None + + for line in sys.stdin: + line = line.strip() + + # --First line of the coverage report (with file name in it)? 
+ match_obj = re.match("^File '(.*)'$", line) + if match_obj: + # fetch the file name from the first line of the report. + current_file = match_obj.group(1) + continue + + # -- Second line of the file report (with coverage percentage) + match_obj = re.match("^Lines executed:(.*)% of (.*)", line) + + if match_obj: + coverage = float(match_obj.group(1)) + lines = int(match_obj.group(2)) + + if current_file is not None: + per_file_coverage[current_file] = (coverage, lines) + current_file = None + else: + # If current_file is not set, we reach the last line of report, + # which contains the summarized coverage percentage. + total_coverage = (coverage, lines) + continue + + # If the line's pattern doesn't fall into the above categories. We + # can simply ignore them since they're either empty line or doesn't + # find executable lines of the given file. + current_file = None + + return per_file_coverage, total_coverage + +def get_option_parser(): + usage = "Parse the gcov output and generate more human-readable code " +\ + "coverage report." + parser = OptionParser(usage) + + parser.add_option( + "--interested-files", "-i", + dest="filenames", + help="Comma separated files names. if specified, we will display " + + "the coverage report only for interested source files. " + + "Otherwise we will display the coverage report for all " + + "source files." + ) + return parser + +def display_file_coverage(per_file_coverage, total_coverage): + # To print out auto-adjustable column, we need to know the longest + # length of file names. + max_file_name_length = max( + len(fname) for fname in per_file_coverage.keys() + ) + + # -- Print header + # size of separator is determined by 3 column sizes: + # file name, coverage percentage and lines. 
+ header_template = \ + "%" + str(max_file_name_length) + "s\t%s\t%s" + separator = "-" * (max_file_name_length + 10 + 20) + print header_template % ("Filename", "Coverage", "Lines") + print separator + + # -- Print body + # template for printing coverage report for each file. + record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d" + + for fname, coverage_info in per_file_coverage.items(): + coverage, lines = coverage_info + print record_template % (fname, coverage, lines) + + # -- Print footer + if total_coverage: + print separator + print record_template % ("Total", total_coverage[0], total_coverage[1]) + +def report_coverage(): + parser = get_option_parser() + (options, args) = parser.parse_args() + + interested_files = set() + if options.filenames is not None: + interested_files = set(f.strip() for f in options.filenames.split(',')) + + # To make things simple, right now we only read gcov report from the input + per_file_coverage, total_coverage = parse_gcov_report(sys.stdin) + + # Check if we need to display coverage info for interested files. + if len(interested_files): + per_file_coverage = dict( + (fname, per_file_coverage[fname]) for fname in interested_files + if fname in per_file_coverage + ) + # If we only interested in several files, it makes no sense to report + # the total_coverage + total_coverage = None + + if not len(per_file_coverage): + print >> sys.stderr, "Cannot find coverage info for the given files." + return + display_file_coverage(per_file_coverage, total_coverage) + +if __name__ == "__main__": + report_coverage() diff --git a/db/builder.cc b/db/builder.cc new file mode 100644 index 00000000..ce85ae58 --- /dev/null +++ b/db/builder.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/builder.h" + +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/merge_helper.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based_table_builder.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +class TableFactory; + +TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, + CompressionType compression_type) { + return options.table_factory->NewTableBuilder(options, internal_comparator, + file, compression_type); +} + +Status BuildTable(const std::string& dbname, Env* env, const Options& options, + const EnvOptions& soptions, TableCache* table_cache, + Iterator* iter, FileMetaData* meta, + const InternalKeyComparator& internal_comparator, + const SequenceNumber newest_snapshot, + const SequenceNumber earliest_seqno_in_memtable, + const CompressionType compression) { + Status s; + meta->file_size = 0; + meta->smallest_seqno = meta->largest_seqno = 0; + iter->SeekToFirst(); + + // If the sequence number of the smallest entry in the memtable is + // smaller than the most recent snapshot, then we do not trigger + // removal of duplicate/deleted keys as part of this builder. 
+ bool purge = options.purge_redundant_kvs_while_flush; + if (earliest_seqno_in_memtable <= newest_snapshot) { + purge = false; + } + + std::string fname = TableFileName(dbname, meta->number); + if (iter->Valid()) { + unique_ptr file; + s = env->NewWritableFile(fname, &file, soptions); + if (!s.ok()) { + return s; + } + + TableBuilder* builder = + NewTableBuilder(options, internal_comparator, file.get(), compression); + + // the first key is the smallest key + Slice key = iter->key(); + meta->smallest.DecodeFrom(key); + meta->smallest_seqno = GetInternalKeySeqno(key); + meta->largest_seqno = meta->smallest_seqno; + + MergeHelper merge(internal_comparator.user_comparator(), + options.merge_operator.get(), options.info_log.get(), + options.min_partial_merge_operands, + true /* internal key corruption is not ok */); + + if (purge) { + // Ugly walkaround to avoid compiler error for release build + bool ok __attribute__((unused)) = true; + + // Will write to builder if current key != prev key + ParsedInternalKey prev_ikey; + std::string prev_key; + bool is_first_key = true; // Also write if this is the very first key + + while (iter->Valid()) { + bool iterator_at_next = false; + + // Get current key + ParsedInternalKey this_ikey; + Slice key = iter->key(); + Slice value = iter->value(); + + // In-memory key corruption is not ok; + // TODO: find a clean way to treat in memory key corruption + ok = ParseInternalKey(key, &this_ikey); + assert(ok); + assert(this_ikey.sequence >= earliest_seqno_in_memtable); + + // If the key is the same as the previous key (and it is not the + // first key), then we skip it, since it is an older version. + // Otherwise we output the key and mark it as the "new" previous key. 
+ if (!is_first_key && !internal_comparator.user_comparator()->Compare( + prev_ikey.user_key, this_ikey.user_key)) { + // seqno within the same key are in decreasing order + assert(this_ikey.sequence < prev_ikey.sequence); + } else { + is_first_key = false; + + if (this_ikey.type == kTypeMerge) { + // Handle merge-type keys using the MergeHelper + // TODO: pass statistics to MergeUntil + merge.MergeUntil(iter, 0 /* don't worry about snapshot */); + iterator_at_next = true; + if (merge.IsSuccess()) { + // Merge completed correctly. + // Add the resulting merge key/value and continue to next + builder->Add(merge.key(), merge.value()); + prev_key.assign(merge.key().data(), merge.key().size()); + ok = ParseInternalKey(Slice(prev_key), &prev_ikey); + assert(ok); + } else { + // Merge did not find a Put/Delete. + // Can not compact these merges into a kValueType. + // Write them out one-by-one. (Proceed back() to front()) + const std::deque& keys = merge.keys(); + const std::deque& values = merge.values(); + assert(keys.size() == values.size() && keys.size() >= 1); + std::deque::const_reverse_iterator key_iter; + std::deque::const_reverse_iterator value_iter; + for (key_iter=keys.rbegin(), value_iter = values.rbegin(); + key_iter != keys.rend() && value_iter != values.rend(); + ++key_iter, ++value_iter) { + + builder->Add(Slice(*key_iter), Slice(*value_iter)); + } + + // Sanity check. 
Both iterators should end at the same time + assert(key_iter == keys.rend() && value_iter == values.rend()); + + prev_key.assign(keys.front()); + ok = ParseInternalKey(Slice(prev_key), &prev_ikey); + assert(ok); + } + } else { + // Handle Put/Delete-type keys by simply writing them + builder->Add(key, value); + prev_key.assign(key.data(), key.size()); + ok = ParseInternalKey(Slice(prev_key), &prev_ikey); + assert(ok); + } + } + + if (!iterator_at_next) iter->Next(); + } + + // The last key is the largest key + meta->largest.DecodeFrom(Slice(prev_key)); + SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key)); + meta->smallest_seqno = std::min(meta->smallest_seqno, seqno); + meta->largest_seqno = std::max(meta->largest_seqno, seqno); + + } else { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + meta->largest.DecodeFrom(key); + builder->Add(key, iter->value()); + SequenceNumber seqno = GetInternalKeySeqno(key); + meta->smallest_seqno = std::min(meta->smallest_seqno, seqno); + meta->largest_seqno = std::max(meta->largest_seqno, seqno); + } + } + + // Finish and check for builder errors + if (s.ok()) { + s = builder->Finish(); + if (s.ok()) { + meta->file_size = builder->FileSize(); + assert(meta->file_size > 0); + } + } else { + builder->Abandon(); + } + delete builder; + + // Finish and check for file errors + if (s.ok() && !options.disableDataSync) { + if (options.use_fsync) { + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + s = file->Fsync(); + } else { + StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + s = file->Sync(); + } + } + if (s.ok()) { + s = file->Close(); + } + + if (s.ok()) { + // Verify that the table is usable + Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + internal_comparator, *meta); + s = it->status(); + delete it; + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (s.ok() && meta->file_size > 0) { + // Keep it 
+ } else { + env->DeleteFile(fname); + } + return s; +} + +} // namespace rocksdb diff --git a/db/builder.h b/db/builder.h new file mode 100644 index 00000000..63016296 --- /dev/null +++ b/db/builder.h @@ -0,0 +1,45 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "rocksdb/comparator.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +struct Options; +struct FileMetaData; + +class Env; +struct EnvOptions; +class Iterator; +class TableCache; +class VersionEdit; +class TableBuilder; +class WritableFile; + +extern TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); + +// Build a Table file from the contents of *iter. The generated file +// will be named according to meta->number. On success, the rest of +// *meta will be filled with metadata about the generated table. +// If no data is present in *iter, meta->file_size will be set to +// zero, and no Table file will be produced. 
+extern Status BuildTable(const std::string& dbname, Env* env, + const Options& options, const EnvOptions& soptions, + TableCache* table_cache, Iterator* iter, + FileMetaData* meta, + const InternalKeyComparator& internal_comparator, + const SequenceNumber newest_snapshot, + const SequenceNumber earliest_seqno_in_memtable, + const CompressionType compression); + +} // namespace rocksdb diff --git a/db/c.cc b/db/c.cc new file mode 100644 index 00000000..b566daf6 --- /dev/null +++ b/db/c.cc @@ -0,0 +1,1469 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/c.h" + +#include +#include +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/universal_compaction.h" +#include "rocksdb/statistics.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" + +using rocksdb::Cache; +using rocksdb::Comparator; +using rocksdb::CompressionType; +using rocksdb::DB; +using rocksdb::Env; +using rocksdb::InfoLogLevel; +using rocksdb::FileLock; +using rocksdb::FilterPolicy; +using rocksdb::FlushOptions; +using rocksdb::Iterator; +using rocksdb::Logger; +using rocksdb::MergeOperator; +using rocksdb::NewBloomFilterPolicy; +using rocksdb::NewLRUCache; +using rocksdb::Options; +using rocksdb::RandomAccessFile; +using rocksdb::Range; +using rocksdb::ReadOptions; +using rocksdb::SequentialFile; +using rocksdb::Slice; +using rocksdb::SliceTransform; +using rocksdb::Snapshot; +using rocksdb::Status; +using rocksdb::WritableFile; +using rocksdb::WriteBatch; +using rocksdb::WriteOptions; +using rocksdb::LiveFileMetaData; + +using std::shared_ptr; + +extern "C" { + +struct rocksdb_t { DB* rep; }; +struct rocksdb_iterator_t { Iterator* rep; }; +struct rocksdb_writebatch_t { WriteBatch rep; }; +struct rocksdb_snapshot_t { const Snapshot* rep; }; +struct rocksdb_flushoptions_t { FlushOptions rep; }; +struct rocksdb_readoptions_t { ReadOptions rep; }; +struct rocksdb_writeoptions_t { WriteOptions rep; }; +struct rocksdb_options_t { Options rep; }; +struct rocksdb_seqfile_t { SequentialFile* rep; }; +struct rocksdb_randomfile_t { RandomAccessFile* rep; }; +struct rocksdb_writablefile_t { WritableFile* rep; }; +struct rocksdb_filelock_t { FileLock* rep; }; +struct rocksdb_logger_t { shared_ptr rep; }; +struct 
rocksdb_cache_t { shared_ptr rep; }; +struct rocksdb_livefiles_t { std::vector rep; }; + +struct rocksdb_comparator_t : public Comparator { + void* state_; + void (*destructor_)(void*); + int (*compare_)( + void*, + const char* a, size_t alen, + const char* b, size_t blen); + const char* (*name_)(void*); + + virtual ~rocksdb_comparator_t() { + (*destructor_)(state_); + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return (*compare_)(state_, a.data(), a.size(), b.data(), b.size()); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + // No-ops since the C binding does not support key shortening methods. + virtual void FindShortestSeparator(std::string*, const Slice&) const { } + virtual void FindShortSuccessor(std::string* key) const { } +}; + +struct rocksdb_filterpolicy_t : public FilterPolicy { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*create_)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length); + unsigned char (*key_match_)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length); + void (*delete_filter_)( + void*, + const char* filter, size_t filter_length); + + virtual ~rocksdb_filterpolicy_t() { + (*destructor_)(state_); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + std::vector key_pointers(n); + std::vector key_sizes(n); + for (int i = 0; i < n; i++) { + key_pointers[i] = keys[i].data(); + key_sizes[i] = keys[i].size(); + } + size_t len; + char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len); + dst->append(filter, len); + + if (delete_filter_ != nullptr) { + (*delete_filter_)(state_, filter, len); + } else { + free(filter); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + return (*key_match_)(state_, 
key.data(), key.size(), + filter.data(), filter.size()); + } +}; + +struct rocksdb_mergeoperator_t : public MergeOperator { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*full_merge_)( + void*, + const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length); + char* (*partial_merge_)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length); + void (*delete_value_)( + void*, + const char* value, size_t value_length); + + virtual ~rocksdb_mergeoperator_t() { + (*destructor_)(state_); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + virtual bool FullMerge( + const Slice& key, + const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const { + + size_t n = operand_list.size(); + std::vector operand_pointers(n); + std::vector operand_sizes(n); + for (size_t i = 0; i < n; i++) { + Slice operand(operand_list[i]); + operand_pointers[i] = operand.data(); + operand_sizes[i] = operand.size(); + } + + const char* existing_value_data = nullptr; + size_t existing_value_len = 0; + if (existing_value != nullptr) { + existing_value_data = existing_value->data(); + existing_value_len = existing_value->size(); + } + + unsigned char success; + size_t new_value_len; + char* tmp_new_value = (*full_merge_)( + state_, + key.data(), key.size(), + existing_value_data, existing_value_len, + &operand_pointers[0], &operand_sizes[0], n, + &success, &new_value_len); + new_value->assign(tmp_new_value, new_value_len); + + if (delete_value_ != nullptr) { + (*delete_value_)(state_, tmp_new_value, new_value_len); + } else { + free(tmp_new_value); + } + + return success; + } + + 
virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const { + size_t operand_count = operand_list.size(); + std::vector operand_pointers(operand_count); + std::vector operand_sizes(operand_count); + for (size_t i = 0; i < operand_count; ++i) { + Slice operand(operand_list[i]); + operand_pointers[i] = operand.data(); + operand_sizes[i] = operand.size(); + } + + unsigned char success; + size_t new_value_len; + char* tmp_new_value = (*partial_merge_)( + state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0], + operand_count, &success, &new_value_len); + new_value->assign(tmp_new_value, new_value_len); + + if (delete_value_ != nullptr) { + (*delete_value_)(state_, tmp_new_value, new_value_len); + } else { + free(tmp_new_value); + } + + return success; + } +}; + +struct rocksdb_env_t { + Env* rep; + bool is_default; +}; + +struct rocksdb_slicetransform_t : public SliceTransform { + void* state_; + void (*destructor_)(void*); + const char* (*name_)(void*); + char* (*transform_)( + void*, + const char* key, size_t length, + size_t* dst_length); + unsigned char (*in_domain_)( + void*, + const char* key, size_t length); + unsigned char (*in_range_)( + void*, + const char* key, size_t length); + + virtual ~rocksdb_slicetransform_t() { + (*destructor_)(state_); + } + + virtual const char* Name() const { + return (*name_)(state_); + } + + virtual Slice Transform(const Slice& src) const { + size_t len; + char* dst = (*transform_)(state_, src.data(), src.size(), &len); + return Slice(dst, len); + } + + virtual bool InDomain(const Slice& src) const { + return (*in_domain_)(state_, src.data(), src.size()); + } + + virtual bool InRange(const Slice& src) const { + return (*in_range_)(state_, src.data(), src.size()); + } +}; + +struct rocksdb_universal_compaction_options_t { + rocksdb::CompactionOptionsUniversal *rep; +}; + +static bool SaveError(char** errptr, const Status& s) { + assert(errptr 
!= nullptr); + if (s.ok()) { + return false; + } else if (*errptr == nullptr) { + *errptr = strdup(s.ToString().c_str()); + } else { + // TODO(sanjay): Merge with existing error? + free(*errptr); + *errptr = strdup(s.ToString().c_str()); + } + return true; +} + +static char* CopyString(const std::string& str) { + char* result = reinterpret_cast(malloc(sizeof(char) * str.size())); + memcpy(result, str.data(), sizeof(char) * str.size()); + return result; +} + +rocksdb_t* rocksdb_open( + const rocksdb_options_t* options, + const char* name, + char** errptr) { + DB* db; + if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + +void rocksdb_close(rocksdb_t* db) { + delete db->rep; + delete db; +} + +void rocksdb_put( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, + db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_delete( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + char** errptr) { + SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); +} + +void rocksdb_merge( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, + db->rep->Merge(options->rep, Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); +} + +char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = db->rep->Get(options->rep, Slice(key, 
keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = db->rep->NewIterator(options->rep); + return result; +} + +const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + result->rep = db->rep->GetSnapshot(); + return result; +} + +void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot) { + db->rep->ReleaseSnapshot(snapshot->rep); + delete snapshot; +} + +char* rocksdb_property_value( + rocksdb_t* db, + const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. + return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +void rocksdb_approximate_sizes( + rocksdb_t* db, + int num_ranges, + const char* const* range_start_key, const size_t* range_start_key_len, + const char* const* range_limit_key, const size_t* range_limit_key_len, + uint64_t* sizes) { + Range* ranges = new Range[num_ranges]; + for (int i = 0; i < num_ranges; i++) { + ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); + ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); + } + db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + delete[] ranges; +} + +void rocksdb_delete_file( + rocksdb_t* db, + const char* name) { + db->rep->DeleteFile(name); +} + +const rocksdb_livefiles_t* rocksdb_livefiles( + rocksdb_t* db) { + rocksdb_livefiles_t* result = new rocksdb_livefiles_t; + db->rep->GetLiveFilesMetaData(&result->rep); + return result; +} + +void rocksdb_compact_range( + rocksdb_t* db, + const char* start_key, size_t start_key_len, + const char* limit_key, 
size_t limit_key_len) { + Slice a, b; + db->rep->CompactRange( + // Pass nullptr Slice if corresponding "const char*" is nullptr + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} + +void rocksdb_flush( + rocksdb_t* db, + const rocksdb_flushoptions_t* options, + char** errptr) { + SaveError(errptr, db->rep->Flush(options->rep)); +} + +void rocksdb_disable_file_deletions( + rocksdb_t* db, + char** errptr) { + SaveError(errptr, db->rep->DisableFileDeletions()); +} + +void rocksdb_enable_file_deletions( + rocksdb_t* db, + unsigned char force, + char** errptr) { + SaveError(errptr, db->rep->EnableFileDeletions(force)); +} + +void rocksdb_destroy_db( + const rocksdb_options_t* options, + const char* name, + char** errptr) { + SaveError(errptr, DestroyDB(name, options->rep)); +} + +void rocksdb_repair_db( + const rocksdb_options_t* options, + const char* name, + char** errptr) { + SaveError(errptr, RepairDB(name, options->rep)); +} + +void rocksdb_iter_destroy(rocksdb_iterator_t* iter) { + delete iter->rep; + delete iter; +} + +unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) { + return iter->rep->Valid(); +} + +void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) { + iter->rep->SeekToFirst(); +} + +void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) { + iter->rep->SeekToLast(); +} + +void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) { + iter->rep->Seek(Slice(k, klen)); +} + +void rocksdb_iter_next(rocksdb_iterator_t* iter) { + iter->rep->Next(); +} + +void rocksdb_iter_prev(rocksdb_iterator_t* iter) { + iter->rep->Prev(); +} + +const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) { + Slice s = iter->rep->key(); + *klen = s.size(); + return s.data(); +} + +const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) { + Slice s = iter->rep->value(); + *vlen = s.size(); + return s.data(); +} + 
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) { + SaveError(errptr, iter->rep->status()); +} + +rocksdb_writebatch_t* rocksdb_writebatch_create() { + return new rocksdb_writebatch_t; +} + +void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { + delete b; +} + +void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { + b->rep.Clear(); +} + +int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { + return b->rep.Count(); +} + +void rocksdb_writebatch_put( + rocksdb_writebatch_t* b, + const char* key, size_t klen, + const char* val, size_t vlen) { + b->rep.Put(Slice(key, klen), Slice(val, vlen)); +} + +void rocksdb_writebatch_merge( + rocksdb_writebatch_t* b, + const char* key, size_t klen, + const char* val, size_t vlen) { + b->rep.Merge(Slice(key, klen), Slice(val, vlen)); +} + +void rocksdb_writebatch_delete( + rocksdb_writebatch_t* b, + const char* key, size_t klen) { + b->rep.Delete(Slice(key, klen)); +} + +void rocksdb_writebatch_iterate( + rocksdb_writebatch_t* b, + void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)) { + class H : public WriteBatch::Handler { + public: + void* state_; + void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); + void (*deleted_)(void*, const char* k, size_t klen); + virtual void Put(const Slice& key, const Slice& value) { + (*put_)(state_, key.data(), key.size(), value.data(), value.size()); + } + virtual void Delete(const Slice& key) { + (*deleted_)(state_, key.data(), key.size()); + } + }; + H handler; + handler.state_ = state; + handler.put_ = put; + handler.deleted_ = deleted; + b->rep.Iterate(&handler); +} + +const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) { + *size = b->rep.GetDataSize(); + return b->rep.Data().c_str(); +} + +rocksdb_options_t* rocksdb_options_create() { + return new rocksdb_options_t; +} + +void 
rocksdb_options_destroy(rocksdb_options_t* options) { + delete options; +} + +void rocksdb_options_set_comparator( + rocksdb_options_t* opt, + rocksdb_comparator_t* cmp) { + opt->rep.comparator = cmp; +} + +void rocksdb_options_set_merge_operator( + rocksdb_options_t* opt, + rocksdb_mergeoperator_t* merge_operator) { + opt->rep.merge_operator = std::shared_ptr(merge_operator); +} + +void rocksdb_options_set_filter_policy( + rocksdb_options_t* opt, + rocksdb_filterpolicy_t* policy) { + opt->rep.filter_policy = policy; +} + +void rocksdb_options_set_create_if_missing( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.create_if_missing = v; +} + +void rocksdb_options_set_error_if_exists( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.error_if_exists = v; +} + +void rocksdb_options_set_paranoid_checks( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.paranoid_checks = v; +} + +void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { + opt->rep.env = (env ? 
env->rep : nullptr); +} + +void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { + if (l) { + opt->rep.info_log = l->rep; + } +} + +void rocksdb_options_set_info_log_level( + rocksdb_options_t* opt, int v) { + opt->rep.info_log_level = static_cast(v); +} + +void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { + opt->rep.write_buffer_size = s; +} + +void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { + opt->rep.max_open_files = n; +} + +void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) { + if (c) { + opt->rep.block_cache = c->rep; + } +} + +void rocksdb_options_set_cache_compressed(rocksdb_options_t* opt, rocksdb_cache_t* c) { + if (c) { + opt->rep.block_cache_compressed = c->rep; + } +} + +void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) { + opt->rep.block_size = s; +} + +void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) { + opt->rep.block_restart_interval = n; +} + +void rocksdb_options_set_target_file_size_base( + rocksdb_options_t* opt, uint64_t n) { + opt->rep.target_file_size_base = n; +} + +void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t* opt, int n) { + opt->rep.target_file_size_multiplier = n; +} + +void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t* opt, uint64_t n) { + opt->rep.max_bytes_for_level_base = n; +} + +void rocksdb_options_set_max_bytes_for_level_multiplier( + rocksdb_options_t* opt, int n) { + opt->rep.max_bytes_for_level_multiplier = n; +} + +void rocksdb_options_set_expanded_compaction_factor( + rocksdb_options_t* opt, int n) { + opt->rep.expanded_compaction_factor = n; +} + +void rocksdb_options_set_max_grandparent_overlap_factor( + rocksdb_options_t* opt, int n) { + opt->rep.max_grandparent_overlap_factor = n; +} + +void rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t* opt, int* level_values, size_t num_levels) { 
+ opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels); + for (size_t i = 0; i < num_levels; ++i) { + opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i]; + } +} + +void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { + opt->rep.statistics = rocksdb::CreateDBStatistics(); +} + +void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { + opt->rep.num_levels = n; +} + +void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t* opt, int n) { + opt->rep.level0_file_num_compaction_trigger = n; +} + +void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t* opt, int n) { + opt->rep.level0_slowdown_writes_trigger = n; +} + +void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t* opt, int n) { + opt->rep.level0_stop_writes_trigger = n; +} + +void rocksdb_options_set_max_mem_compaction_level( + rocksdb_options_t* opt, int n) { + opt->rep.max_mem_compaction_level = n; +} + +void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) { + opt->rep.compression = static_cast(t); +} + +void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, + int* level_values, + size_t num_levels) { + opt->rep.compression_per_level.resize(num_levels); + for (size_t i = 0; i < num_levels; ++i) { + opt->rep.compression_per_level[i] = + static_cast(level_values[i]); + } +} + +void rocksdb_options_set_compression_options( + rocksdb_options_t* opt, int w_bits, int level, int strategy) { + opt->rep.compression_opts.window_bits = w_bits; + opt->rep.compression_opts.level = level; + opt->rep.compression_opts.strategy = strategy; +} + +void rocksdb_options_set_prefix_extractor( + rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { + opt->rep.prefix_extractor.reset(prefix_extractor); +} + +void rocksdb_options_set_whole_key_filtering( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.whole_key_filtering = v; +} + +void 
rocksdb_options_set_disable_data_sync( + rocksdb_options_t* opt, int disable_data_sync) { + opt->rep.disableDataSync = disable_data_sync; +} + +void rocksdb_options_set_use_fsync( + rocksdb_options_t* opt, int use_fsync) { + opt->rep.use_fsync = use_fsync; +} + +void rocksdb_options_set_db_stats_log_interval( + rocksdb_options_t* opt, int db_stats_log_interval) { + opt->rep.db_stats_log_interval = db_stats_log_interval; +} + +void rocksdb_options_set_db_log_dir( + rocksdb_options_t* opt, const char* db_log_dir) { + opt->rep.db_log_dir = db_log_dir; +} + +void rocksdb_options_set_wal_dir( + rocksdb_options_t* opt, const char* v) { + opt->rep.wal_dir = v; +} + +void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) { + opt->rep.WAL_ttl_seconds = ttl; +} + +void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t* opt, uint64_t limit) { + opt->rep.WAL_size_limit_MB = limit; +} + +void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t* opt, size_t v) { + opt->rep.manifest_preallocation_size = v; +} + +void rocksdb_options_set_purge_redundant_kvs_while_flush( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.purge_redundant_kvs_while_flush = v; +} + +void rocksdb_options_set_allow_os_buffer( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_os_buffer = v; +} + +void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_mmap_reads = v; +} + +void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_mmap_writes = v; +} + +void rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.is_fd_close_on_exec = v; +} + +void rocksdb_options_set_skip_log_error_on_recovery( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.skip_log_error_on_recovery = v; +} + +void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t* opt, unsigned int v) { + 
opt->rep.stats_dump_period_sec = v; +} + +void rocksdb_options_set_block_size_deviation( + rocksdb_options_t* opt, int v) { + opt->rep.block_size_deviation = v; +} + +void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.advise_random_on_open = v; +} + +void rocksdb_options_set_access_hint_on_compaction_start( + rocksdb_options_t* opt, int v) { + switch(v) { + case 0: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::NONE; + break; + case 1: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::NORMAL; + break; + case 2: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::SEQUENTIAL; + break; + case 3: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::WILLNEED; + break; + } +} + +void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.use_adaptive_mutex = v; +} + +void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.bytes_per_sync = v; +} + +void rocksdb_options_set_verify_checksums_in_compaction( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.verify_checksums_in_compaction = v; +} + +void rocksdb_options_set_filter_deletes( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.filter_deletes = v; +} + +void rocksdb_options_set_max_sequential_skip_in_iterations( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.max_sequential_skip_in_iterations = v; +} + +void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) { + opt->rep.max_write_buffer_number = n; +} + +void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) { + opt->rep.min_write_buffer_number_to_merge = n; +} + +void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { + opt->rep.max_background_compactions = n; +} + +void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) { + opt->rep.max_background_flushes = n; +} 
+ +void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) { + opt->rep.max_log_file_size = v; +} + +void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) { + opt->rep.log_file_time_to_roll = v; +} + +void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) { + opt->rep.keep_log_file_num = v; +} + +void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) { + opt->rep.soft_rate_limit = v; +} + +void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) { + opt->rep.hard_rate_limit = v; +} + +void rocksdb_options_set_rate_limit_delay_max_milliseconds( + rocksdb_options_t* opt, unsigned int v) { + opt->rep.rate_limit_delay_max_milliseconds = v; +} + +void rocksdb_options_set_max_manifest_file_size( + rocksdb_options_t* opt, size_t v) { + opt->rep.max_manifest_file_size = v; +} + +void rocksdb_options_set_no_block_cache( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.no_block_cache = v; +} + +void rocksdb_options_set_table_cache_numshardbits( + rocksdb_options_t* opt, int v) { + opt->rep.table_cache_numshardbits = v; +} + +void rocksdb_options_set_table_cache_remove_scan_count_limit( + rocksdb_options_t* opt, int v) { + opt->rep.table_cache_remove_scan_count_limit = v; +} + +void rocksdb_options_set_arena_block_size( + rocksdb_options_t* opt, size_t v) { + opt->rep.arena_block_size = v; +} + +void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) { + opt->rep.disable_auto_compactions = disable; +} + +void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) { + opt->rep.disable_seek_compaction = disable; +} + +void rocksdb_options_set_delete_obsolete_files_period_micros( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.delete_obsolete_files_period_micros = v; +} + +void rocksdb_options_set_source_compaction_factor( + rocksdb_options_t* opt, int n) { + opt->rep.expanded_compaction_factor = n; +} + 
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { + opt->rep.PrepareForBulkLoad(); +} + +void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) { + static rocksdb::VectorRepFactory* factory = 0; + if (!factory) { + factory = new rocksdb::VectorRepFactory; + } + opt->rep.memtable_factory.reset(factory); +} + +void rocksdb_options_set_memtable_prefix_bloom_bits( + rocksdb_options_t* opt, uint32_t v) { + opt->rep.memtable_prefix_bloom_bits = v; +} + +void rocksdb_options_set_memtable_prefix_bloom_probes( + rocksdb_options_t* opt, uint32_t v) { + opt->rep.memtable_prefix_bloom_probes = v; +} + +void rocksdb_options_set_hash_skip_list_rep( + rocksdb_options_t *opt, size_t bucket_count, + int32_t skiplist_height, int32_t skiplist_branching_factor) { + static rocksdb::MemTableRepFactory* factory = 0; + if (!factory) { + factory = rocksdb::NewHashSkipListRepFactory( + bucket_count, skiplist_height, skiplist_branching_factor); + } + opt->rep.memtable_factory.reset(factory); +} + +void rocksdb_options_set_hash_link_list_rep( + rocksdb_options_t *opt, size_t bucket_count) { + static rocksdb::MemTableRepFactory* factory = 0; + if (!factory) { + factory = rocksdb::NewHashLinkListRepFactory(bucket_count); + } + opt->rep.memtable_factory.reset(factory); +} + +void rocksdb_options_set_plain_table_factory( + rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness) { + static rocksdb::TableFactory* factory = 0; + if (!factory) { + factory = rocksdb::NewPlainTableFactory( + user_key_len, bloom_bits_per_key, + hash_table_ratio, index_sparseness); + } + opt->rep.table_factory.reset(factory); +} + +void rocksdb_options_set_max_successive_merges( + rocksdb_options_t* opt, size_t v) { + opt->rep.max_successive_merges = v; +} + +void rocksdb_options_set_min_partial_merge_operands( + rocksdb_options_t* opt, uint32_t v) { + opt->rep.min_partial_merge_operands = v; +} + +void 
rocksdb_options_set_bloom_locality( + rocksdb_options_t* opt, uint32_t v) { + opt->rep.bloom_locality = v; +} + +void rocksdb_options_set_allow_thread_local( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_thread_local = v; +} + +void rocksdb_options_set_inplace_update_support( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.inplace_update_support = v; +} + +void rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t* opt, size_t v) { + opt->rep.inplace_update_num_locks = v; +} + +void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { + opt->rep.compaction_style = static_cast(style); +} + +void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) { + opt->rep.compaction_options_universal = *(uco->rep); +} + +/* +TODO: +DB::OpenForReadOnly +DB::MultiGet +DB::KeyMayExist +DB::GetOptions +DB::GetSortedWalFiles +DB::GetLatestSequenceNumber +DB::GetUpdatesSince +DB::GetDbIdentity +DB::RunManualCompaction +custom cache +compaction_filter +table_properties_collectors +*/ + +rocksdb_comparator_t* rocksdb_comparator_create( + void* state, + void (*destructor)(void*), + int (*compare)( + void*, + const char* a, size_t alen, + const char* b, size_t blen), + const char* (*name)(void*)) { + rocksdb_comparator_t* result = new rocksdb_comparator_t; + result->state_ = state; + result->destructor_ = destructor; + result->compare_ = compare; + result->name_ = name; + return result; +} + +void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { + delete cmp; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( + void* state, + void (*destructor)(void*), + char* (*create_filter)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length), + unsigned char (*key_may_match)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length), + void (*delete_filter)( + void*, + const 
char* filter, size_t filter_length), + const char* (*name)(void*)) { + rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t; + result->state_ = state; + result->destructor_ = destructor; + result->create_ = create_filter; + result->key_match_ = key_may_match; + result->delete_filter_ = delete_filter; + result->name_ = name; + return result; +} + +void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { + delete filter; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewBloomFilterPolicy() instead of user + // supplied C functions. + struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() { delete rep_; } + const char* Name() const { return rep_->Name(); } + void CreateFilter(const Slice* keys, int n, std::string* dst) const { + return rep_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const { + return rep_->KeyMayMatch(key, filter); + } + static void DoNothing(void*) { } + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = NewBloomFilterPolicy(bits_per_key); + wrapper->state_ = nullptr; + wrapper->delete_filter_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + 
const char* (*name)(void*)) { + rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t; + result->state_ = state; + result->destructor_ = destructor; + result->full_merge_ = full_merge; + result->partial_merge_ = partial_merge; + result->delete_value_ = delete_value; + result->name_ = name; + return result; +} + +void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) { + delete merge_operator; +} + +rocksdb_readoptions_t* rocksdb_readoptions_create() { + return new rocksdb_readoptions_t; +} + +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { + delete opt; +} + +void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.verify_checksums = v; +} + +void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.fill_cache = v; +} + +void rocksdb_readoptions_set_prefix_seek( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.prefix_seek = v; +} + +void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { + opt->rep.snapshot = (snap ? 
snap->rep : nullptr); +} + +void rocksdb_readoptions_set_prefix( + rocksdb_readoptions_t* opt, const char* key, size_t keylen) { + Slice prefix = Slice(key, keylen); + opt->rep.prefix = &prefix; +} + +void rocksdb_readoptions_set_read_tier( + rocksdb_readoptions_t* opt, int v) { + opt->rep.read_tier = static_cast(v); +} + +void rocksdb_readoptions_set_tailing( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.tailing = v; +} + +rocksdb_writeoptions_t* rocksdb_writeoptions_create() { + return new rocksdb_writeoptions_t; +} + +void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { + delete opt; +} + +void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t* opt, unsigned char v) { + opt->rep.sync = v; +} + +void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) { + opt->rep.disableWAL = disable; +} + + +rocksdb_flushoptions_t* rocksdb_flushoptions_create() { + return new rocksdb_flushoptions_t; +} + +void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { + delete opt; +} + +void rocksdb_flushoptions_set_wait( + rocksdb_flushoptions_t* opt, unsigned char v) { + opt->rep.wait = v; +} + +rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(capacity); + return c; +} + +void rocksdb_cache_destroy(rocksdb_cache_t* cache) { + delete cache; +} + +rocksdb_env_t* rocksdb_create_default_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = Env::Default(); + result->is_default = true; + return result; +} + +void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n); +} + +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n, Env::HIGH); +} + +void rocksdb_env_destroy(rocksdb_env_t* env) { + if (!env->is_default) delete env->rep; + delete env; +} + +rocksdb_slicetransform_t* rocksdb_slicetransform_create( + void* state, + 
void (*destructor)(void*), + char* (*transform)( + void*, + const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)( + void*, + const char* key, size_t length), + unsigned char (*in_range)( + void*, + const char* key, size_t length), + const char* (*name)(void*)) { + rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t; + result->state_ = state; + result->destructor_ = destructor; + result->transform_ = transform; + result->in_domain_ = in_domain; + result->in_range_ = in_range; + result->name_ = name; + return result; +} + +void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { + delete st; +} + +rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) { + struct Wrapper : public rocksdb_slicetransform_t { + const SliceTransform* rep_; + ~Wrapper() { delete rep_; } + const char* Name() const { return rep_->Name(); } + Slice Transform(const Slice& src) const { + return rep_->Transform(src); + } + bool InDomain(const Slice& src) const { + return rep_->InDomain(src); + } + bool InRange(const Slice& src) const { + return rep_->InRange(src); + } + static void DoNothing(void*) { } + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = rocksdb::NewFixedPrefixTransform(prefixLen); + wrapper->state_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() { + rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t; + result->rep = new rocksdb::CompactionOptionsUniversal; + return result; +} + +void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t* uco, int ratio) { + uco->rep->size_ratio = ratio; +} + +void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->min_merge_width = w; +} + +void 
rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->max_merge_width = w; +} + +void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->max_size_amplification_percent = p; +} + +void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->compression_size_percent = p; +} + +void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t* uco, int style) { + uco->rep->stop_style = static_cast(style); +} + +void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t* uco) { + delete uco->rep; + delete uco; +} + +void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) { + if (level >= 0) { + assert(level <= opt->rep.num_levels); + opt->rep.compression_per_level.resize(opt->rep.num_levels); + for (int i = 0; i < level; i++) { + opt->rep.compression_per_level[i] = rocksdb::kNoCompression; + } + for (int i = level; i < opt->rep.num_levels; i++) { + opt->rep.compression_per_level[i] = opt->rep.compression; + } + } +} + +int rocksdb_livefiles_count( + const rocksdb_livefiles_t* lf) { + return lf->rep.size(); +} + +const char* rocksdb_livefiles_name( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].name.c_str(); +} + +int rocksdb_livefiles_level( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].level; +} + +size_t rocksdb_livefiles_size( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].size; +} + +const char* rocksdb_livefiles_smallestkey( + const rocksdb_livefiles_t* lf, + int index, + size_t* size) { + *size = lf->rep[index].smallestkey.size(); + return lf->rep[index].smallestkey.data(); +} + +const char* rocksdb_livefiles_largestkey( + const rocksdb_livefiles_t* lf, + int 
index, + size_t* size) { + *size = lf->rep[index].largestkey.size(); + return lf->rep[index].largestkey.data(); +} + +extern void rocksdb_livefiles_destroy( + const rocksdb_livefiles_t* lf) { + delete lf; +} + +} // end extern "C" diff --git a/db/c_test.c b/db/c_test.c new file mode 100644 index 00000000..e6c5a9e6 --- /dev/null +++ b/db/c_test.c @@ -0,0 +1,496 @@ +/* Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. */ + +#include "rocksdb/c.h" + +#include +#include +#include +#include +#include +#include + +const char* phase = ""; +static char dbname[200]; + +static void StartPhase(const char* name) { + fprintf(stderr, "=== Test %s\n", name); + phase = name; +} + +static const char* GetTempDir(void) { + const char* ret = getenv("TEST_TMPDIR"); + if (ret == NULL || ret[0] == '\0') + ret = "/tmp"; + return ret; +} + +#define CheckNoError(err) \ + if ((err) != NULL) { \ + fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \ + abort(); \ + } + +#define CheckCondition(cond) \ + if (!(cond)) { \ + fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \ + abort(); \ + } + +static void CheckEqual(const char* expected, const char* v, size_t n) { + if (expected == NULL && v == NULL) { + // ok + } else if (expected != NULL && v != NULL && n == strlen(expected) && + memcmp(expected, v, n) == 0) { + // ok + return; + } else { + fprintf(stderr, "%s: expected '%s', got '%s'\n", + phase, + (expected ? expected : "(null)"), + (v ? 
v : "(null")); + abort(); + } +} + +static void Free(char** ptr) { + if (*ptr) { + free(*ptr); + *ptr = NULL; + } +} + +static void CheckGet( + rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_get(db, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckIter(rocksdb_iterator_t* iter, + const char* key, const char* val) { + size_t len; + const char* str; + str = rocksdb_iter_key(iter, &len); + CheckEqual(key, str, len); + str = rocksdb_iter_value(iter, &len); + CheckEqual(val, str, len); +} + +// Callback from rocksdb_writebatch_iterate() +static void CheckPut(void* ptr, + const char* k, size_t klen, + const char* v, size_t vlen) { + int* state = (int*) ptr; + CheckCondition(*state < 2); + switch (*state) { + case 0: + CheckEqual("bar", k, klen); + CheckEqual("b", v, vlen); + break; + case 1: + CheckEqual("box", k, klen); + CheckEqual("c", v, vlen); + break; + } + (*state)++; +} + +// Callback from rocksdb_writebatch_iterate() +static void CheckDel(void* ptr, const char* k, size_t klen) { + int* state = (int*) ptr; + CheckCondition(*state == 2); + CheckEqual("bar", k, klen); + (*state)++; +} + +static void CmpDestroy(void* arg) { } + +static int CmpCompare(void* arg, const char* a, size_t alen, + const char* b, size_t blen) { + int n = (alen < blen) ? 
alen : blen; + int r = memcmp(a, b, n); + if (r == 0) { + if (alen < blen) r = -1; + else if (alen > blen) r = +1; + } + return r; +} + +static const char* CmpName(void* arg) { + return "foo"; +} + +// Custom filter policy +static unsigned char fake_filter_result = 1; +static void FilterDestroy(void* arg) { } +static const char* FilterName(void* arg) { + return "TestFilter"; +} +static char* FilterCreate( + void* arg, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length) { + *filter_length = 4; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +unsigned char FilterKeyMatch( + void* arg, + const char* key, size_t length, + const char* filter, size_t filter_length) { + CheckCondition(filter_length == 4); + CheckCondition(memcmp(filter, "fake", 4) == 0); + return fake_filter_result; +} + +// Custom merge operator +static void MergeOperatorDestroy(void* arg) { } +static const char* MergeOperatorName(void* arg) { + return "TestMergeOperator"; +} +static char* MergeOperatorFullMerge( + void* arg, + const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length) { + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +static char* MergeOperatorPartialMerge( + void* arg, + const char* key, size_t key_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length) { + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} + +int main(int argc, char** argv) { + rocksdb_t* db; + rocksdb_comparator_t* cmp; + rocksdb_cache_t* cache; + rocksdb_env_t* env; + rocksdb_options_t* options; + rocksdb_readoptions_t* roptions; + 
rocksdb_writeoptions_t* woptions; + char* err = NULL; + int run = -1; + + snprintf(dbname, sizeof(dbname), + "%s/rocksdb_c_test-%d", + GetTempDir(), + ((int) geteuid())); + + StartPhase("create_objects"); + cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + env = rocksdb_create_default_env(); + cache = rocksdb_cache_create_lru(100000); + + options = rocksdb_options_create(); + rocksdb_options_set_comparator(options, cmp); + rocksdb_options_set_error_if_exists(options, 1); + rocksdb_options_set_cache(options, cache); + rocksdb_options_set_env(options, env); + rocksdb_options_set_info_log(options, NULL); + rocksdb_options_set_write_buffer_size(options, 100000); + rocksdb_options_set_paranoid_checks(options, 1); + rocksdb_options_set_max_open_files(options, 10); + rocksdb_options_set_block_size(options, 1024); + rocksdb_options_set_block_restart_interval(options, 8); + rocksdb_options_set_compression(options, rocksdb_no_compression); + rocksdb_options_set_compression_options(options, -14, -1, 0); + int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, + rocksdb_no_compression, rocksdb_no_compression}; + rocksdb_options_set_compression_per_level(options, compression_levels, 4); + + roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(roptions, 1); + rocksdb_readoptions_set_fill_cache(roptions, 0); + + woptions = rocksdb_writeoptions_create(); + rocksdb_writeoptions_set_sync(woptions, 1); + + StartPhase("destroy"); + rocksdb_destroy_db(options, dbname, &err); + Free(&err); + + StartPhase("open_error"); + db = rocksdb_open(options, dbname, &err); + CheckCondition(err != NULL); + Free(&err); + + StartPhase("open"); + rocksdb_options_set_create_if_missing(options, 1); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + + StartPhase("put"); + rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckGet(db, roptions, 
"foo", "hello"); + + StartPhase("compactall"); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactrange"); + rocksdb_compact_range(db, "a", 1, "z", 1); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("writebatch"); + { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + int pos = 0; + rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); + CheckCondition(pos == 3); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("iter"); + { + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "box", "c"); + rocksdb_iter_next(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_prev(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_seek(iter, "b", 1); + CheckIter(iter, "box", "c"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + } + + StartPhase("approximate_sizes"); + { + int i; + int n = 20000; + char keybuf[100]; + char valbuf[100]; + uint64_t sizes[2]; + const char* start[2] = { "a", "k00000000000000010000" }; + size_t start_len[2] = { 1, 21 }; + const char* limit[2] = { "k00000000000000010000", "z" }; + size_t limit_len[2] = { 21, 1 }; + rocksdb_writeoptions_set_sync(woptions, 0); + for (i = 0; i < n; i++) { + snprintf(keybuf, sizeof(keybuf), "k%020d", i); 
+ snprintf(valbuf, sizeof(valbuf), "v%020d", i); + rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), + &err); + CheckNoError(err); + } + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + CheckCondition(sizes[0] > 0); + CheckCondition(sizes[1] > 0); + } + + StartPhase("property"); + { + char* prop = rocksdb_property_value(db, "nosuchprop"); + CheckCondition(prop == NULL); + prop = rocksdb_property_value(db, "rocksdb.stats"); + CheckCondition(prop != NULL); + Free(&prop); + } + + StartPhase("snapshot"); + { + const rocksdb_snapshot_t* snap; + snap = rocksdb_create_snapshot(db); + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", "hello"); + rocksdb_readoptions_set_snapshot(roptions, NULL); + CheckGet(db, roptions, "foo", NULL); + rocksdb_release_snapshot(db, snap); + } + + StartPhase("repair"); + { + // If we do not compact here, then the lazy deletion of + // files (https://reviews.facebook.net/D6123) would leave + // around deleted files and the repair process will find + // those files and put them back into the database. 
+ rocksdb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_close(db); + rocksdb_options_set_create_if_missing(options, 0); + rocksdb_options_set_error_if_exists(options, 0); + rocksdb_repair_db(options, dbname, &err); + CheckNoError(err); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_set_error_if_exists(options, 1); + } + + StartPhase("filter"); + for (run = 0; run < 2; run++) { + // First run uses custom filter, second run uses bloom filter + CheckNoError(err); + rocksdb_filterpolicy_t* policy; + if (run == 0) { + policy = rocksdb_filterpolicy_create( + NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName); + } else { + policy = rocksdb_filterpolicy_create_bloom(10); + } + + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_filter_policy(options, policy); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + + fake_filter_result = 1; + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + if (phase == 0) { + // Must not find value when custom filter returns false + fake_filter_result = 0; + CheckGet(db, roptions, "foo", NULL); + CheckGet(db, roptions, "bar", NULL); + fake_filter_result = 1; + + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + } + rocksdb_options_set_filter_policy(options, NULL); + rocksdb_filterpolicy_destroy(policy); + } + + StartPhase("merge_operator"); + { + rocksdb_mergeoperator_t* merge_operator; + merge_operator = rocksdb_mergeoperator_create( + NULL, MergeOperatorDestroy, 
MergeOperatorFullMerge, + MergeOperatorPartialMerge, NULL, MergeOperatorName); + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_merge_operator(options, merge_operator); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "foovalue"); + rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "fake"); + + // Merge of a non-existing value + rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "bar", "fake"); + + } + + StartPhase("prefix"); + { + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10); + rocksdb_options_set_filter_policy(options, policy); + rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); + rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4); + rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err); + CheckNoError(err); + + rocksdb_readoptions_set_prefix_seek(roptions, 1); + + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_seek(iter, "bar", 3); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + 
CheckCondition(rocksdb_iter_valid(iter)); + + CheckIter(iter, "bar1", "bar"); + rocksdb_iter_next(iter); + CheckIter(iter, "bar2", "bar"); + rocksdb_iter_next(iter); + CheckIter(iter, "bar3", "bar"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + rocksdb_filterpolicy_destroy(policy); + } + + StartPhase("cleanup"); + rocksdb_close(db); + rocksdb_options_destroy(options); + rocksdb_readoptions_destroy(roptions); + rocksdb_writeoptions_destroy(woptions); + rocksdb_cache_destroy(cache); + rocksdb_comparator_destroy(cmp); + rocksdb_env_destroy(env); + + fprintf(stderr, "PASS\n"); + return 0; +} diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 00000000..8bb4f9c6 --- /dev/null +++ b/db/compaction.cc @@ -0,0 +1,245 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + is_manual_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. 
+ return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. 
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > max_grandparent_overlap_bytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. 
+ if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +/* +for sizes >=10TB, print "XXTB" +for sizes >=10GB, print "XXGB" +etc. +*/ +static void FileSizeSummary(unsigned long long sz, char* output, int len) { + const unsigned long long ull10 = 10; + if (sz >= ull10<<40) { + snprintf(output, len, "%lluTB", sz>>40); + } else if (sz >= ull10<<30) { + snprintf(output, len, "%lluGB", sz>>30); + } else if (sz >= ull10<<20) { + snprintf(output, len, "%lluMB", sz>>20); + } else if (sz >= ull10<<10) { + snprintf(output, len, "%lluKB", sz>>10); + } else { + snprintf(output, len, "%lluB", sz); + } +} + +static int InputSummary(std::vector& files, char* output, + int len) { + *output = '\0'; + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret; + char sztxt[16]; + FileSizeSummary((unsigned long long)files.at(i)->file_size, sztxt, 16); + ret = snprintf(output + write, sz, "%lu(%s) ", + (unsigned long)files.at(i)->number, + sztxt); + if (ret < 0 || ret >= sz) + break; + write += ret; + } + return write; +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs: [", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write >= len) { + return; + } + + write += InputSummary(inputs_[0], output+write, len-write); + if (write < 0 || write >= len) { + return; + } + + write += snprintf(output+write, len-write, 
"],["); + if (write < 0 || write >= len) { + return; + } + + write += InputSummary(inputs_[1], output+write, len-write); + if (write < 0 || write >= len) { + return; + } + + snprintf(output+write, len-write, "]"); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 00000000..ead1d87a --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,143 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Returns input version of the compaction + Version* input_version() const { return input_version_; } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + std::vector* inputs(int which) { return &inputs_[which]; } + + // Maximum size of files to build during this compaction. 
+ uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? + bool IsFullCompaction() { return is_full_compaction_; } + + // Was this compaction triggered manually by the client? 
+ bool IsManualCompaction() { return is_manual_compaction_; } + + private: + friend class CompactionPicker; + friend class UniversalCompactionPicker; + friend class LevelCompactionPicker; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t max_grandparent_overlap_bytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // Is this compaction requested by the client? + bool is_manual_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). 
+ std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc new file mode 100644 index 00000000..db9cacfc --- /dev/null +++ b/db/compaction_picker.cc @@ -0,0 +1,882 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction_picker.h" + +#include +#include "util/log_buffer.h" +#include "util/statistics.h" + +namespace rocksdb { + +namespace { + +uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +// Multiple two operands. If they overflow, return op1. 
+uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { + if (op1 == 0) { + return 0; + } + if (op2 <= 0) { + return op1; + } + uint64_t casted_op2 = (uint64_t) op2; + if (std::numeric_limits::max() / op1 < casted_op2) { + return op1; + } + return op1 * casted_op2; +} + +} // anonymous namespace + +CompactionPicker::CompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : compactions_in_progress_(options->num_levels), + options_(options), + num_levels_(options->num_levels), + icmp_(icmp) { + + max_file_size_.reset(new uint64_t[NumberLevels()]); + level_max_bytes_.reset(new uint64_t[NumberLevels()]); + int target_file_size_multiplier = options_->target_file_size_multiplier; + int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; + for (int i = 0; i < NumberLevels(); i++) { + if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { + max_file_size_[i] = ULLONG_MAX; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } else if (i > 1) { + max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1], + target_file_size_multiplier); + level_max_bytes_[i] = MultiplyCheckOverflow( + MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier), + options_->max_bytes_for_level_multiplier_additional[i - 1]); + } else { + max_file_size_[i] = options_->target_file_size_base; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } + } +} + +CompactionPicker::~CompactionPicker() {} + +void CompactionPicker::SizeBeingCompacted(std::vector& sizes) { + for (int level = 0; level < NumberLevels() - 1; level++) { + uint64_t total = 0; + for (auto c : compactions_in_progress_[level]) { + assert(c->level() == level); + for (int i = 0; i < c->num_input_files(0); i++) { + total += c->input(0,i)->file_size; + } + } + sizes[level] = total; + } +} + +// Clear all files to indicate that they are not being compacted +// Delete this compaction from the list of running compactions. 
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { + c->MarkFilesBeingCompacted(false); + compactions_in_progress_[c->level()].erase(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return max_file_size_[level]; +} + +uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->max_grandparent_overlap_factor; + return result; +} + +double CompactionPicker::MaxBytesForLevel(int level) { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + assert(level >= 0); + assert(level < NumberLevels()); + return level_max_bytes_[level]; +} + +void CompactionPicker::GetRange(const std::vector& inputs, + InternalKey* smallest, InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_->Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_->Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +void CompactionPicker::GetRange(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + +bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { + // If inputs are empty then there is nothing to expand. + if (!c || c->inputs_[0].empty()) { + return true; + } + + // GetOverlappingInputs will always do the right thing for level-0. + // So we don't need to do any expansion if level == 0. 
+ if (c->level() == 0) { + return true; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Keep expanding c->inputs_[0] until we are sure that there is a + // "clean cut" boundary between the files in input and the surrounding files. + // This will ensure that no parts of a key are lost during compaction. + int hint_index = -1; + size_t old_size; + do { + old_size = c->inputs_[0].size(); + GetRange(c->inputs_[0], &smallest, &largest); + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); + } while(c->inputs_[0].size() > old_size); + + // Get the new range + GetRange(c->inputs_[0], &smallest, &largest); + + // If, after the expansion, there are files that are already under + // compaction, then we must drop/cancel this compaction. + int parent_index = -1; + if (c->inputs_[0].empty()) { + Log(options_->info_log, + "ExpandWhileOverlapping() failure because zero input files"); + } + if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0]) || + (c->level() != c->output_level() && + ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + &parent_index))) { + c->inputs_[0].clear(); + c->inputs_[1].clear(); + return false; + } + return true; +} + +uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->expanded_compaction_factor; + return result; +} + +// Returns true if any one of specified files are being compacted +bool CompactionPicker::FilesInCompaction(std::vector& files) { + for (unsigned int i = 0; i < files.size(); i++) { + if (files[i]->being_compacted) { + return true; + } + } + return false; +} + +// Returns true if any one of the parent files are being compacted +bool CompactionPicker::ParentRangeInCompaction(Version* version, + const InternalKey* smallest, + const InternalKey* largest, + int level, int* parent_index) { + std::vector inputs; + 
assert(level + 1 < NumberLevels()); + + version->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); + return FilesInCompaction(inputs); +} + +// Populates the set of inputs from "level+1" that overlap with "level". +// Will also attempt to expand "level" if that doesn't expand "level+1" +// or cause "level" to include a file for compaction that has an overlapping +// user-key with another file. +void CompactionPicker::SetupOtherInputs(Compaction* c) { + // If inputs are empty, then there is nothing to expand. + // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { + return; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Get the range one last time. + GetRange(c->inputs_[0], &smallest, &largest); + + // Populate the set of next-level files (inputs_[1]) to include in compaction + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); + + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + + // See if we can further grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. We also choose NOT + // to expand if this would cause "level" to include some entries for some + // user key, while excluding other entries for the same user key. This + // can happen when one user key spans multiple files. 
+ if (!c->inputs_[1].empty()) { + std::vector expanded0; + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); + const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); + const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); + const uint64_t expanded0_size = TotalFileSize(expanded0); + uint64_t limit = ExpandedCompactionByteSizeLimit(level); + if (expanded0.size() > c->inputs_[0].size() && + inputs1_size + expanded0_size < limit && + !FilesInCompaction(expanded0) && + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); + if (expanded1.size() == c->inputs_[1].size() && + !FilesInCompaction(expanded1)) { + Log(options_->info_log, + "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" + "\n", + (unsigned long)level, + (unsigned long)(c->inputs_[0].size()), + (unsigned long)(c->inputs_[1].size()), + (unsigned long)inputs0_size, + (unsigned long)inputs1_size, + (unsigned long)(expanded0.size()), + (unsigned long)(expanded1.size()), + (unsigned long)expanded0_size, + (unsigned long)inputs1_size); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < NumberLevels()) { + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); + } +} + + +Compaction* CompactionPicker::CompactRange(Version* version, int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + std::vector 
inputs; + bool covering_the_whole_range = true; + + // All files are 'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (options_->compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + version->GetOverlappingInputs(input_level, begin, end, &inputs); + if (inputs.empty()) { + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; + uint64_t total = 0; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; + inputs.resize(i + 1); + break; + } + } + } + Compaction* c = new Compaction(version, input_level, output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level)); + + c->inputs_[0] = inputs; + if (ExpandWhileOverlapping(c) == false) { + delete c; + Log(options_->info_log, "Could not compact due to expansion failure.\n"); + return nullptr; + } + + SetupOtherInputs(c); + + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + + // These files that are to be manaully compacted do not trample + // upon other files because manual compactions are processed when + // the system has a max of 1 background compaction thread. 
+ c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(true); + + c->is_manual_compaction_ = true; + + return c; +} + +Compaction* LevelCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { + Compaction* c = nullptr; + int level = -1; + + // Compute the compactions needed. It is better to do it here + // and also in LogAndApply(), otherwise the values could be stale. + std::vector size_being_compacted(NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + version->ComputeCompactionScore(size_being_compacted); + + // We prefer compactions triggered by too much data in a level over + // the compactions triggered by seeks. + // + // Find the compactions by size on all levels. + for (int i = 0; i < NumberLevels() - 1; i++) { + assert(i == 0 || + version->compaction_score_[i] <= version->compaction_score_[i - 1]); + level = version->compaction_level_[i]; + if ((version->compaction_score_[i] >= 1)) { + c = PickCompactionBySize(version, level, version->compaction_score_[i]); + if (ExpandWhileOverlapping(c) == false) { + delete c; + c = nullptr; + } else { + break; + } + } + } + + // Find compactions needed by seeks + FileMetaData* f = version->file_to_compact_; + if (c == nullptr && f != nullptr && !f->being_compacted) { + + level = version->file_to_compact_level_; + int parent_index = -1; + + // Only allow one level 0 compaction at a time. + // Do not pick this file if its parents at level+1 are being compacted. 
+ if (level != 0 || compactions_in_progress_[0].empty()) { + if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level, + &parent_index)) { + c = new Compaction(version, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); + c->inputs_[0].push_back(f); + c->parent_index_ = parent_index; + c->input_version_->file_to_compact_ = nullptr; + if (ExpandWhileOverlapping(c) == false) { + return nullptr; + } + } + } + } + + if (c == nullptr) { + return nullptr; + } + + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. + if (level == 0) { + assert(compactions_in_progress_[0].empty()); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. 
So, re-invoke GetRange to get the new key range + GetRange(c->inputs_[0], &smallest, &largest); + if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + &c->parent_index_)) { + delete c; + return nullptr; + } + assert(!c->inputs_[0].empty()); + } + + // Setup "level+1" files (inputs_[1]) + SetupOtherInputs(c); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(false); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + return c; +} + +Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, + int level, + double score) { + Compaction* c = nullptr; + + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. + if (level == 0 && compactions_in_progress_[level].size() == 1) { + return nullptr; + } + + assert(level >= 0); + assert(level + 1 < NumberLevels()); + c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); + c->score_ = score; + + // Pick the largest file in this level that is not already + // being compacted + std::vector& file_size = c->input_version_->files_by_size_[level]; + + // record the first file that is not yet compacted + int nextIndex = -1; + + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; + i < file_size.size(); i++) { + int index = file_size[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + + // check to verify files are arranged in descending size + assert((i == file_size.size() - 1) || + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); + + // do not pick a file to compact if it is being 
compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + // remember the startIndex for the next call to PickCompaction + if (nextIndex == -1) { + nextIndex = i; + } + + //if (i > Version::number_of_files_to_sort_) { + // Log(options_->info_log, "XXX Looking at index %d", i); + //} + + // Do not pick this file if its parents at level+1 are being compacted. + // Maybe we can avoid redoing this work in SetupOtherInputs + int parent_index = -1; + if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest, + level, &parent_index)) { + continue; + } + c->inputs_[0].push_back(f); + c->base_index_ = index; + c->parent_index_ = parent_index; + break; + } + + if (c->inputs_[0].empty()) { + delete c; + c = nullptr; + } + + // store where to start the iteration in the next call to PickCompaction + version->next_file_to_compact_by_size_[level] = nextIndex; + + return c; +} + +// Universal style of compaction. Pick files that are contiguous in +// time-range to compact. +// +Compaction* UniversalCompactionPicker::PickCompaction(Version* version, + LogBuffer* log_buffer) { + int level = 0; + double score = version->compaction_score_[0]; + + if ((version->files_[level].size() < + (unsigned int)options_->level0_file_num_compaction_trigger)) { + LogToBuffer(log_buffer, "Universal: nothing to do\n"); + return nullptr; + } + Version::FileSummaryStorage tmp; + LogToBuffer(log_buffer, "Universal: candidate files(%zu): %s\n", + version->files_[level].size(), + version->LevelFileSummary(&tmp, 0)); + + // Check for size amplification first. + Compaction* c; + if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) != + nullptr) { + LogToBuffer(log_buffer, "Universal: compacting for size amp\n"); + } else { + // Size amplification is within limits. Try reducing read + // amplification while maintaining file size ratios. 
+ unsigned int ratio = options_->compaction_options_universal.size_ratio; + + if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX, + log_buffer)) != nullptr) { + LogToBuffer(log_buffer, "Universal: compacting for size ratio\n"); + } else { + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. + unsigned int num_files = version->files_[level].size() - + options_->level0_file_num_compaction_trigger; + if ((c = PickCompactionUniversalReadAmp( + version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { + LogToBuffer(log_buffer, "Universal: compacting for file num\n"); + } + } + } + if (c == nullptr) { + return nullptr; + } + assert(c->inputs_[0].size() > 1); + + // validate that all the chosen files are non overlapping in time + FileMetaData* newerfile __attribute__((unused)) = nullptr; + for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { + FileMetaData* f = c->inputs_[0][i]; + assert (f->smallest_seqno <= f->largest_seqno); + assert(newerfile == nullptr || + newerfile->smallest_seqno > f->largest_seqno); + newerfile = f; + } + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = c->input_version_->files_by_size_[level]; + + // Is the earliest file part of this compaction? 
+ int last_index = file_by_time[file_by_time.size()-1]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; + if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { + c->bottommost_level_ = true; + } + + // update statistics + MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs_[0].size()); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + // Record whether this compaction includes all sst files. + // For now, it is only relevant in universal compaction mode. + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); + + return c; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( + Version* version, double score, unsigned int ratio, + unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) { + int level = 0; + + unsigned int min_merge_width = + options_->compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + options_->compaction_options_universal.max_merge_width; + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = version->files_by_size_[level]; + FileMetaData* f = nullptr; + bool done = false; + int start_index = 0; + unsigned int candidate_count = 0; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int max_files_to_compact = std::min(max_merge_width, + max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. 
+ for (unsigned int loop = 0; loop < file_by_time.size(); loop++) { + + candidate_count = 0; + + // Skip files that are already being compacted + for (f = nullptr; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + + if (!f->being_compacted) { + candidate_count = 1; + break; + } + LogToBuffer(log_buffer, + "Universal: file %lu[%d] being compacted, skipping", + (unsigned long)f->number, loop); + f = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = f != nullptr? f->file_size : 0; + if (f != nullptr) { + LogToBuffer(log_buffer, "Universal: Possible candidate file %lu[%d].", + (unsigned long)f->number, loop); + } + + // Check if the suceeding files need compaction. + for (unsigned int i = loop+1; + candidate_count < max_files_to_compact && i < file_by_time.size(); + i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + if (f->being_compacted) { + break; + } + // Pick files if the total/last candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. + // candidate_size is the total size of files picked so far with the + // default kCompactionStopStyleTotalSize; with + // kCompactionStopStyleSimilarSize, it's simply the size of the last + // picked file. + uint64_t sz = (candidate_size * (100L + ratio)) /100; + if (sz < f->file_size) { + break; + } + if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { + // Similar-size stopping rule: also check the last picked file isn't + // far larger than the next candidate file. + sz = (f->file_size * (100L + ratio)) / 100; + if (sz < candidate_size) { + // If the small file we've encountered begins a run of similar-size + // files, we'll pick them up on a future iteration of the outer + // loop. 
If it's some lonely straggler, it'll eventually get picked + // by the last-resort read amp strategy which disregards size ratios. + break; + } + candidate_size = f->file_size; + } else { // default kCompactionStopStyleTotalSize + candidate_size += f->file_size; + } + candidate_count++; + } + + // Found a series of consecutive files that need compaction. + if (candidate_count >= (unsigned int)min_merge_width) { + start_index = loop; + done = true; + break; + } else { + for (unsigned int i = loop; + i < loop + candidate_count && i < file_by_time.size(); i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + LogToBuffer(log_buffer, + "Universal: Skipping file %lu[%d] with size %lu %d\n", + (unsigned long)f->number, i, (unsigned long)f->file_size, + f->being_compacted); + } + } + } + if (!done || candidate_count <= 1) { + return nullptr; + } + unsigned int first_index_after = start_index + candidate_count; + // Compression is enabled if files compacted earlier already reached + // size ratio of compression. 
+ bool enable_compression = true; + int ratio_to_compress = + options_->compaction_options_universal.compression_size_percent; + if (ratio_to_compress >= 0) { + uint64_t total_size = version->NumLevelBytes(level); + uint64_t older_file_size = 0; + for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; + i--) { + older_file_size += version->files_[level][file_by_time[i]]->file_size; + if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { + enable_compression = false; + break; + } + } + } + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); + c->score_ = score; + + for (unsigned int i = start_index; i < first_index_after; i++) { + int index = file_by_time[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + LogToBuffer(log_buffer, "Universal: Picking file %lu[%d] with size %lu\n", + (unsigned long)f->number, i, (unsigned long)f->file_size); + } + return c; +} + +// Look at overall size amplification. If size amplification +// exceeeds the configured value, then do a compaction +// of the candidate files all the way upto the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( + Version* version, double score, LogBuffer* log_buffer) { + int level = 0; + + // percentage flexibilty while reducing size amplification + uint64_t ratio = options_->compaction_options_universal. + max_size_amplification_percent; + + // The files are sorted from newest first to oldest last. 
+ std::vector& file_by_time = version->files_by_size_[level]; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + unsigned int start_index = 0; + FileMetaData* f = nullptr; + + // Skip files that are already being compacted + for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (!f->being_compacted) { + start_index = loop; // Consider this as the first candidate. + break; + } + LogToBuffer(log_buffer, "Universal: skipping file %lu[%d] compacted %s", + (unsigned long)f->number, loop, + " cannot be a candidate to reduce size amp.\n"); + f = nullptr; + } + if (f == nullptr) { + return nullptr; // no candidate files + } + + LogToBuffer(log_buffer, "Universal: First candidate file %lu[%d] %s", + (unsigned long)f->number, start_index, " to reduce size amp.\n"); + + // keep adding up all the remaining files + for (unsigned int loop = start_index; loop < file_by_time.size() - 1; + loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (f->being_compacted) { + LogToBuffer( + log_buffer, "Universal: Possible candidate file %lu[%d] %s.", + (unsigned long)f->number, loop, + " is already being compacted. No size amp reduction possible.\n"); + return nullptr; + } + candidate_size += f->file_size; + candidate_count++; + } + if (candidate_count == 0) { + return nullptr; + } + + // size of earliest file + int index = file_by_time[file_by_time.size() - 1]; + uint64_t earliest_file_size = version->files_[level][index]->file_size; + + // size amplification = percentage of additional size + if (candidate_size * 100 < ratio * earliest_file_size) { + LogToBuffer(log_buffer, + "Universal: size amp not needed. 
newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + return nullptr; + } else { + LogToBuffer(log_buffer, + "Universal: size amp needed. newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + } + assert(start_index >= 0 && start_index < file_by_time.size() - 1); + + // create a compaction request + // We always compact all the files, so always compress. + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); + c->score_ = score; + for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + LogToBuffer(log_buffer, + "Universal: size amp picking file %lu[%d] with size %lu", + (unsigned long)f->number, index, (unsigned long)f->file_size); + } + return c; +} + +} // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h new file mode 100644 index 00000000..4793b1ba --- /dev/null +++ b/db/compaction_picker.h @@ -0,0 +1,163 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include "db/version_set.h" +#include "db/compaction.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" + +#include +#include +#include + +namespace rocksdb { + +class LogBuffer; +class Compaction; +class Version; + +class CompactionPicker { + public: + CompactionPicker(const Options* options, const InternalKeyComparator* icmp); + virtual ~CompactionPicker(); + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) = 0; + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns nullptr if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(Version* version, int input_level, int output_level, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); + + // Free up the files that participated in a compaction + void ReleaseCompactionFiles(Compaction* c, Status status); + + // Return the total amount of data that is undergoing + // compactions per level + void SizeBeingCompacted(std::vector& sizes); + + // Returns maximum total overlap bytes with grandparent + // level (i.e., level+2) before we stop building a single + // file in level->level+1 compaction. 
+ uint64_t MaxGrandParentOverlapBytes(int level); + + // Returns maximum total bytes of data on a given level. + double MaxBytesForLevel(int level); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level) const; + + protected: + int NumberLevels() const { return num_levels_; } + + // Stores the minimal range that covers all entries in inputs in + // *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs, InternalKey* smallest, + InternalKey* largest); + + // Stores the minimal range that covers all entries in inputs1 and inputs2 + // in *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, InternalKey* largest); + + // Add more files to the inputs on "level" to make sure that + // no newer version of a key is compacted to "level+1" while leaving an older + // version in a "level". Otherwise, any Get() will search "level" first, + // and will likely return an old/stale value for the key, since it always + // searches in increasing order of level to find the value. This could + // also scramble the order of merge operands. This function should be + // called any time a new Compaction is created, and its inputs_[0] are + // populated. + // + // Will return false if it is impossible to apply this compaction. 
+ bool ExpandWhileOverlapping(Compaction* c); + + uint64_t ExpandedCompactionByteSizeLimit(int level); + + // Returns true if any one of the specified files are being compacted + bool FilesInCompaction(std::vector& files); + + // Returns true if any one of the parent files are being compacted + bool ParentRangeInCompaction(Version* version, const InternalKey* smallest, + const InternalKey* largest, int level, + int* index); + + void SetupOtherInputs(Compaction* c); + + // record all the ongoing compactions for all levels + std::vector> compactions_in_progress_; + + // Per-level target file size. + std::unique_ptr max_file_size_; + + // Per-level max bytes + std::unique_ptr level_max_bytes_; + + const Options* const options_; + private: + int num_levels_; + + const InternalKeyComparator* const icmp_; +}; + +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) override; + + private: + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionUniversalReadAmp(Version* version, double score, + unsigned int ratio, + unsigned int num_files, + LogBuffer* log_buffer); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionUniversalSizeAmp(Version* version, double score, + LogBuffer* log_buffer); +}; + +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version, + LogBuffer* log_buffer) override; + + private: + // For the specfied level, pick a compaction. + // Returns nullptr if there is no compaction to be done. 
+ // If level is 0 and there is already a compaction on that level, this + // function will return nullptr. + Compaction* PickCompactionBySize(Version* version, int level, double score); +}; + +} // namespace rocksdb diff --git a/db/corruption_test.cc b/db/corruption_test.cc new file mode 100644 index 00000000..f0397b6f --- /dev/null +++ b/db/corruption_test.cc @@ -0,0 +1,416 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/db.h" + +#include +#include +#include +#include +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static const int kValueSize = 1000; + +class CorruptionTest { + public: + test::ErrorEnv env_; + std::string dbname_; + shared_ptr tiny_cache_; + Options options_; + DB* db_; + + CorruptionTest() { + tiny_cache_ = NewLRUCache(100); + options_.env = &env_; + dbname_ = test::TmpDir() + "/db_test"; + DestroyDB(dbname_, options_); + + db_ = nullptr; + options_.create_if_missing = true; + options_.block_size_deviation = 0; // make unit test pass for now + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() { + delete db_; + DestroyDB(dbname_, Options()); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opt = (options ? 
*options : options_); + opt.env = &env_; + opt.block_cache = tiny_cache_; + opt.block_size_deviation = 0; + opt.arena_block_size = 4096; + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void RepairDB() { + delete db_; + db_ = nullptr; + ASSERT_OK(::rocksdb::RepairDB(dbname_, options_)); + } + + void Build(int n) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Check(int min_expected, int max_expected) { + unsigned int next_expected = 0; + int missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + // Do not verify checksums. If we verify checksums then the + // db itself will raise errors because data is corrupted. + // Instead, we want the reads to be successful and this test + // will detect whether the appropriate corruptions have + // occured. 
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || + !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(key, &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + + fprintf(stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", + min_expected, max_expected, correct, bad_keys, bad_values, missed); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + int picked_number = -1; + for (unsigned int i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type == filetype && + int(number) > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + ASSERT_TRUE(false) << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = sbuf.st_size + offset; + } + } + if (offset > sbuf.st_size) { + offset = sbuf.st_size; + } + if (offset + bytes_to_corrupt > sbuf.st_size) { + bytes_to_corrupt = sbuf.st_size - offset; + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, 
fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } +}; + +TEST(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Reopen(); + + // The 64 records in the first two log blocks are completely lost. + Check(36, 36); +} + +TEST(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = 3 + (Options().write_buffer_size / kValueSize); + std::string value_storage; + Status s; + bool failed = false; + for (int i = 0; i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + if (!s.ok()) { + failed = true; + } + ASSERT_TRUE(!failed || !s.ok()); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); +} + +TEST(CorruptionTest, 
TableFileIndexData) { + Build(10000); // Enough to build multiple Tables + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + + Corrupt(kTableFile, -2000, 500); + Reopen(); + Check(5000, 9999); +} + +TEST(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST(CorruptionTest, CompactionInputError) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + const int last = dbi->MaxMemCompactionLevel(); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last))); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); +} + +TEST(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = 
true; + options.write_buffer_size = 1048576; + Reopen(&options); + DBImpl* dbi = reinterpret_cast(db_); + + // Fill levels >= 1 so memtable compaction outputs to level 1 + for (int level = 1; level < dbi->NumberLevels(); level++) { + dbi->Put(WriteOptions(), "", "begin"); + dbi->Put(WriteOptions(), "~", "end"); + dbi->TEST_FlushMemTable(); + } + + Build(10); + dbi->TEST_FlushMemTable(); + dbi->TEST_WaitForCompact(); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + bool failed = false; + for (int i = 0; i < 10000 && s.ok(); i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + if (!s.ok()) { + failed = true; + } + // if one write failed, every subsequent write must fail, too + ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db"; + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + Corrupt(kTableFile, 100, 1); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_FlushMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +TEST(CorruptionTest, FileSystemStateCorrupted) { + for (int iter = 0; iter < 2; ++iter) { + Options options; + options.paranoid_checks = true; + options.create_if_missing = true; + Reopen(&options); + Build(10); + ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = reinterpret_cast(db_); + std::vector metadata; + dbi->GetLiveFilesMetaData(&metadata); + ASSERT_GT(metadata.size(), size_t(0)); + std::string filename = dbname_ + metadata[0].name; + + 
delete db_; + db_ = nullptr; + + if (iter == 0) { // corrupt file size + unique_ptr file; + env_.NewWritableFile(filename, &file, EnvOptions()); + file->Append(Slice("corrupted sst")); + file.reset(); + } else { // delete the file + env_.DeleteFile(filename); + } + + Status x = TryReopen(&options); + ASSERT_TRUE(x.IsCorruption()); + DestroyDB(dbname_, options_); + Reopen(&options); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/db_bench.cc b/db/db_bench.cc new file mode 100644 index 00000000..17c5a9e5 --- /dev/null +++ b/db/db_bench.cc @@ -0,0 +1,2830 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#define __STDC_FORMAT_MACROS +#include +#include +#include +#include +#include +#include +#include "db/db_impl.h" +#include "db/version_set.h" +#include "rocksdb/statistics.h" +#include "rocksdb/options.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/perf_context.h" +#include "port/port.h" +#include "util/bit_set.h" +#include "util/crc32c.h" +#include "util/histogram.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stack_trace.h" +#include "util/string_util.h" +#include "util/statistics.h" +#include "util/testutil.h" +#include "hdfs/env_hdfs.h" +#include "utilities/merge_operators.h" + + +DEFINE_string(benchmarks, + + "fillseq," + "fillsync," + "fillrandom," + "overwrite," + "readrandom," + "readrandom," + "readseq," + "readreverse," + "compact," + "readrandom," + "readseq," + "readtocache," + "readreverse," + "readwhilewriting," + "readrandomwriterandom," + "updaterandom," + "randomwithverify," + "fill100K," + "crc32c," + "compress," + "uncompress," + "acquireload," + "fillfromstdin,", + + "Comma-separated list of operations to run in the specified order" + "Actual benchmarks:\n" + "\tfillseq -- write N values in sequential key" + " order in async mode\n" + "\tfillrandom -- write N values in random key order in async" + " mode\n" + "\toverwrite -- overwrite N values in random key order in" + " async mode\n" + "\tfillsync -- write N/100 values in random key order in " + "sync mode\n" + "\tfill100K -- write N/1000 100K values in random order in" + " async mode\n" + "\tdeleteseq -- delete N keys in sequential order\n" + "\tdeleterandom -- delete N keys in random order\n" + "\treadseq -- read N times sequentially\n" + "\treadtocache -- 1 thread reading database sequentially\n" + "\treadreverse -- read N times in reverse 
order\n" + "\treadrandom -- read N times in random order\n" + "\treadmissing -- read N missing keys in random order\n" + "\treadhot -- read N times in random order from 1% section " + "of DB\n" + "\treadwhilewriting -- 1 writer, N threads doing random " + "reads\n" + "\treadrandomwriterandom -- N threads doing random-read, " + "random-write\n" + "\tprefixscanrandom -- prefix scan N times in random order\n" + "\tupdaterandom -- N threads doing read-modify-write for random " + "keys\n" + "\tappendrandom -- N threads doing read-modify-write with " + "growing values\n" + "\tmergerandom -- same as updaterandom/appendrandom using merge" + " operator. " + "Must be used with merge_operator\n" + "\treadrandommergerandom -- perform N random read-or-merge " + "operations. Must be used with merge_operator\n" + "\tnewiterator -- repeated iterator creation\n" + "\tseekrandom -- N random seeks\n" + "\tcrc32c -- repeated crc32c of 4K of data\n" + "\tacquireload -- load N*1000 times\n" + "Meta operations:\n" + "\tcompact -- Compact the entire DB\n" + "\tstats -- Print DB stats\n" + "\tlevelstats -- Print the number of files and bytes per level\n" + "\tsstables -- Print sstable info\n" + "\theapprofile -- Dump a heap profile (if supported by this" + " port)\n"); + +DEFINE_int64(num, 1000000, "Number of key/values to place in database"); + +DEFINE_int64(numdistinct, 1000, + "Number of distinct keys to use. Used in RandomWithVerify to " + "read/write on fewer keys so that gets are more likely to find the" + " key and puts are more likely to update the same key"); + +DEFINE_int64(merge_keys, -1, + "Number of distinct keys to use for MergeRandom and " + "ReadRandomMergeRandom. " + "If negative, there will be FLAGS_num keys."); + +DEFINE_int64(reads, -1, "Number of read operations to do. 
" + "If negative, do FLAGS_num reads."); + +DEFINE_int64(read_range, 1, "When ==1 reads use ::Get, when >1 reads use" + " an iterator"); + +DEFINE_bool(use_prefix_blooms, false, "Whether to place prefixes in blooms"); + +DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); + +DEFINE_bool(use_prefix_api, false, "Whether to set ReadOptions.prefix for" + " prefixscanrandom. If true, use_prefix_blooms must also be true."); + +DEFINE_int64(seed, 0, "Seed base for random number generators. " + "When 0 it is deterministic."); + +DEFINE_int32(threads, 1, "Number of concurrent threads to run."); + +DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." + " When 0 then num & reads determine the test duration"); + +DEFINE_int32(value_size, 100, "Size of each value"); + + +// the maximum size of key in bytes +static const int kMaxKeySize = 128; +static bool ValidateKeySize(const char* flagname, int32_t value) { + if (value > kMaxKeySize) { + fprintf(stderr, "Invalid value for --%s: %d, must be < %d\n", + flagname, value, kMaxKeySize); + return false; + } + return true; +} +DEFINE_int32(key_size, 16, "size of each key"); + +DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink" + " to this fraction of their original size after compression"); + +DEFINE_bool(histogram, false, "Print histogram of operation timings"); + +DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size, + "Number of bytes to buffer in memtable before compacting"); + +DEFINE_int32(max_write_buffer_number, + rocksdb::Options().max_write_buffer_number, + "The number of in-memory memtables. Each memtable is of size" + "write_buffer_size."); + +DEFINE_int32(min_write_buffer_number_to_merge, + rocksdb::Options().min_write_buffer_number_to_merge, + "The minimum number of write buffers that will be merged together" + "before writing to storage. This is cheap because it is an" + "in-memory merge. 
If this feature is not enabled, then all these" + "write buffers are flushed to L0 as separate files and this " + "increases read amplification because a get request has to check" + " in all of these files. Also, an in-memory merge may result in" + " writing less data to storage if there are duplicate records " + " in each of these individual write buffers."); + +DEFINE_int32(max_background_compactions, + rocksdb::Options().max_background_compactions, + "The maximum number of concurrent background compactions" + " that can occur in parallel."); + +DEFINE_int32(max_background_flushes, + rocksdb::Options().max_background_flushes, + "The maximum number of concurrent background flushes" + " that can occur in parallel."); + +static rocksdb::CompactionStyle FLAGS_compaction_style_e; +DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style, + "style of compaction: level-based vs universal"); + +DEFINE_int32(universal_size_ratio, 0, + "Percentage flexibility while comparing file size" + " (for universal compaction only)."); + +DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a" + " single compaction run (for universal compaction only)."); + +DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact" + " in universal style compaction"); + +DEFINE_int32(universal_max_size_amplification_percent, 0, + "The max size amplification for universal style compaction"); + +DEFINE_int32(universal_compression_size_percent, -1, + "The percentage of the database to compress for universal " + "compaction. -1 means compress everything."); + +DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed" + "data. 
Negative means use default settings."); + +DEFINE_int32(block_size, rocksdb::Options().block_size, + "Number of bytes in a block."); + +DEFINE_int64(compressed_cache_size, -1, + "Number of bytes to use as a cache of compressed data."); + +DEFINE_int32(open_files, rocksdb::Options().max_open_files, + "Maximum number of files to keep open at the same time" + " (use default if == 0)"); + +DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means" + " use default settings."); +DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. " + "Negative means no bloom filter."); + +DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing" + " database. If you set this flag and also specify a benchmark that" + " wants a fresh database, that benchmark will fail."); + +DEFINE_string(db, "", "Use the db with the following name."); + +static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) { + if (value >= 20) { + fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(cache_numshardbits, -1, "Number of shards for the block cache" + " is 2 ** cache_numshardbits. Negative means use default settings." + " This is applied only if FLAGS_cache_size is non-negative."); + +DEFINE_int32(cache_remove_scan_count_limit, 32, ""); + +DEFINE_bool(verify_checksum, false, "Verify checksum for every block read" + " from storage"); + +DEFINE_bool(statistics, false, "Database statistics"); +static class std::shared_ptr dbstats; + +DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do" + " --num reads."); + +DEFINE_int32(writes_per_second, 0, "Per-thread rate limit on writes per second." + " No limit when <= 0. 
Only for the readwhilewriting test."); + +DEFINE_bool(sync, false, "Sync all writes to disk"); + +DEFINE_bool(disable_data_sync, false, "If true, do not wait until data is" + " synced to disk."); + +DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync"); + +DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); + +DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); + +DEFINE_bool(use_snapshot, false, "If true, create a snapshot per query when" + " randomread benchmark is used"); + +DEFINE_bool(get_approx, false, "If true, call GetApproximateSizes per query" + " when read_range is > 1 and randomread benchmark is used"); + +DEFINE_int32(num_levels, 7, "The total number of levels"); + +DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); + +DEFINE_int32(target_file_size_multiplier, 1, + "A multiplier to compute target level-N file size (N >= 2)"); + +DEFINE_uint64(max_bytes_for_level_base, 10 * 1048576, "Max bytes for level-1"); + +DEFINE_int32(max_bytes_for_level_multiplier, 10, + "A multiplier to compute max bytes for level-N (N >= 2)"); + +static std::vector FLAGS_max_bytes_for_level_multiplier_additional_v; +DEFINE_string(max_bytes_for_level_multiplier_additional, "", + "A vector that specifies additional fanout per level"); + +DEFINE_int32(level0_stop_writes_trigger, 12, "Number of files in level-0" + " that will trigger put stop."); + +DEFINE_int32(level0_slowdown_writes_trigger, 8, "Number of files in level-0" + " that will slow down writes."); + +DEFINE_int32(level0_file_num_compaction_trigger, 4, "Number of files in level-0" + " when compactions start"); + +static bool ValidateInt32Percent(const char* flagname, int32_t value) { + if (value <= 0 || value>=100) { + fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed" + " as 
percentage) for the ReadRandomWriteRandom workload. The " + "default value 90 means 90% operations out of all reads and writes" + " operations are reads. In other words, 9 gets for every 1 put."); + +DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed" + " as percentage) for the ReadRandomMergeRandom workload. The" + " default value 70 means 70% out of all read and merge operations" + " are merges. In other words, 7 merges for every 3 gets."); + +DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" + "deletes (used in RandomWithVerify only). RandomWithVerify " + "calculates writepercent as (100 - FLAGS_readwritepercent - " + "deletepercent), so deletepercent must be smaller than (100 - " + "FLAGS_readwritepercent)"); + +DEFINE_int32(disable_seek_compaction, false, "Option to disable compaction" + " triggered by read."); + +DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete " + "obsolete files periodically. 0 means that obsolete files are" + " deleted after every compaction run."); + +enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "none")) + return rocksdb::kNoCompression; + else if (!strcasecmp(ctype, "snappy")) + return rocksdb::kSnappyCompression; + else if (!strcasecmp(ctype, "zlib")) + return rocksdb::kZlibCompression; + else if (!strcasecmp(ctype, "bzip2")) + return rocksdb::kBZip2Compression; + else if (!strcasecmp(ctype, "lz4")) + return rocksdb::kLZ4Compression; + else if (!strcasecmp(ctype, "lz4hc")) + return rocksdb::kLZ4HCCompression; + + fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); + return rocksdb::kSnappyCompression; //default value +} +DEFINE_string(compression_type, "snappy", + "Algorithm to use to compress the database"); +static enum rocksdb::CompressionType FLAGS_compression_type_e = + rocksdb::kSnappyCompression; + +DEFINE_int32(compression_level, -1, + "Compression level. 
For zlib this should be -1 for the " + "default level, or between 0 and 9."); + +static bool ValidateCompressionLevel(const char* flagname, int32_t value) { + if (value < -1 || value > 9) { + fprintf(stderr, "Invalid value for --%s: %d, must be between -1 and 9\n", + flagname, value); + return false; + } + return true; +} + +static const bool FLAGS_compression_level_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_compression_level, + &ValidateCompressionLevel); + +DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" + " from this level. Levels with number < min_level_to_compress are" + " not compressed. Otherwise, apply compression_type to " + "all levels."); + +static bool ValidateTableCacheNumshardbits(const char* flagname, + int32_t value) { + if (0 >= value || value > 20) { + fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val <= 20\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(table_cache_numshardbits, 4, ""); + +DEFINE_string(hdfs, "", "Name of hdfs environment"); +// posix or hdfs environment +static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); + +DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " + "this is greater than zero. 
When 0 the interval grows over time."); + +DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" + " this is greater than 0."); + +DEFINE_int32(perf_level, 0, "Level of perf collection"); + +static bool ValidateRateLimit(const char* flagname, double value) { + static constexpr double EPSILON = 1e-10; + if ( value < -EPSILON ) { + fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n", + flagname, value); + return false; + } + return true; +} +DEFINE_double(soft_rate_limit, 0.0, ""); + +DEFINE_double(hard_rate_limit, 0.0, "When not equal to 0 this make threads " + "sleep at each stats reporting interval until the compaction" + " score for all levels is less than or equal to this value."); + +DEFINE_int32(rate_limit_delay_max_milliseconds, 1000, + "When hard_rate_limit is set then this is the max time a put will" + " be stalled."); + +DEFINE_int32(max_grandparent_overlap_factor, 10, "Control maximum bytes of " + "overlaps in grandparent (i.e., level+2) before we stop building a" + " single file in a level->level+1 compaction."); + +DEFINE_bool(readonly, false, "Run read only benchmarks."); + +DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions"); + +DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for" + " a compaction run that compacts Level-K with Level-(K+1) (for" + " K >= 1)"); + +DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds."); +DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files" + " in MB."); + +DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer, + "Allow buffered io using OS buffers"); + +DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads, + "Allow reads to occur via mmap-ing files"); + +DEFINE_bool(mmap_write, rocksdb::EnvOptions().use_mmap_writes, + "Allow writes to occur via mmap-ing files"); + +DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open, + "Advise random access on 
table file open"); + +DEFINE_string(compaction_fadvice, "NORMAL", + "Access pattern advice when a file is compacted"); +static auto FLAGS_compaction_fadvice_e = + rocksdb::Options().access_hint_on_compaction_start; + +DEFINE_bool(use_multiget, false, + "Use multiget to access a series of keys instead of get"); + +DEFINE_bool(use_tailing_iterator, false, + "Use tailing iterator to access a series of keys instead of get"); + +DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number" + " of keys to group per call Arbitrary default is good because it" + " agrees with readwritepercent"); + +// TODO: Apply this flag to generic Get calls too. Currently only with Multiget +DEFINE_bool(warn_missing_keys, true, "Print a message to user when a key is" + " missing in a Get/MultiGet call"); + +DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex, + "Use adaptive mutex"); + +DEFINE_uint64(bytes_per_sync, rocksdb::Options().bytes_per_sync, + "Allows OS to incrementally sync files to disk while they are" + " being written, in the background. Issue one request for every" + " bytes_per_sync written. 0 turns it off."); +DEFINE_bool(filter_deletes, false, " On true, deletes use bloom-filter and drop" + " the delete if key not present"); + +DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge" + " operations on a key in the memtable"); + +static bool ValidatePrefixSize(const char* flagname, int32_t value) { + if (value < 0 || value>=2000000000) { + fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and " + "plain table"); +DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated " + "per prefix, 0 means no special handling of the prefix, " + "i.e. 
use the prefix comes with the generated random number."); + +enum RepFactory { + kSkipList, + kPrefixHash, + kVectorRep, + kHashLinkedList +}; +enum RepFactory StringToRepFactory(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "skip_list")) + return kSkipList; + else if (!strcasecmp(ctype, "prefix_hash")) + return kPrefixHash; + else if (!strcasecmp(ctype, "vector")) + return kVectorRep; + else if (!strcasecmp(ctype, "hash_linkedlist")) + return kHashLinkedList; + + fprintf(stdout, "Cannot parse memreptable %s\n", ctype); + return kSkipList; +} +static enum RepFactory FLAGS_rep_factory; +DEFINE_string(memtablerep, "skip_list", ""); +DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count"); +DEFINE_bool(use_plain_table, false, "if use plain table " + "instead of block-based table format"); + +DEFINE_string(merge_operator, "", "The merge operator to use with the database." + "If a new merge operator is specified, be sure to use fresh" + " database The possible merge operators are defined in" + " utilities/merge_operators.h"); + +static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_soft_rate_limit, + &ValidateRateLimit); + +static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit); + +static const bool FLAGS_prefix_size_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); + +static const bool FLAGS_key_size_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize); + +static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_cache_numshardbits, + &ValidateCacheNumshardbits); + +static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_readwritepercent, + &ValidateInt32Percent); + +static const 
bool FLAGS_deletepercent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_deletepercent, + &ValidateInt32Percent); +static const bool + FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_table_cache_numshardbits, + &ValidateTableCacheNumshardbits); + +namespace rocksdb { + +// Helper for quickly generating random data. +class RandomGenerator { + private: + std::string data_; + unsigned int pos_; + + public: + RandomGenerator() { + // We use a limited amount of data over and over again and ensure + // that it is larger than the compression window (32KB), and also + // large enough to serve all typical value sizes we want to write. + Random rnd(301); + std::string piece; + while (data_.size() < (unsigned)std::max(1048576, FLAGS_value_size)) { + // Add a short fragment that is as compressible as specified + // by FLAGS_compression_ratio. + test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); + data_.append(piece); + } + pos_ = 0; + } + + Slice Generate(unsigned int len) { + if (pos_ + len > data_.size()) { + pos_ = 0; + assert(len < data_.size()); + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +static void AppendWithSpace(std::string* str, Slice msg) { + if (msg.empty()) return; + if (!str->empty()) { + str->push_back(' '); + } + str->append(msg.data(), msg.size()); +} + +class Stats { + private: + int id_; + double start_; + double finish_; + double seconds_; + int64_t done_; + int64_t last_report_done_; + int64_t next_report_; + int64_t bytes_; + double last_op_finish_; + double last_report_finish_; + HistogramImpl hist_; + std::string message_; + bool exclude_from_merge_; + + public: + Stats() { Start(-1); } + + void Start(int id) { + id_ = id; + next_report_ = FLAGS_stats_interval ? 
FLAGS_stats_interval : 100; + last_op_finish_ = start_; + hist_.Clear(); + done_ = 0; + last_report_done_ = 0; + bytes_ = 0; + seconds_ = 0; + start_ = FLAGS_env->NowMicros(); + finish_ = start_; + last_report_finish_ = start_; + message_.clear(); + // When set, stats from this thread won't be merged with others. + exclude_from_merge_ = false; + } + + void Merge(const Stats& other) { + if (other.exclude_from_merge_) + return; + + hist_.Merge(other.hist_); + done_ += other.done_; + bytes_ += other.bytes_; + seconds_ += other.seconds_; + if (other.start_ < start_) start_ = other.start_; + if (other.finish_ > finish_) finish_ = other.finish_; + + // Just keep the messages from one thread + if (message_.empty()) message_ = other.message_; + } + + void Stop() { + finish_ = FLAGS_env->NowMicros(); + seconds_ = (finish_ - start_) * 1e-6; + } + + void AddMessage(Slice msg) { + AppendWithSpace(&message_, msg); + } + + void SetId(int id) { id_ = id; } + void SetExcludeFromMerge() { exclude_from_merge_ = true; } + + void FinishedSingleOp(DB* db) { + if (FLAGS_histogram) { + double now = FLAGS_env->NowMicros(); + double micros = now - last_op_finish_; + hist_.Add(micros); + if (micros > 20000 && !FLAGS_stats_interval) { + fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); + fflush(stderr); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (!FLAGS_stats_interval) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, ""); + fflush(stderr); + } else { + double now = FLAGS_env->NowMicros(); + fprintf(stderr, + "%s ... 
thread %d: (%" PRIu64 ",%" PRIu64 ") ops and " + "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(), + id_, + done_ - last_report_done_, done_, + (done_ - last_report_done_) / + ((now - last_report_finish_) / 1000000.0), + done_ / ((now - start_) / 1000000.0), + (now - last_report_finish_) / 1000000.0, + (now - start_) / 1000000.0); + + if (FLAGS_stats_per_interval) { + std::string stats; + if (db && db->GetProperty("rocksdb.stats", &stats)) + fprintf(stderr, "%s\n", stats.c_str()); + } + + fflush(stderr); + next_report_ += FLAGS_stats_interval; + last_report_finish_ = now; + last_report_done_ = done_; + } + } + } + + void AddBytes(int64_t n) { + bytes_ += n; + } + + void Report(const Slice& name) { + // Pretend at least one op was done in case we are running a benchmark + // that does not call FinishedSingleOp(). + if (done_ < 1) done_ = 1; + + std::string extra; + if (bytes_ > 0) { + // Rate is computed on actual elapsed time, not the sum of per-thread + // elapsed times. + double elapsed = (finish_ - start_) * 1e-6; + char rate[100]; + snprintf(rate, sizeof(rate), "%6.1f MB/s", + (bytes_ / 1048576.0) / elapsed); + extra = rate; + } + AppendWithSpace(&extra, message_); + double elapsed = (finish_ - start_) * 1e-6; + double throughput = (double)done_/elapsed; + + fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n", + name.ToString().c_str(), + elapsed * 1e6 / done_, + (long)throughput, + (extra.empty() ? "" : " "), + extra.c_str()); + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } +}; + +// State shared by all concurrent executions of the same benchmark. 
+struct SharedState { + port::Mutex mu; + port::CondVar cv; + int total; + int perf_level; + + // Each thread goes through the following states: + // (1) initializing + // (2) waiting for others to be initialized + // (3) running + // (4) done + + long num_initialized; + long num_done; + bool start; + + SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + int tid; // 0..n-1 when running in n threads + Random64 rand; // Has different seeds for different threads + Stats stats; + SharedState* shared; + + /* implicit */ ThreadState(int index) + : tid(index), + rand((FLAGS_seed ? FLAGS_seed : 1000) + index) { + } +}; + +class Duration { + public: + Duration(int max_seconds, int64_t max_ops) { + max_seconds_ = max_seconds; + max_ops_= max_ops; + ops_ = 0; + start_at_ = FLAGS_env->NowMicros(); + } + + bool Done(int increment) { + if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops + ops_ += increment; + + if (max_seconds_) { + // Recheck every appx 1000 ops (exact iff increment is factor of 1000) + if ((ops_/1000) != ((ops_-increment)/1000)) { + double now = FLAGS_env->NowMicros(); + return ((now - start_at_) / 1000000.0) >= max_seconds_; + } else { + return false; + } + } else { + return ops_ > max_ops_; + } + } + + private: + int max_seconds_; + int64_t max_ops_; + int64_t ops_; + double start_at_; +}; + +class Benchmark { + private: + shared_ptr cache_; + shared_ptr compressed_cache_; + const FilterPolicy* filter_policy_; + const SliceTransform* prefix_extractor_; + DB* db_; + int64_t num_; + int value_size_; + int key_size_; + int prefix_size_; + int64_t keys_per_prefix_; + int entries_per_batch_; + WriteOptions write_options_; + int64_t reads_; + int64_t writes_; + int64_t readwrites_; + int64_t merge_keys_; + int heap_counter_; + void PrintHeader() { + PrintEnvironment(); + fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size); + 
fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + FLAGS_value_size, + static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); + fprintf(stdout, "Entries: %" PRIu64 "\n", num_); + fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size); + fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_); + fprintf(stdout, "RawSize: %.1f MB (estimated)\n", + ((static_cast(FLAGS_key_size + FLAGS_value_size) * num_) + / 1048576.0)); + fprintf(stdout, "FileSize: %.1f MB (estimated)\n", + (((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio) + * num_) + / 1048576.0)); + fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second); + switch (FLAGS_compression_type_e) { + case rocksdb::kNoCompression: + fprintf(stdout, "Compression: none\n"); + break; + case rocksdb::kSnappyCompression: + fprintf(stdout, "Compression: snappy\n"); + break; + case rocksdb::kZlibCompression: + fprintf(stdout, "Compression: zlib\n"); + break; + case rocksdb::kBZip2Compression: + fprintf(stdout, "Compression: bzip2\n"); + break; + case rocksdb::kLZ4Compression: + fprintf(stdout, "Compression: lz4\n"); + break; + case rocksdb::kLZ4HCCompression: + fprintf(stdout, "Compression: lz4hc\n"); + break; + } + + switch (FLAGS_rep_factory) { + case kPrefixHash: + fprintf(stdout, "Memtablerep: prefix_hash\n"); + break; + case kSkipList: + fprintf(stdout, "Memtablerep: skip_list\n"); + break; + case kVectorRep: + fprintf(stdout, "Memtablerep: vector\n"); + break; + case kHashLinkedList: + fprintf(stdout, "Memtablerep: hash_linkedlist\n"); + break; + } + fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level); + + PrintWarnings(); + fprintf(stdout, "------------------------------------------------\n"); + } + + void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" + ); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; 
benchmarks unnecessarily slow\n"); +#endif + if (FLAGS_compression_type_e != rocksdb::kNoCompression) { + // The test string should not be too small. + const int len = FLAGS_block_size; + char* text = (char*) malloc(len+1); + bool result = true; + const char* name = nullptr; + std::string compressed; + + memset(text, (int) 'y', len); + text[len] = '\0'; + switch (FLAGS_compression_type_e) { + case kSnappyCompression: + result = port::Snappy_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "Snappy"; + break; + case kZlibCompression: + result = port::Zlib_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "Zlib"; + break; + case kBZip2Compression: + result = port::BZip2_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "BZip2"; + break; + case kLZ4Compression: + result = port::LZ4_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "LZ4"; + break; + case kLZ4HCCompression: + result = port::LZ4HC_Compress(Options().compression_opts, text, + strlen(text), &compressed); + name = "LZ4HC"; + break; + case kNoCompression: + assert(false); // cannot happen + break; + } + + if (!result) { + fprintf(stdout, "WARNING: %s compression is not enabled\n", name); + } else if (name && compressed.size() >= strlen(text)) { + fprintf(stdout, "WARNING: %s compression is not effective\n", name); + } + + free(text); + } + } + +// Current the following isn't equivalent to OS_LINUX. 
+#if defined(__linux) + static Slice TrimSpace(Slice s) { + unsigned int start = 0; + while (start < s.size() && isspace(s[start])) { + start++; + } + unsigned int limit = s.size(); + while (limit > start && isspace(s[limit-1])) { + limit--; + } + return Slice(s.data() + start, limit - start); + } +#endif + + void PrintEnvironment() { + fprintf(stderr, "LevelDB: version %d.%d\n", + kMajorVersion, kMinorVersion); + +#if defined(__linux) + time_t now = time(nullptr); + fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline + + FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo != nullptr) { + char line[1000]; + int num_cpus = 0; + std::string cpu_type; + std::string cache_size; + while (fgets(line, sizeof(line), cpuinfo) != nullptr) { + const char* sep = strchr(line, ':'); + if (sep == nullptr) { + continue; + } + Slice key = TrimSpace(Slice(line, sep - 1 - line)); + Slice val = TrimSpace(Slice(sep + 1)); + if (key == "model name") { + ++num_cpus; + cpu_type = val.ToString(); + } else if (key == "cache size") { + cache_size = val.ToString(); + } + } + fclose(cpuinfo); + fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); + fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); + } +#endif + } + + public: + Benchmark() + : cache_(FLAGS_cache_size >= 0 ? + (FLAGS_cache_numshardbits >= 1 ? + NewLRUCache(FLAGS_cache_size, FLAGS_cache_numshardbits, + FLAGS_cache_remove_scan_count_limit) : + NewLRUCache(FLAGS_cache_size)) : nullptr), + compressed_cache_(FLAGS_compressed_cache_size >= 0 ? + (FLAGS_cache_numshardbits >= 1 ? + NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) : + NewLRUCache(FLAGS_compressed_cache_size)) : nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 + ? 
NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), + prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)), + db_(nullptr), + num_(FLAGS_num), + value_size_(FLAGS_value_size), + key_size_(FLAGS_key_size), + prefix_size_(FLAGS_prefix_size), + keys_per_prefix_(FLAGS_keys_per_prefix), + entries_per_batch_(1), + reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), + writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes), + readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : + ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) + ), + merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys), + heap_counter_(0) { + if (FLAGS_prefix_size > FLAGS_key_size) { + fprintf(stderr, "prefix size is larger than key size"); + exit(1); + } + + std::vector files; + FLAGS_env->GetChildren(FLAGS_db, &files); + for (unsigned int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("heap-")) { + FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]); + } + } + if (!FLAGS_use_existing_db) { + DestroyDB(FLAGS_db, Options()); + } + } + + ~Benchmark() { + delete db_; + delete filter_policy_; + delete prefix_extractor_; + } + + // Generate key according to the given specification and random number. + // The resulting key will have the following format (if keys_per_prefix_ + // is positive), extra trailing bytes are either cut off or paddd with '0'. + // The prefix value is derived from key value. 
+ // ---------------------------- + // | prefix 00000 | key 00000 | + // ---------------------------- + // If keys_per_prefix_ is 0, the key is simply a binary representation of + // random number followed by trailing '0's + // ---------------------------- + // | key 00000 | + // ---------------------------- + std::string GenerateKeyFromInt(uint64_t v, int64_t num_keys) { + std::string key; + key.resize(key_size_); + char* start = &(key[0]); + char* pos = start; + if (keys_per_prefix_ > 0) { + int64_t num_prefix = num_keys / keys_per_prefix_; + int64_t prefix = v % num_prefix; + int bytes_to_fill = std::min(prefix_size_, 8); + if (port::kLittleEndian) { + for (int i = 0; i < bytes_to_fill; ++i) { + pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF; + } + } else { + memcpy(pos, static_cast(&prefix), bytes_to_fill); + } + if (prefix_size_ > 8) { + // fill the rest with 0s + memset(pos + 8, '0', prefix_size_ - 8); + } + pos += prefix_size_; + } + + int bytes_to_fill = std::min(key_size_ - static_cast(pos - start), 8); + if (port::kLittleEndian) { + for (int i = 0; i < bytes_to_fill; ++i) { + pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF; + } + } else { + memcpy(pos, static_cast(&v), bytes_to_fill); + } + pos += bytes_to_fill; + if (key_size_ > pos - start) { + memset(pos, '0', key_size_ - (pos - start)); + } + + return key; + } + + void Run() { + PrintHeader(); + Open(); + const char* benchmarks = FLAGS_benchmarks.c_str(); + while (benchmarks != nullptr) { + const char* sep = strchr(benchmarks, ','); + Slice name; + if (sep == nullptr) { + name = benchmarks; + benchmarks = nullptr; + } else { + name = Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + + // Sanitize parameters + num_ = FLAGS_num; + reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads); + writes_ = (FLAGS_writes < 0 ? 
FLAGS_num : FLAGS_writes); + value_size_ = FLAGS_value_size; + key_size_ = FLAGS_key_size; + entries_per_batch_ = 1; + write_options_ = WriteOptions(); + if (FLAGS_sync) { + write_options_.sync = true; + } + write_options_.disableWAL = FLAGS_disable_wal; + + void (Benchmark::*method)(ThreadState*) = nullptr; + bool fresh_db = false; + int num_threads = FLAGS_threads; + + if (name == Slice("fillseq")) { + fresh_db = true; + method = &Benchmark::WriteSeq; + } else if (name == Slice("fillbatch")) { + fresh_db = true; + entries_per_batch_ = 1000; + method = &Benchmark::WriteSeq; + } else if (name == Slice("fillrandom")) { + fresh_db = true; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fillfromstdin")) { + fresh_db = true; + method = &Benchmark::WriteFromStdin; + } else if (name == Slice("filluniquerandom")) { + fresh_db = true; + if (num_threads > 1) { + fprintf(stderr, "filluniquerandom multithreaded not supported" + ", use 1 thread"); + num_threads = 1; + } + method = &Benchmark::WriteUniqueRandom; + } else if (name == Slice("overwrite")) { + fresh_db = false; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fillsync")) { + fresh_db = true; + num_ /= 1000; + write_options_.sync = true; + method = &Benchmark::WriteRandom; + } else if (name == Slice("fill100K")) { + fresh_db = true; + num_ /= 1000; + value_size_ = 100 * 1000; + method = &Benchmark::WriteRandom; + } else if (name == Slice("readseq")) { + method = &Benchmark::ReadSequential; + } else if (name == Slice("readtocache")) { + method = &Benchmark::ReadSequential; + num_threads = 1; + reads_ = num_; + } else if (name == Slice("readreverse")) { + method = &Benchmark::ReadReverse; + } else if (name == Slice("readrandom")) { + method = &Benchmark::ReadRandom; + } else if (name == Slice("readmissing")) { + method = &Benchmark::ReadMissing; + } else if (name == Slice("newiterator")) { + method = &Benchmark::IteratorCreation; + } else if (name == Slice("seekrandom")) { + method = 
&Benchmark::SeekRandom; + } else if (name == Slice("readhot")) { + method = &Benchmark::ReadHot; + } else if (name == Slice("readrandomsmall")) { + reads_ /= 1000; + method = &Benchmark::ReadRandom; + } else if (name == Slice("prefixscanrandom")) { + method = &Benchmark::PrefixScanRandom; + } else if (name == Slice("deleteseq")) { + method = &Benchmark::DeleteSeq; + } else if (name == Slice("deleterandom")) { + method = &Benchmark::DeleteRandom; + } else if (name == Slice("readwhilewriting")) { + num_threads++; // Add extra thread for writing + method = &Benchmark::ReadWhileWriting; + } else if (name == Slice("readrandomwriterandom")) { + method = &Benchmark::ReadRandomWriteRandom; + } else if (name == Slice("readrandommergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + method = nullptr; + } else { + method = &Benchmark::ReadRandomMergeRandom; + } + } else if (name == Slice("updaterandom")) { + method = &Benchmark::UpdateRandom; + } else if (name == Slice("appendrandom")) { + method = &Benchmark::AppendRandom; + } else if (name == Slice("mergerandom")) { + if (FLAGS_merge_operator.empty()) { + fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", + name.ToString().c_str()); + method = nullptr; + } else { + method = &Benchmark::MergeRandom; + } + } else if (name == Slice("randomwithverify")) { + method = &Benchmark::RandomWithVerify; + } else if (name == Slice("compact")) { + method = &Benchmark::Compact; + } else if (name == Slice("crc32c")) { + method = &Benchmark::Crc32c; + } else if (name == Slice("acquireload")) { + method = &Benchmark::AcquireLoad; + } else if (name == Slice("compress")) { + method = &Benchmark::Compress; + } else if (name == Slice("uncompress")) { + method = &Benchmark::Uncompress; + } else if (name == Slice("heapprofile")) { + HeapProfile(); + } else if (name == Slice("stats")) { + PrintStats("rocksdb.stats"); + } else if (name 
== Slice("levelstats")) { + PrintStats("rocksdb.levelstats"); + } else if (name == Slice("sstables")) { + PrintStats("rocksdb.sstables"); + } else { + if (name != Slice()) { // No error message for empty name + fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + } + } + + if (fresh_db) { + if (FLAGS_use_existing_db) { + fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", + name.ToString().c_str()); + method = nullptr; + } else { + delete db_; + db_ = nullptr; + DestroyDB(FLAGS_db, Options()); + Open(); + } + } + + if (method != nullptr) { + fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); + RunBenchmark(num_threads, name, method); + } + } + if (FLAGS_statistics) { + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); + } + } + + private: + struct ThreadArg { + Benchmark* bm; + SharedState* shared; + ThreadState* thread; + void (Benchmark::*method)(ThreadState*); + }; + + static void ThreadBody(void* v) { + ThreadArg* arg = reinterpret_cast(v); + SharedState* shared = arg->shared; + ThreadState* thread = arg->thread; + { + MutexLock l(&shared->mu); + shared->num_initialized++; + if (shared->num_initialized >= shared->total) { + shared->cv.SignalAll(); + } + while (!shared->start) { + shared->cv.Wait(); + } + } + + SetPerfLevel(static_cast (shared->perf_level)); + thread->stats.Start(thread->tid); + (arg->bm->*(arg->method))(thread); + thread->stats.Stop(); + + { + MutexLock l(&shared->mu); + shared->num_done++; + if (shared->num_done >= shared->total) { + shared->cv.SignalAll(); + } + } + } + + void RunBenchmark(int n, Slice name, + void (Benchmark::*method)(ThreadState*)) { + SharedState shared; + shared.total = n; + shared.num_initialized = 0; + shared.num_done = 0; + shared.start = false; + + ThreadArg* arg = new ThreadArg[n]; + for (int i = 0; i < n; i++) { + arg[i].bm = this; + arg[i].method = method; + arg[i].shared = &shared; + arg[i].thread = new ThreadState(i); + arg[i].thread->shared = &shared; + 
      // Launch the worker; ThreadBody blocks on shared.cv until start is set.
      FLAGS_env->StartThread(ThreadBody, &arg[i]);
    }

    // Rendezvous: wait for every worker to check in before releasing them
    // all at once, so they start the measured region together.
    shared.mu.Lock();
    while (shared.num_initialized < n) {
      shared.cv.Wait();
    }

    shared.start = true;
    shared.cv.SignalAll();
    while (shared.num_done < n) {
      shared.cv.Wait();
    }
    shared.mu.Unlock();

    // Stats for some threads can be excluded.
    Stats merge_stats;
    for (int i = 0; i < n; i++) {
      merge_stats.Merge(arg[i].thread->stats);
    }
    merge_stats.Report(name);

    for (int i = 0; i < n; i++) {
      delete arg[i].thread;
    }
    delete[] arg;
  }

  // Micro-benchmark: CRC32C over ~500MB of data in 4KB chunks; no DB access.
  void Crc32c(ThreadState* thread) {
    // Checksum about 500MB of data total
    const int size = 4096;
    const char* label = "(4K per op)";
    std::string data(size, 'x');
    int64_t bytes = 0;
    uint32_t crc = 0;
    while (bytes < 500 * 1048576) {
      crc = crc32c::Value(data.data(), size);
      thread->stats.FinishedSingleOp(nullptr);
      bytes += size;
    }
    // Print so result is not dead
    fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }

  // Micro-benchmark: cost of an acquire-load; each op is 1000 loads.
  void AcquireLoad(ThreadState* thread) {
    int dummy;
    port::AtomicPointer ap(&dummy);
    int count = 0;
    void *ptr = nullptr;
    thread->stats.AddMessage("(each op is 1000 loads)");
    while (count < 100000) {
      for (int i = 0; i < 1000; i++) {
        ptr = ap.Acquire_Load();
      }
      count++;
      thread->stats.FinishedSingleOp(nullptr);
    }
    if (ptr == nullptr) exit(1); // Disable unused variable warning.
+ } + + void Compress(ThreadState *thread) { + RandomGenerator gen; + Slice input = gen.Generate(Options().block_size); + int64_t bytes = 0; + int64_t produced = 0; + bool ok = true; + std::string compressed; + + // Compress 1G + while (ok && bytes < int64_t(1) << 30) { + switch (FLAGS_compression_type_e) { + case rocksdb::kSnappyCompression: + ok = port::Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kZlibCompression: + ok = port::Zlib_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kBZip2Compression: + ok = port::BZip2_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kLZ4Compression: + ok = port::LZ4_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kLZ4HCCompression: + ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + default: + ok = false; + } + produced += compressed.size(); + bytes += input.size(); + thread->stats.FinishedSingleOp(nullptr); + } + + if (!ok) { + thread->stats.AddMessage("(compression failure)"); + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "(output: %.1f%%)", + (produced * 100.0) / bytes); + thread->stats.AddMessage(buf); + thread->stats.AddBytes(bytes); + } + } + + void Uncompress(ThreadState *thread) { + RandomGenerator gen; + Slice input = gen.Generate(Options().block_size); + std::string compressed; + + bool ok; + switch (FLAGS_compression_type_e) { + case rocksdb::kSnappyCompression: + ok = port::Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kZlibCompression: + ok = port::Zlib_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kBZip2Compression: + ok = port::BZip2_Compress(Options().compression_opts, input.data(), + 
input.size(), &compressed); + break; + case rocksdb::kLZ4Compression: + ok = port::LZ4_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + case rocksdb::kLZ4HCCompression: + ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); + break; + default: + ok = false; + } + + int64_t bytes = 0; + int decompress_size; + while (ok && bytes < 1024 * 1048576) { + char *uncompressed = nullptr; + switch (FLAGS_compression_type_e) { + case rocksdb::kSnappyCompression: + // allocate here to make comparison fair + uncompressed = new char[input.size()]; + ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), + uncompressed); + break; + case rocksdb::kZlibCompression: + uncompressed = port::Zlib_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + case rocksdb::kBZip2Compression: + uncompressed = port::BZip2_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + case rocksdb::kLZ4Compression: + uncompressed = port::LZ4_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + case rocksdb::kLZ4HCCompression: + uncompressed = port::LZ4_Uncompress( + compressed.data(), compressed.size(), &decompress_size); + ok = uncompressed != nullptr; + break; + default: + ok = false; + } + delete[] uncompressed; + bytes += input.size(); + thread->stats.FinishedSingleOp(nullptr); + } + + if (!ok) { + thread->stats.AddMessage("(compression failure)"); + } else { + thread->stats.AddBytes(bytes); + } + } + + void Open() { + assert(db_ == nullptr); + Options options; + options.create_if_missing = !FLAGS_use_existing_db; + options.block_cache = cache_; + options.block_cache_compressed = compressed_cache_; + if (cache_ == nullptr) { + options.no_block_cache = true; + } + options.write_buffer_size = FLAGS_write_buffer_size; + 
options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + options.max_background_compactions = FLAGS_max_background_compactions; + options.max_background_flushes = FLAGS_max_background_flushes; + options.compaction_style = FLAGS_compaction_style_e; + options.block_size = FLAGS_block_size; + options.filter_policy = filter_policy_; + if (FLAGS_use_plain_table || FLAGS_use_prefix_blooms) { + options.prefix_extractor.reset( + NewFixedPrefixTransform(FLAGS_prefix_size)); + } + options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits; + options.bloom_locality = FLAGS_bloom_locality; + options.max_open_files = FLAGS_open_files; + options.statistics = dbstats; + options.env = FLAGS_env; + options.disableDataSync = FLAGS_disable_data_sync; + options.use_fsync = FLAGS_use_fsync; + options.wal_dir = FLAGS_wal_dir; + options.num_levels = FLAGS_num_levels; + options.target_file_size_base = FLAGS_target_file_size_base; + options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; + options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; + options.max_bytes_for_level_multiplier = + FLAGS_max_bytes_for_level_multiplier; + options.filter_deletes = FLAGS_filter_deletes; + if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash || + FLAGS_rep_factory == kHashLinkedList)) { + fprintf(stderr, "prefix_size should be non-zero if PrefixHash or " + "HashLinkedList memtablerep is used\n"); + exit(1); + } + switch (FLAGS_rep_factory) { + case kPrefixHash: + options.memtable_factory.reset(NewHashSkipListRepFactory( + FLAGS_hash_bucket_count)); + break; + case kSkipList: + // no need to do anything + break; + case kHashLinkedList: + options.memtable_factory.reset(NewHashLinkListRepFactory( + FLAGS_hash_bucket_count)); + break; + case kVectorRep: + options.memtable_factory.reset( + new VectorRepFactory + ); + break; + } + if (FLAGS_use_plain_table) { + if 
(FLAGS_rep_factory != kPrefixHash && + FLAGS_rep_factory != kHashLinkedList) { + fprintf(stderr, "Waring: plain table is used with skipList\n"); + } + if (!FLAGS_mmap_read && !FLAGS_mmap_write) { + fprintf(stderr, "plain table format requires mmap to operate\n"); + exit(1); + } + + int bloom_bits_per_key = FLAGS_bloom_bits; + if (bloom_bits_per_key < 0) { + bloom_bits_per_key = 0; + } + options.table_factory = std::shared_ptr( + NewPlainTableFactory(FLAGS_key_size, bloom_bits_per_key, 0.75)); + } + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { + if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() != + (unsigned int)FLAGS_num_levels) { + fprintf(stderr, "Insufficient number of fanouts specified %d\n", + (int)FLAGS_max_bytes_for_level_multiplier_additional_v.size()); + exit(1); + } + options.max_bytes_for_level_multiplier_additional = + FLAGS_max_bytes_for_level_multiplier_additional_v; + } + options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; + options.level0_file_num_compaction_trigger = + FLAGS_level0_file_num_compaction_trigger; + options.level0_slowdown_writes_trigger = + FLAGS_level0_slowdown_writes_trigger; + options.compression = FLAGS_compression_type_e; + options.compression_opts.level = FLAGS_compression_level; + options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + if (FLAGS_min_level_to_compress >= 0) { + assert(FLAGS_min_level_to_compress <= FLAGS_num_levels); + options.compression_per_level.resize(FLAGS_num_levels); + for (int i = 0; i < FLAGS_min_level_to_compress; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = FLAGS_min_level_to_compress; + i < FLAGS_num_levels; i++) { + options.compression_per_level[i] = FLAGS_compression_type_e; + } + } + options.disable_seek_compaction = FLAGS_disable_seek_compaction; + options.delete_obsolete_files_period_micros = + FLAGS_delete_obsolete_files_period_micros; + 
options.soft_rate_limit = FLAGS_soft_rate_limit; + options.hard_rate_limit = FLAGS_hard_rate_limit; + options.rate_limit_delay_max_milliseconds = + FLAGS_rate_limit_delay_max_milliseconds; + options.table_cache_numshardbits = FLAGS_table_cache_numshardbits; + options.max_grandparent_overlap_factor = + FLAGS_max_grandparent_overlap_factor; + options.disable_auto_compactions = FLAGS_disable_auto_compactions; + options.source_compaction_factor = FLAGS_source_compaction_factor; + + // fill storage options + options.allow_os_buffer = FLAGS_bufferedio; + options.allow_mmap_reads = FLAGS_mmap_read; + options.allow_mmap_writes = FLAGS_mmap_write; + options.advise_random_on_open = FLAGS_advise_random_on_open; + options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e; + options.use_adaptive_mutex = FLAGS_use_adaptive_mutex; + options.bytes_per_sync = FLAGS_bytes_per_sync; + + // merge operator options + options.merge_operator = MergeOperators::CreateFromStringId( + FLAGS_merge_operator); + if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) { + fprintf(stderr, "invalid merge operator: %s\n", + FLAGS_merge_operator.c_str()); + exit(1); + } + options.max_successive_merges = FLAGS_max_successive_merges; + + // set universal style compaction configurations, if applicable + if (FLAGS_universal_size_ratio != 0) { + options.compaction_options_universal.size_ratio = + FLAGS_universal_size_ratio; + } + if (FLAGS_universal_min_merge_width != 0) { + options.compaction_options_universal.min_merge_width = + FLAGS_universal_min_merge_width; + } + if (FLAGS_universal_max_merge_width != 0) { + options.compaction_options_universal.max_merge_width = + FLAGS_universal_max_merge_width; + } + if (FLAGS_universal_max_size_amplification_percent != 0) { + options.compaction_options_universal.max_size_amplification_percent = + FLAGS_universal_max_size_amplification_percent; + } + if (FLAGS_universal_compression_size_percent != -1) { + 
options.compaction_options_universal.compression_size_percent = + FLAGS_universal_compression_size_percent; + } + + Status s; + if(FLAGS_readonly) { + s = DB::OpenForReadOnly(options, FLAGS_db, &db_); + } else { + s = DB::Open(options, FLAGS_db, &db_); + } + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + if (FLAGS_min_level_to_compress >= 0) { + options.compression_per_level.clear(); + } + } + + enum WriteMode { + RANDOM, SEQUENTIAL, UNIQUE_RANDOM + }; + + void WriteSeq(ThreadState* thread) { + DoWrite(thread, SEQUENTIAL); + } + + void WriteRandom(ThreadState* thread) { + DoWrite(thread, RANDOM); + } + + void WriteUniqueRandom(ThreadState* thread) { + DoWrite(thread, UNIQUE_RANDOM); + } + + void writeOrFail(WriteBatch& batch) { + Status s = db_->Write(write_options_, &batch); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void WriteFromStdin(ThreadState* thread) { + size_t count = 0; + WriteBatch batch; + const size_t bufferLen = 32 << 20; + unique_ptr line = unique_ptr(new char[bufferLen]); + char* linep = line.get(); + const int batchSize = 100 << 10; + const char columnSeparator = '\t'; + const char lineSeparator = '\n'; + + while (fgets(linep, bufferLen, stdin) != nullptr) { + ++count; + char* tab = std::find(linep, linep + bufferLen, columnSeparator); + if (tab == linep + bufferLen) { + fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count); + continue; + } + Slice key(linep, tab - linep); + tab++; + char* endLine = std::find(tab, linep + bufferLen, lineSeparator); + if (endLine == linep + bufferLen) { + fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count); + continue; + } + Slice value(tab, endLine - tab); + thread->stats.FinishedSingleOp(db_); + thread->stats.AddBytes(endLine - linep - 1); + + if (batch.Count() < batchSize) { + batch.Put(key, value); + continue; + } + writeOrFail(batch); + batch.Clear(); + } + if (batch.Count() > 0) { 
+ writeOrFail(batch); + } + } + + void DoWrite(ThreadState* thread, WriteMode write_mode) { + const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0; + const int64_t num_ops = writes_ == 0 ? num_ : writes_; + Duration duration(test_duration, num_ops); + unique_ptr bit_set; + + if (write_mode == UNIQUE_RANDOM) { + bit_set.reset(new BitSet(num_ops)); + } + + if (num_ != FLAGS_num) { + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_); + thread->stats.AddMessage(msg); + } + + RandomGenerator gen; + WriteBatch batch; + Status s; + int64_t bytes = 0; + int i = 0; + while (!duration.Done(entries_per_batch_)) { + batch.Clear(); + for (int j = 0; j < entries_per_batch_; j++) { + int64_t k = 0; + switch(write_mode) { + case SEQUENTIAL: + k = i +j; + break; + case RANDOM: + k = thread->rand.Next() % FLAGS_num; + break; + case UNIQUE_RANDOM: + { + const int64_t t = thread->rand.Next() % FLAGS_num; + if (!bit_set->test(t)) { + // best case + k = t; + } else { + bool found = false; + // look forward + for (size_t i = t + 1; i < bit_set->size(); ++i) { + if (!bit_set->test(i)) { + found = true; + k = i; + break; + } + } + if (!found) { + for (size_t i = t; i-- > 0;) { + if (!bit_set->test(i)) { + found = true; + k = i; + break; + } + } + } + } + bit_set->set(k); + break; + } + }; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + batch.Put(key, gen.Generate(value_size_)); + bytes += value_size_ + key.size(); + thread->stats.FinishedSingleOp(db_); + } + s = db_->Write(write_options_, &batch); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + i += entries_per_batch_; + } + thread->stats.AddBytes(bytes); + } + + void ReadSequential(ThreadState* thread) { + Iterator* iter = db_->NewIterator(ReadOptions(FLAGS_verify_checksum, true)); + int64_t i = 0; + int64_t bytes = 0; + for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { + bytes += iter->key().size() + iter->value().size(); + 
thread->stats.FinishedSingleOp(db_); + ++i; + } + delete iter; + thread->stats.AddBytes(bytes); + } + + void ReadReverse(ThreadState* thread) { + Iterator* iter = db_->NewIterator(ReadOptions(FLAGS_verify_checksum, true)); + int64_t i = 0; + int64_t bytes = 0; + for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { + bytes += iter->key().size() + iter->value().size(); + thread->stats.FinishedSingleOp(db_); + ++i; + } + delete iter; + thread->stats.AddBytes(bytes); + } + + // Calls MultiGet over a list of keys from a random distribution. + // Returns the total number of keys found. + long MultiGetRandom(ReadOptions& options, int num_keys, + Random64* rand, int64_t range, const char* suffix) { + assert(num_keys > 0); + std::vector keys(num_keys); + std::vector values(num_keys); + std::vector gen_keys(num_keys); + + int i; + int64_t k; + + // Fill the keys vector + for(i=0; iNext() % range; + gen_keys[i] = GenerateKeyFromInt(k, range) + suffix; + keys[i] = gen_keys[i]; + } + + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + // Apply the operation + std::vector statuses = db_->MultiGet(options, keys, &values); + assert((long)statuses.size() == num_keys); + assert((long)keys.size() == num_keys); // Should always be the case. 
+ assert((long)values.size() == num_keys); + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + options.snapshot = nullptr; + } + + // Count number found + long found = 0; + for(i=0; irand, FLAGS_num, ""); + thread->stats.FinishedSingleOp(db_); + keys_left -= num_keys; + } + } else if (FLAGS_use_tailing_iterator) { // use tailing iterator for gets + options.tailing = true; + Iterator* iter = db_->NewIterator(options); + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + + iter->Seek(key); + read++; + if (iter->Valid() && iter->key().compare(Slice(key)) == 0) { + found++; + } + + thread->stats.FinishedSingleOp(db_); + } + delete iter; + } else { // Regular case. Do one "get" at a time Get + options.tailing = true; + options.prefix_seek = (FLAGS_prefix_size == 0); + Iterator* iter = db_->NewIterator(options); + std::string value; + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_read_range < 2) { + read++; + if (db_->Get(options, key, &value).ok()) { + found++; + } + } else { + int count = 1; + + if (FLAGS_get_approx) { + std::string key2 = + GenerateKeyFromInt(k + static_cast(FLAGS_read_range), + FLAGS_num + FLAGS_read_range); + Range range(key, key2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + read += FLAGS_read_range; + for (iter->Seek(key); + iter->Valid() && count <= FLAGS_read_range; + ++count, iter->Next()) { + found++; + } + } + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + options.snapshot = nullptr; + } + + thread->stats.FinishedSingleOp(db_); + } + + delete iter; + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", + found, read); + + thread->stats.AddMessage(msg); + + if (FLAGS_perf_level 
> 0) { + thread->stats.AddMessage(perf_context.ToString()); + } + } + + void PrefixScanRandom(ThreadState* thread) { + if (FLAGS_use_prefix_api) { + assert(FLAGS_use_prefix_blooms); + assert(FLAGS_bloom_bits >= 1); + } + + ReadOptions options(FLAGS_verify_checksum, true); + Duration duration(FLAGS_duration, reads_); + + int64_t found = 0; + + while (!duration.Done(1)) { + std::string value; + const int k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + Slice skey(key); + Slice prefix = prefix_extractor_->Transform(skey); + options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr; + + Iterator* iter = db_->NewIterator(options); + for (iter->Seek(skey); + iter->Valid() && iter->key().starts_with(prefix); + iter->Next()) { + found++; + } + delete iter; + + thread->stats.FinishedSingleOp(db_); + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", + found, reads_); + thread->stats.AddMessage(msg); + } + + void ReadMissing(ThreadState* thread) { + FLAGS_warn_missing_keys = false; // Never warn about missing keys + + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + + if (FLAGS_use_multiget) { + const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group + long keys_left = reads_; + + // Recalculate number of keys per group, and call MultiGet until done + long num_keys; + long found; + while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) { + found = + MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "."); + + // We should not find any key since the key we try to get has a + // different suffix + if (found) { + assert(false); + } + + thread->stats.FinishedSingleOp(db_); + keys_left -= num_keys; + } + } else { // Regular case (not MultiGet) + std::string value; + Status s; + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num) + "."; 
+ s = db_->Get(options, key, &value); + assert(!s.ok() && s.IsNotFound()); + thread->stats.FinishedSingleOp(db_); + } + } + } + + void ReadHot(ThreadState* thread) { + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + const int64_t range = (FLAGS_num + 99) / 100; + int64_t found = 0; + + if (FLAGS_use_multiget) { + const int64_t kpg = FLAGS_keys_per_multiget; // keys per multiget group + int64_t keys_left = reads_; + + // Recalculate number of keys per group, and call MultiGet until done + long num_keys; + while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) { + found += MultiGetRandom(options, num_keys, &thread->rand, range, ""); + thread->stats.FinishedSingleOp(db_); + keys_left -= num_keys; + } + } else { + std::string value; + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % range; + std::string key = GenerateKeyFromInt(k, range); + if (db_->Get(options, key, &value).ok()) { + ++found; + } + thread->stats.FinishedSingleOp(db_); + } + } + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", + found, reads_); + thread->stats.AddMessage(msg); + } + + void IteratorCreation(ThreadState* thread) { + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + while (!duration.Done(1)) { + Iterator* iter = db_->NewIterator(options); + delete iter; + thread->stats.FinishedSingleOp(db_); + } + } + + void SeekRandom(ThreadState* thread) { + Duration duration(FLAGS_duration, reads_); + ReadOptions options(FLAGS_verify_checksum, true); + std::string value; + int64_t found = 0; + while (!duration.Done(1)) { + Iterator* iter = db_->NewIterator(options); + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + iter->Seek(key); + if (iter->Valid() && iter->key() == Slice(key)) found++; + delete iter; + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, 
sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", + found, num_); + thread->stats.AddMessage(msg); + } + + void DoDelete(ThreadState* thread, bool seq) { + WriteBatch batch; + Status s; + Duration duration(seq ? 0 : FLAGS_duration, num_); + long i = 0; + while (!duration.Done(entries_per_batch_)) { + batch.Clear(); + for (int j = 0; j < entries_per_batch_; j++) { + const int64_t k = seq ? i+j : (thread->rand.Next() % FLAGS_num); + std::string key = GenerateKeyFromInt(k, FLAGS_num); + batch.Delete(key); + thread->stats.FinishedSingleOp(db_); + } + s = db_->Write(write_options_, &batch); + if (!s.ok()) { + fprintf(stderr, "del error: %s\n", s.ToString().c_str()); + exit(1); + } + ++i; + } + } + + void DeleteSeq(ThreadState* thread) { + DoDelete(thread, true); + } + + void DeleteRandom(ThreadState* thread) { + DoDelete(thread, false); + } + + void ReadWhileWriting(ThreadState* thread) { + if (thread->tid > 0) { + ReadRandom(thread); + } else { + // Special thread that keeps writing until other threads are done. + RandomGenerator gen; + double last = FLAGS_env->NowMicros(); + int writes_per_second_by_10 = 0; + int num_writes = 0; + + // --writes_per_second rate limit is enforced per 100 milliseconds + // intervals to avoid a burst of writes at the start of each second. + + if (FLAGS_writes_per_second > 0) + writes_per_second_by_10 = FLAGS_writes_per_second / 10; + + // Don't merge stats from this thread with the readers. 
+ thread->stats.SetExcludeFromMerge(); + + while (true) { + { + MutexLock l(&thread->shared->mu); + if (thread->shared->num_done + 1 >= thread->shared->num_initialized) { + // Other threads have finished + break; + } + } + + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + + ++num_writes; + if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { + double now = FLAGS_env->NowMicros(); + double usecs_since_last = now - last; + + num_writes = 0; + last = now; + + if (usecs_since_last < 100000.0) { + FLAGS_env->SleepForMicroseconds(100000.0 - usecs_since_last); + last = FLAGS_env->NowMicros(); + } + } + } + } + } + + // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V) + // in DB atomically i.e in a single batch. Also refer GetMany. + Status PutMany(const WriteOptions& writeoptions, + const Slice& key, const Slice& value) { + std::string suffixes[3] = {"2", "1", "0"}; + std::string keys[3]; + + WriteBatch batch; + Status s; + for (int i = 0; i < 3; i++) { + keys[i] = key.ToString() + suffixes[i]; + batch.Put(keys[i], value); + } + + s = db_->Write(writeoptions, &batch); + return s; + } + + + // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V) + // in DB atomically i.e in a single batch. Also refer GetMany. 
+ Status DeleteMany(const WriteOptions& writeoptions, + const Slice& key) { + std::string suffixes[3] = {"1", "2", "0"}; + std::string keys[3]; + + WriteBatch batch; + Status s; + for (int i = 0; i < 3; i++) { + keys[i] = key.ToString() + suffixes[i]; + batch.Delete(keys[i]); + } + + s = db_->Write(writeoptions, &batch); + return s; + } + + // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2" + // in the same snapshot, and verifies that all the values are identical. + // ASSUMES that PutMany was used to put (K, V) into the DB. + Status GetMany(const ReadOptions& readoptions, + const Slice& key, std::string* value) { + std::string suffixes[3] = {"0", "1", "2"}; + std::string keys[3]; + Slice key_slices[3]; + std::string values[3]; + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = db_->GetSnapshot(); + Status s; + for (int i = 0; i < 3; i++) { + keys[i] = key.ToString() + suffixes[i]; + key_slices[i] = keys[i]; + s = db_->Get(readoptionscopy, key_slices[i], value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + values[i] = ""; + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (s.IsNotFound()) { + values[i] = ""; + } else { + values[i] = *value; + } + } + db_->ReleaseSnapshot(readoptionscopy.snapshot); + + if ((values[0] != values[1]) || (values[1] != values[2])) { + fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n", + key.ToString().c_str(), values[0].c_str(), values[1].c_str(), + values[2].c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + + return s; + } + + // Differs from readrandomwriterandom in the following ways: + // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs. 
+ // (b) Does deletes as well (per FLAGS_deletepercent) + // (c) In order to achieve high % of 'found' during lookups, and to do + // multiple writes (including puts and deletes) it uses upto + // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys. + // (d) Does not have a MultiGet option. + void RandomWithVerify(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + int64_t found = 0; + int get_weight = 0; + int put_weight = 0; + int delete_weight = 0; + int64_t gets_done = 0; + int64_t puts_done = 0; + int64_t deletes_done = 0; + + // the number of iterations is the larger of read_ or write_ + for (int64_t i = 0; i < readwrites_; i++) { + const int64_t k = thread->rand.Next() % (FLAGS_numdistinct); + std::string key = GenerateKeyFromInt(k, FLAGS_numdistinct); + if (get_weight == 0 && put_weight == 0 && delete_weight == 0) { + // one batch completed, reinitialize for next batch + get_weight = FLAGS_readwritepercent; + delete_weight = FLAGS_deletepercent; + put_weight = 100 - get_weight - delete_weight; + } + if (get_weight > 0) { + // do all the gets first + Status s = GetMany(options, key, &value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "getmany error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + found++; + } + get_weight--; + gets_done++; + } else if (put_weight > 0) { + // then do all the corresponding number of puts + // for all the gets we have done earlier + Status s = PutMany(write_options_, key, gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "putmany error: %s\n", s.ToString().c_str()); + exit(1); + } + put_weight--; + puts_done++; + } else if (delete_weight > 0) { + Status s = DeleteMany(write_options_, key); + if (!s.ok()) { + fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str()); + exit(1); + } + delete_weight--; + 
deletes_done++; + } + + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \ + PRIu64 " found:%" PRIu64 ")", + gets_done, puts_done, deletes_done, readwrites_, found); + thread->stats.AddMessage(msg); + } + + // This is different from ReadWhileWriting because it does not use + // an extra thread. + void ReadRandomWriteRandom(ThreadState* thread) { + if (FLAGS_use_multiget){ + // Separate function for multiget (for ease of reading) + ReadRandomWriteRandomMultiGet(thread); + return; + } + + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + int64_t found = 0; + int get_weight = 0; + int put_weight = 0; + int64_t reads_done = 0; + int64_t writes_done = 0; + Duration duration(FLAGS_duration, readwrites_); + + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + if (get_weight == 0 && put_weight == 0) { + // one batch completed, reinitialize for next batch + get_weight = FLAGS_readwritepercent; + put_weight = 100 - get_weight; + } + if (get_weight > 0) { + + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_get_approx) { + std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1); + Range range(key, key2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + // do all the gets first + Status s = db_->Get(options, key, &value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + found++; + } + + get_weight--; + reads_done++; + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + } + + } else if (put_weight > 0) { + // then do all the 
corresponding number of puts + // for all the gets we have done earlier + Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + put_weight--; + writes_done++; + } + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \ + " total:%" PRIu64 " found:%" PRIu64 ")", + reads_done, writes_done, readwrites_, found); + thread->stats.AddMessage(msg); + } + + // ReadRandomWriteRandom (with multiget) + // Does FLAGS_keys_per_multiget reads (per multiget), followed by some puts. + // FLAGS_readwritepercent will specify the ratio of gets to puts. + // e.g.: If FLAGS_keys_per_multiget == 100 and FLAGS_readwritepercent == 75 + // Then each block will do 100 multigets and 33 puts + // So there are 133 operations in-total: 100 of them (75%) are gets, and 33 + // of them (25%) are puts. + void ReadRandomWriteRandomMultiGet(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + + // For multiget + const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group + + long keys_left = readwrites_; // number of keys still left to read + long num_keys; // number of keys to read in current group + long num_put_keys; // number of keys to put in current group + + int64_t found = 0; + int64_t reads_done = 0; + int64_t writes_done = 0; + int64_t multigets_done = 0; + + // the number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while(true) { + // Read num_keys keys, then write num_put_keys keys. 
+ // The ratio of num_keys to num_put_keys is always FLAGS_readwritepercent + // And num_keys is set to be FLAGS_keys_per_multiget (kpg) + // num_put_keys is calculated accordingly (to maintain the ratio) + // Note: On the final iteration, num_keys and num_put_keys will be smaller + num_keys = std::min(keys_left*(FLAGS_readwritepercent + 99)/100, kpg); + num_put_keys = num_keys * (100-FLAGS_readwritepercent) + / FLAGS_readwritepercent; + + // This will break the loop when duration is complete + if (duration.Done(num_keys + num_put_keys)) { + break; + } + + // A quick check to make sure our formula doesn't break on edge cases + assert(num_keys >= 1); + assert(num_keys + num_put_keys <= keys_left); + + // Apply the MultiGet operations + found += MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, ""); + ++multigets_done; + reads_done+=num_keys; + thread->stats.FinishedSingleOp(db_); + + // Now do the puts + int i; + int64_t k; + for(i=0; irand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + Status s = db_->Put(write_options_, key, + gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + writes_done++; + thread->stats.FinishedSingleOp(db_); + } + + keys_left -= (num_keys + num_put_keys); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64 \ + " multiget_ops:%" PRIu64 " found:%" PRIu64 ")", + reads_done, writes_done, readwrites_, multigets_done, found); + thread->stats.AddMessage(msg); + } + + // + // Read-modify-write for random keys + void UpdateRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + int64_t found = 0; + Duration duration(FLAGS_duration, readwrites_); + + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = 
GenerateKeyFromInt(k, FLAGS_num); + + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_get_approx) { + std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1); + Range range(key, key2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + if (db_->Get(options, key, &value).ok()) { + found++; + } + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + } + + Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found); + thread->stats.AddMessage(msg); + } + + // Read-modify-write for random keys. + // Each operation causes the key grow by value_size (simulating an append). + // Generally used for benchmarking against merges of similar type + void AppendRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + int64_t found = 0; + + // The number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % FLAGS_num; + std::string key = GenerateKeyFromInt(k, FLAGS_num); + + if (FLAGS_use_snapshot) { + options.snapshot = db_->GetSnapshot(); + } + + if (FLAGS_get_approx) { + std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1); + Range range(key, key2); + uint64_t sizes; + db_->GetApproximateSizes(&range, 1, &sizes); + } + + // Get the existing value + if (db_->Get(options, key, &value).ok()) { + found++; + } else { + // If not existing, then just assume an empty string of data + value.clear(); + } + + if (FLAGS_use_snapshot) { + db_->ReleaseSnapshot(options.snapshot); + } + + // Update the value (by appending data) + Slice operand = gen.Generate(value_size_); + 
if (value.size() > 0) { + // Use a delimeter to match the semantics for StringAppendOperator + value.append(1,','); + } + value.append(operand.data(), operand.size()); + + // Write back to the database + Status s = db_->Put(write_options_, key, value); + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", + readwrites_, found); + thread->stats.AddMessage(msg); + } + + // Read-modify-write for random keys (using MergeOperator) + // The merge operator to use should be defined by FLAGS_merge_operator + // Adjust FLAGS_value_size so that the keys are reasonable for this operator + // Assumes that the merge operator is non-null (i.e.: is well-defined) + // + // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8 + // to simulate random additions over 64-bit integers using merge. + // + // The number of merges on the same key can be controlled by adjusting + // FLAGS_merge_keys. + void MergeRandom(ThreadState* thread) { + RandomGenerator gen; + + // The number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % merge_keys_; + std::string key = GenerateKeyFromInt(k, merge_keys_); + + Status s = db_->Merge(write_options_, key, gen.Generate(value_size_)); + + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + thread->stats.FinishedSingleOp(db_); + } + + // Print some statistics + char msg[100]; + snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_); + thread->stats.AddMessage(msg); + } + + // Read and merge random keys. The amount of reads and merges are controlled + // by adjusting FLAGS_num and FLAGS_mergereadpercent. 
The number of distinct + // keys (and thus also the number of reads and merges on the same key) can be + // adjusted with FLAGS_merge_keys. + // + // As with MergeRandom, the merge operator to use should be defined by + // FLAGS_merge_operator. + void ReadRandomMergeRandom(ThreadState* thread) { + ReadOptions options(FLAGS_verify_checksum, true); + RandomGenerator gen; + std::string value; + int64_t num_hits = 0; + int64_t num_gets = 0; + int64_t num_merges = 0; + size_t max_length = 0; + + // the number of iterations is the larger of read_ or write_ + Duration duration(FLAGS_duration, readwrites_); + + while (!duration.Done(1)) { + const int64_t k = thread->rand.Next() % merge_keys_; + std::string key = GenerateKeyFromInt(k, merge_keys_); + + bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; + + if (do_merge) { + Status s = db_->Merge(write_options_, key, gen.Generate(value_size_)); + if (!s.ok()) { + fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); + exit(1); + } + + num_merges++; + + } else { + Status s = db_->Get(options, key, &value); + if (value.length() > max_length) + max_length = value.length(); + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (!s.IsNotFound()) { + num_hits++; + } + + num_gets++; + + } + + thread->stats.FinishedSingleOp(db_); + } + char msg[100]; + snprintf(msg, sizeof(msg), + "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \ + PRIu64 " maxlength:%zu)", + num_gets, num_merges, readwrites_, num_hits, max_length); + thread->stats.AddMessage(msg); + } + + + void Compact(ThreadState* thread) { + db_->CompactRange(nullptr, nullptr); + } + + void PrintStats(const char* key) { + std::string stats; + if (!db_->GetProperty(key, &stats)) { + stats = "(failed)"; + } + fprintf(stdout, "\n%s\n", stats.c_str()); + } + + static void WriteToFile(void* 
arg, const char* buf, int n) { + reinterpret_cast(arg)->Append(Slice(buf, n)); + } + + void HeapProfile() { + char fname[100]; + EnvOptions soptions; + snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db.c_str(), + ++heap_counter_); + unique_ptr file; + Status s = FLAGS_env->NewWritableFile(fname, &file, soptions); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.ToString().c_str()); + return; + } + bool ok = port::GetHeapProfile(WriteToFile, file.get()); + if (!ok) { + fprintf(stderr, "heap profiling not supported\n"); + FLAGS_env->DeleteFile(fname); + } + } +}; + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::InstallStackTraceHandler(); + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + google::ParseCommandLineFlags(&argc, &argv, true); + + FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style; + if (FLAGS_statistics) { + dbstats = rocksdb::CreateDBStatistics(); + } + + std::vector fanout = + rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional, ','); + for (unsigned int j= 0; j < fanout.size(); j++) { + FLAGS_max_bytes_for_level_multiplier_additional_v.push_back( + std::stoi(fanout[j])); + } + + FLAGS_compression_type_e = + StringToCompressionType(FLAGS_compression_type.c_str()); + + if (!FLAGS_hdfs.empty()) { + FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs); + } + + if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE")) + FLAGS_compaction_fadvice_e = rocksdb::Options::NONE; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL")) + FLAGS_compaction_fadvice_e = rocksdb::Options::NORMAL; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL")) + FLAGS_compaction_fadvice_e = rocksdb::Options::SEQUENTIAL; + else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED")) + FLAGS_compaction_fadvice_e = rocksdb::Options::WILLNEED; + else { + fprintf(stdout, "Unknown compaction fadvice:%s\n", + 
FLAGS_compaction_fadvice.c_str()); + } + + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); + + // The number of background threads should be at least as much the + // max number of concurrent compactions. + FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); + // Choose a location for the test database if none given with --db= + if (FLAGS_db.empty()) { + std::string default_db_path; + rocksdb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbbench"; + FLAGS_db = default_db_path; + } + + rocksdb::Benchmark benchmark; + benchmark.Run(); + return 0; +} diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc new file mode 100644 index 00000000..aed8615a --- /dev/null +++ b/db/db_filesnapshot.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#define __STDC_FORMAT_MACROS +#include +#include +#include +#include +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" + +namespace rocksdb { + +Status DBImpl::DisableFileDeletions() { + MutexLock l(&mutex_); + ++disable_delete_obsolete_files_; + if (disable_delete_obsolete_files_ == 1) { + // if not, it has already been disabled, so don't log anything + Log(options_.info_log, "File Deletions Disabled"); + } + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + DeletionState deletion_state; + bool should_purge_files = false; + { + MutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + if (disable_delete_obsolete_files_ == 0) { + Log(options_.info_log, "File Deletions Enabled"); + should_purge_files = true; + FindObsoleteFiles(deletion_state, true); + } + } + if (should_purge_files) { + PurgeObsoleteFiles(deletion_state); + } + LogFlush(options_.info_log); + return Status::OK(); +} + +Status DBImpl::GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool flush_memtable) { + + *manifest_file_size = 0; + + if (flush_memtable) { + // flush all dirty data to disk. + Status status = Flush(FlushOptions()); + if (!status.ok()) { + Log(options_.info_log, "Cannot Flush data %s\n", + status.ToString().c_str()); + return status; + } + } + + MutexLock l(&mutex_); + + // Make a set of all of the live *.sst files + std::set live; + versions_->current()->AddLiveFiles(&live); + + ret.clear(); + ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST + + // create names of the live files. 
The names are not absolute + // paths, instead they are relative to dbname_; + for (auto live_file : live) { + ret.push_back(TableFileName("", live_file)); + } + + ret.push_back(CurrentFileName("")); + ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber())); + + // find length of manifest file while holding the mutex lock + *manifest_file_size = versions_->ManifestFileSize(); + + return Status::OK(); +} + +Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { + // First get sorted files in db dir, then get sorted files from archived + // dir, to avoid a race condition where a log file is moved to archived + // dir in between. + Status s; + // list wal files in main db dir. + VectorLogPtr logs; + s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); + if (!s.ok()) { + return s; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1"); + TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2"); + + files.clear(); + // list wal files in archive dir. + std::string archivedir = ArchivalDirectory(options_.wal_dir); + if (env_->FileExists(archivedir)) { + s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); + if (!s.ok()) { + return s; + } + } + + uint64_t latest_archived_log_number = 0; + if (!files.empty()) { + latest_archived_log_number = files.back()->LogNumber(); + Log(options_.info_log, "Latest Archived log: %" PRIu64, + latest_archived_log_number); + } + + files.reserve(files.size() + logs.size()); + for (auto& log : logs) { + if (log->LogNumber() > latest_archived_log_number) { + files.push_back(std::move(log)); + } else { + // When the race condition happens, we could see the + // same log in both db dir and archived dir. Simply + // ignore the one in db dir. Note that, if we read + // archived dir first, we would have missed the log file. 
+ Log(options_.info_log, "%s already moved to archive", + log->PathName().c_str()); + } + } + + return s; +} + +} diff --git a/db/db_impl.cc b/db/db_impl.cc new file mode 100644 index 00000000..72010ede --- /dev/null +++ b/db/db_impl.cc @@ -0,0 +1,4474 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" + +#define __STDC_FORMAT_MACROS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/prefix_filter_iterator.h" +#include "db/table_cache.h" +#include "db/table_properties_collector.h" +#include "db/tailing_iter.h" +#include "db/transaction_log_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "port/likely.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/merger.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "util/auto_roll_logger.h" +#include "util/autovector.h" +#include 
"util/build_version.h" +#include "util/coding.h" +#include "util/hash_skiplist_rep.h" +#include "util/hash_linklist_rep.h" +#include "util/logging.h" +#include "util/log_buffer.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" +#include "util/sync_point.h" + +namespace rocksdb { + +int DBImpl::SuperVersion::dummy = 0; +void* const DBImpl::SuperVersion::kSVInUse = &DBImpl::SuperVersion::dummy; +void* const DBImpl::SuperVersion::kSVObsolete = nullptr; + +void DumpLeveldbBuildVersion(Logger * log); + +// Information kept for every waiting writer +struct DBImpl::Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool done; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) : cv(mu) { } +}; + +struct DBImpl::CompactionState { + Compaction* const compaction; + + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 then lies + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. 
+ std::vector existing_snapshots; + + // Files produced by compaction + struct Output { + uint64_t number; + uint64_t file_size; + InternalKey smallest, largest; + SequenceNumber smallest_seqno, largest_seqno; + }; + std::vector outputs; + std::list allocated_file_numbers; + + // State kept for output being generated + unique_ptr outfile; + unique_ptr builder; + + uint64_t total_bytes; + + Output* current_output() { return &outputs[outputs.size()-1]; } + + explicit CompactionState(Compaction* c) + : compaction(c), + total_bytes(0) { + } + + // Create a client visible context of this compaction + CompactionFilter::Context GetFilterContextV1() { + CompactionFilter::Context context; + context.is_full_compaction = compaction->IsFullCompaction(); + context.is_manual_compaction = compaction->IsManualCompaction(); + return context; + } + + // Create a client visible context of this compaction + CompactionFilterContext GetFilterContext() { + CompactionFilterContext context; + context.is_full_compaction = compaction->IsFullCompaction(); + context.is_manual_compaction = compaction->IsManualCompaction(); + return context; + } + + std::vector key_buf_; + std::vector existing_value_buf_; + std::vector key_str_buf_; + std::vector existing_value_str_buf_; + // new_value_buf_ will only be appended if a value changes + std::vector new_value_buf_; + // if values_changed_buf_[i] is true + // new_value_buf_ will add a new entry with the changed value + std::vector value_changed_buf_; + // to_delete_buf_[i] is true iff key_buf_[i] is deleted + std::vector to_delete_buf_; + // buffer for the parsed internal keys, the string buffer is backed + // by key_str_buf_ + std::vector ikey_buf_; + + std::vector other_key_buf_; + std::vector other_value_buf_; + std::vector other_key_str_buf_; + std::vector other_value_str_buf_; + + std::vector combined_key_buf_; + std::vector combined_value_buf_; + + std::string cur_prefix_; + + // Buffers the kv-pair that will be run through compaction filter V2 
+ // in the future. + void BufferKeyValueSlices(const Slice& key, const Slice& value) { + key_str_buf_.emplace_back(key.ToString()); + existing_value_str_buf_.emplace_back(value.ToString()); + key_buf_.emplace_back(Slice(key_str_buf_.back())); + existing_value_buf_.emplace_back(Slice(existing_value_str_buf_.back())); + + ParsedInternalKey ikey; + ParseInternalKey(key_buf_.back(), &ikey); + ikey_buf_.emplace_back(ikey); + } + + // Buffers the kv-pair that will not be run through compaction filter V2 + // in the future. + void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) { + other_key_str_buf_.emplace_back(key.ToString()); + other_value_str_buf_.emplace_back(value.ToString()); + other_key_buf_.emplace_back(Slice(other_key_str_buf_.back())); + other_value_buf_.emplace_back(Slice(other_value_str_buf_.back())); + } + + // Add a kv-pair to the combined buffer + void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) { + // The real strings are stored in the batch buffers + combined_key_buf_.emplace_back(key); + combined_value_buf_.emplace_back(value); + } + + // Merging the two buffers + void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) { + size_t i = 0; + size_t j = 0; + size_t total_size = key_buf_.size() + other_key_buf_.size(); + combined_key_buf_.reserve(total_size); + combined_value_buf_.reserve(total_size); + + while (i + j < total_size) { + int comp_res = 0; + if (i < key_buf_.size() && j < other_key_buf_.size()) { + comp_res = comparator->Compare(key_buf_[i], other_key_buf_[j]); + } else if (i >= key_buf_.size() && j < other_key_buf_.size()) { + comp_res = 1; + } else if (j >= other_key_buf_.size() && i < key_buf_.size()) { + comp_res = -1; + } + if (comp_res > 0) { + AddToCombinedKeyValueSlices(other_key_buf_[j], other_value_buf_[j]); + j++; + } else if (comp_res < 0) { + AddToCombinedKeyValueSlices(key_buf_[i], existing_value_buf_[i]); + i++; + } + } + } + + void CleanupBatchBuffer() { + 
to_delete_buf_.clear(); + key_buf_.clear(); + existing_value_buf_.clear(); + key_str_buf_.clear(); + existing_value_str_buf_.clear(); + new_value_buf_.clear(); + value_changed_buf_.clear(); + ikey_buf_.clear(); + + to_delete_buf_.shrink_to_fit(); + key_buf_.shrink_to_fit(); + existing_value_buf_.shrink_to_fit(); + key_str_buf_.shrink_to_fit(); + existing_value_str_buf_.shrink_to_fit(); + new_value_buf_.shrink_to_fit(); + value_changed_buf_.shrink_to_fit(); + ikey_buf_.shrink_to_fit(); + + other_key_buf_.clear(); + other_value_buf_.clear(); + other_key_str_buf_.clear(); + other_value_str_buf_.clear(); + other_key_buf_.shrink_to_fit(); + other_value_buf_.shrink_to_fit(); + other_key_str_buf_.shrink_to_fit(); + other_value_str_buf_.shrink_to_fit(); + } + + void CleanupMergedBuffer() { + combined_key_buf_.clear(); + combined_value_buf_.clear(); + combined_key_buf_.shrink_to_fit(); + combined_value_buf_.shrink_to_fit(); + } +}; + +// Fix user-supplied options to be reasonable +template +static void ClipToRange(T* ptr, V minvalue, V maxvalue) { + if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; + if (static_cast(*ptr) < minvalue) *ptr = minvalue; +} +Options SanitizeOptions(const std::string& dbname, + const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, + const Options& src) { + Options result = src; + result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr; + // result.max_open_files means an "infinite" open files. + if (result.max_open_files != -1) { + ClipToRange(&result.max_open_files, 20, 1000000); + } + ClipToRange(&result.write_buffer_size, ((size_t)64)<<10, + ((size_t)64)<<30); + ClipToRange(&result.block_size, 1<<10, 4<<20); + + // if user sets arena_block_size, we trust user to use this value. 
Otherwise, + // calculate a proper value from writer_buffer_size; + if (result.arena_block_size <= 0) { + result.arena_block_size = result.write_buffer_size / 10; + } + + result.min_write_buffer_number_to_merge = std::min( + result.min_write_buffer_number_to_merge, result.max_write_buffer_number-1); + if (result.info_log == nullptr) { + Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env, + result, &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = nullptr; + } + } + if (result.block_cache == nullptr && !result.no_block_cache) { + result.block_cache = NewLRUCache(8 << 20); + } + result.compression_per_level = src.compression_per_level; + if (result.block_size_deviation < 0 || result.block_size_deviation > 100) { + result.block_size_deviation = 0; + } + if (result.max_mem_compaction_level >= result.num_levels) { + result.max_mem_compaction_level = result.num_levels - 1; + } + if (result.soft_rate_limit > result.hard_rate_limit) { + result.soft_rate_limit = result.hard_rate_limit; + } + if (result.compaction_filter) { + Log(result.info_log, "Compaction filter specified, ignore factory"); + } + if (result.prefix_extractor) { + Log(result.info_log, "prefix extractor %s in use.", + result.prefix_extractor->Name()); + } else { + assert(result.memtable_factory); + Slice name = result.memtable_factory->Name(); + if (name.compare("HashSkipListRepFactory") == 0 || + name.compare("HashLinkListRepFactory") == 0) { + Log(result.info_log, "prefix extractor is not provided while using %s. 
" + "fallback to skiplist", name.ToString().c_str()); + result.memtable_factory = std::make_shared(); + } + } + + if (result.wal_dir.empty()) { + // Use dbname as default + result.wal_dir = dbname; + } + + // -- Sanitize the table properties collector + // All user defined properties collectors will be wrapped by + // UserKeyTablePropertiesCollector since for them they only have the + // knowledge of the user keys; internal keys are invisible to them. + auto& collectors = result.table_properties_collectors; + for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) { + assert(collectors[i]); + collectors[i] = + std::make_shared(collectors[i]); + } + + // Add collector to collect internal key statistics + collectors.push_back( + std::make_shared() + ); + + return result; +} + +CompressionType GetCompressionType(const Options& options, int level, + const bool enable_compression) { + if (!enable_compression) { + // disable compression + return kNoCompression; + } + // If the use has specified a different compression level for each level, + // then pick the compresison for that level. + if (!options.compression_per_level.empty()) { + const int n = options.compression_per_level.size() - 1; + // It is possible for level_ to be -1; in that case, we use level + // 0's compression. This occurs mostly in backwards compatibility + // situations when the builder doesn't know what level the file + // belongs to. Likewise, if level_ is beyond the end of the + // specified compression levels, use the last value. + return options.compression_per_level[std::max(0, std::min(level, n))]; + } else { + return options.compression; + } +} + +CompressionType GetCompressionFlush(const Options& options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. Otherwise the CPU and + // latency overhead is not offset by saving much space. 
+ + bool can_compress; + + if (options.compaction_style == kCompactionStyleUniversal) { + can_compress = + (options.compaction_options_universal.compression_size_percent < 0); + } else { + // For leveled compress when min_level_to_compress == 0. + can_compress = (GetCompressionType(options, 0, true) != kNoCompression); + } + + if (can_compress) { + return options.compression; + } else { + return kNoCompression; + } +} + +DBImpl::DBImpl(const Options& options, const std::string& dbname) + : env_(options.env), + dbname_(dbname), + internal_comparator_(options.comparator), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), + internal_filter_policy_(options.filter_policy), + owns_info_log_(options_.info_log != options.info_log), + db_lock_(nullptr), + mutex_(options.use_adaptive_mutex), + shutting_down_(nullptr), + bg_cv_(&mutex_), + mem_(new MemTable(internal_comparator_, options_)), + imm_(options_.min_write_buffer_number_to_merge), + logfile_number_(0), + super_version_(nullptr), + super_version_number_(0), + local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), + tmp_batch_(), + bg_schedule_needed_(false), + bg_compaction_scheduled_(0), + bg_manual_only_(0), + bg_flush_scheduled_(0), + bg_logstats_scheduled_(false), + manual_compaction_(nullptr), + logger_(nullptr), + disable_delete_obsolete_files_(0), + delete_obsolete_files_last_run_(options.env->NowMicros()), + purge_wal_files_last_run_(0), + last_stats_dump_time_microsec_(0), + default_interval_to_delete_obsolete_WAL_(600), + flush_on_destroy_(false), + internal_stats_(options.num_levels, options.env, + options.statistics.get()), + delayed_writes_(0), + storage_options_(options), + bg_work_gate_closed_(false), + refitting_level_(false), + opened_successfully_(false) { + mem_->Ref(); + env_->GetAbsolutePath(dbname, &db_absolute_path_); + + // Reserve ten files or so for other uses and give the rest to TableCache. 
+ // Give a large number for setting of "infinite" open files. + const int table_cache_size = + (options_.max_open_files == -1) ? + 4194304 : options_.max_open_files - 10; + table_cache_.reset(new TableCache(dbname_, &options_, + storage_options_, table_cache_size)); + versions_.reset(new VersionSet(dbname_, &options_, storage_options_, + table_cache_.get(), &internal_comparator_)); + + DumpLeveldbBuildVersion(options_.info_log.get()); + options_.Dump(options_.info_log.get()); + + char name[100]; + Status s = env_->GetHostName(name, 100L); + if (s.ok()) { + host_name_ = name; + } else { + Log(options_.info_log, "Can't get hostname, use localhost as host name."); + host_name_ = "localhost"; + } + last_log_ts = 0; + + LogFlush(options_.info_log); +} + +DBImpl::~DBImpl() { + // Wait for background work to finish + if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) { + FlushMemTable(FlushOptions()); + } + + mutex_.Lock(); + shutting_down_.Release_Store(this); // Any non-nullptr value is ok + while (bg_compaction_scheduled_ || + bg_flush_scheduled_ || + bg_logstats_scheduled_) { + bg_cv_.Wait(); + } + mutex_.Unlock(); + + // Release SuperVersion reference kept in ThreadLocalPtr. + // This must be done outside of mutex_ since unref handler can lock mutex. + // It also needs to be done after FlushMemTable, which can trigger local_sv_ + // access. + delete local_sv_; + + mutex_.Lock(); + if (options_.allow_thread_local) { + // Clean up obsolete files due to SuperVersion release. + // (1) Need to delete to obsolete files before closing because RepairDB() + // scans all existing files in the file system and builds manifest file. + // Keeping obsolete files confuses the repair process. + // (2) Need to check if we Open()/Recover() the DB successfully before + // deleting because if VersionSet recover fails (may be due to corrupted + // manifest file), it is not able to identify live files correctly. As a + // result, all "live" files can get deleted by accident. 
However, corrupted + // manifest is recoverable by RepairDB(). + if (opened_successfully_) { + DeletionState deletion_state; + FindObsoleteFiles(deletion_state, true); + // manifest number starting from 2 + deletion_state.manifest_file_number = 1; + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); + } + } + } + + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } + mutex_.Unlock(); + + if (db_lock_ != nullptr) { + env_->UnlockFile(db_lock_); + } + + if (mem_ != nullptr) { + delete mem_->Unref(); + } + + autovector to_delete; + imm_.current()->Unref(&to_delete); + for (MemTable* m: to_delete) { + delete m; + } + // versions need to be destroyed before table_cache since it can holds + // references to table_cache. + versions_.reset(); + LogFlush(options_.info_log); +} + +// Do not flush and close database elegantly. Simulate a crash. +void DBImpl::TEST_Destroy_DBImpl() { + // ensure that no new memtable flushes can occur + flush_on_destroy_ = false; + + // wait till all background compactions are done. + mutex_.Lock(); + while (bg_compaction_scheduled_ || + bg_flush_scheduled_ || + bg_logstats_scheduled_) { + bg_cv_.Wait(); + } + mutex_.Unlock(); + + // Release SuperVersion reference kept in ThreadLocalPtr. + // This must be done outside of mutex_ since unref handler can lock mutex. + // It also needs to be done after FlushMemTable, which can trigger local_sv_ + // access. + delete local_sv_; + + mutex_.Lock(); + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } + + // Prevent new compactions from occuring. 
+ bg_work_gate_closed_ = true; + const int LargeNumber = 10000000; + bg_compaction_scheduled_ += LargeNumber; + + mutex_.Unlock(); + LogFlush(options_.info_log); + + // force release the lock file. + if (db_lock_ != nullptr) { + env_->UnlockFile(db_lock_); + } + + log_.reset(); + versions_.reset(); + table_cache_.reset(); +} + +uint64_t DBImpl::TEST_Current_Manifest_FileNo() { + return versions_->ManifestFileNumber(); +} + +Status DBImpl::NewDB() { + VersionEdit new_db; + new_db.SetComparatorName(user_comparator()->Name()); + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + unique_ptr file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); + if (!s.ok()) { + return s; + } + file->SetPreallocationBlockSize(options_.manifest_preallocation_size); + { + log::Writer log(std::move(file)); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. 
+ s = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(manifest); + } + return s; +} + +void DBImpl::MaybeIgnoreError(Status* s) const { + if (s->ok() || options_.paranoid_checks) { + // No change needed + } else { + Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); + *s = Status::OK(); + } +} + +const Status DBImpl::CreateArchivalDirectory() { + if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) { + std::string archivalPath = ArchivalDirectory(options_.wal_dir); + return env_->CreateDirIfMissing(archivalPath); + } + return Status::OK(); +} + +void DBImpl::PrintStatistics() { + auto dbstats = options_.statistics.get(); + if (dbstats) { + Log(options_.info_log, + "STATISTCS:\n %s", + dbstats->ToString().c_str()); + } +} + +void DBImpl::MaybeDumpStats() { + if (options_.stats_dump_period_sec == 0) return; + + const uint64_t now_micros = env_->NowMicros(); + + if (last_stats_dump_time_microsec_ + + options_.stats_dump_period_sec * 1000000 + <= now_micros) { + // Multiple threads could race in here simultaneously. + // However, the last one will update last_stats_dump_time_microsec_ + // atomically. We could see more than one dump during one dump + // period in rare cases. 
+ last_stats_dump_time_microsec_ = now_micros; + std::string stats; + GetProperty("rocksdb.stats", &stats); + Log(options_.info_log, "%s", stats.c_str()); + PrintStatistics(); + } +} + +// DBImpl::SuperVersion methods +DBImpl::SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool DBImpl::SuperVersion::Unref() { + assert(refs > 0); + // fetch_sub returns the previous value of yoeref + return refs.fetch_sub(1, std::memory_order_relaxed) == 1; +} + +void DBImpl::SuperVersion::Cleanup() { + db->mutex_.AssertHeld(); + assert(refs.load(std::memory_order_relaxed) == 0); + imm->Unref(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + to_delete.push_back(m); + } + current->Unref(); +} + +void DBImpl::SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm, + Version* new_current) { + db->mutex_.AssertHeld(); + mem = new_mem; + imm = new_imm; + current = new_current; + mem->Ref(); + imm->Ref(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + +// Returns the list of live files in 'sst_live' and the list +// of all files in the filesystem in 'candidate_files'. 
+// no_full_scan = true -- never do the full scan using GetChildren() +// force = false -- don't force the full scan, except every +// options_.delete_obsolete_files_period_micros +// force = true -- force the full scan +void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, + bool force, + bool no_full_scan) { + mutex_.AssertHeld(); + + // if deletion is disabled, do nothing + if (disable_delete_obsolete_files_ > 0) { + return; + } + + bool doing_the_full_scan = false; + + // logic for figurint out if we're doing the full scan + if (no_full_scan) { + doing_the_full_scan = false; + } else if (force || options_.delete_obsolete_files_period_micros == 0) { + doing_the_full_scan = true; + } else { + const uint64_t now_micros = env_->NowMicros(); + if (delete_obsolete_files_last_run_ + + options_.delete_obsolete_files_period_micros < now_micros) { + doing_the_full_scan = true; + delete_obsolete_files_last_run_ = now_micros; + } + } + + // get obsolete files + versions_->GetObsoleteFiles(&deletion_state.sst_delete_files); + + // store the current filenum, lognum, etc + deletion_state.manifest_file_number = versions_->ManifestFileNumber(); + deletion_state.pending_manifest_file_number = + versions_->PendingManifestFileNumber(); + deletion_state.log_number = versions_->LogNumber(); + deletion_state.prev_log_number = versions_->PrevLogNumber(); + + if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) { + // avoid filling up sst_live if we're sure that we + // are not going to do the full scan and that we don't have + // anything to delete at the moment + return; + } + + // don't delete live files + deletion_state.sst_live.assign(pending_outputs_.begin(), + pending_outputs_.end()); + versions_->AddLiveFiles(&deletion_state.sst_live); + + if (doing_the_full_scan) { + // set of all files in the directory. We'll exclude files that are still + // alive in the subsequent processings. 
+ env_->GetChildren( + dbname_, &deletion_state.candidate_files + ); // Ignore errors + + //Add log files in wal_dir + if (options_.wal_dir != dbname_) { + std::vector log_files; + env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors + deletion_state.candidate_files.insert( + deletion_state.candidate_files.end(), + log_files.begin(), + log_files.end() + ); + } + } +} + +// Diffs the files listed in filenames and those that do not +// belong to live files are posibly removed. Also, removes all the +// files in sst_delete_files and log_delete_files. +// It is not necessary to hold the mutex when invoking this method. +void DBImpl::PurgeObsoleteFiles(DeletionState& state) { + // we'd better have sth to delete + assert(state.HaveSomethingToDelete()); + + // this checks if FindObsoleteFiles() was run before. If not, don't do + // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also + // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true + if (state.manifest_file_number == 0) { + return; + } + + // Now, convert live list to an unordered set, WITHOUT mutex held; + // set is slow. + std::unordered_set sst_live(state.sst_live.begin(), + state.sst_live.end()); + + auto& candidate_files = state.candidate_files; + candidate_files.reserve( + candidate_files.size() + + state.sst_delete_files.size() + + state.log_delete_files.size()); + // We may ignore the dbname when generating the file names. 
+ const char* kDumbDbName = ""; + for (auto file : state.sst_delete_files) { + candidate_files.push_back( + TableFileName(kDumbDbName, file->number).substr(1) + ); + delete file; + } + + for (auto file_num : state.log_delete_files) { + if (file_num > 0) { + candidate_files.push_back(LogFileName(kDumbDbName, file_num).substr(1)); + } + } + + // dedup state.candidate_files so we don't try to delete the same + // file twice + sort(candidate_files.begin(), candidate_files.end()); + candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end()); + + std::vector old_info_log_files; + + for (const auto& to_delete : candidate_files) { + uint64_t number; + FileType type; + // Ignore file if we cannot recognize it. + if (!ParseFileName(to_delete, &number, &type)) { + continue; + } + + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number >= state.log_number) || + (number == state.prev_log_number)); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (can happen during manifest roll) + keep = (number >= state.manifest_file_number); + break; + case kTableFile: + keep = (sst_live.find(number) != sst_live.end()); + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live". + // Also, SetCurrentFile creates a temp file when writing out new + // manifest, which is equal to state.pending_manifest_file_number. 
We + // should not delete that file + keep = (sst_live.find(number) != sst_live.end()) || + (number == state.pending_manifest_file_number); + break; + case kInfoLogFile: + keep = true; + if (number != 0) { + old_info_log_files.push_back(to_delete); + } + break; + case kCurrentFile: + case kDBLockFile: + case kIdentityFile: + case kMetaDatabase: + keep = true; + break; + } + + if (keep) { + continue; + } + + if (type == kTableFile) { + // evict from cache + table_cache_->Evict(number); + } + + std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) + + "/" + to_delete; + if (type == kLogFile && + (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { + auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); + Status s = env_->RenameFile(fname, archived_log_name); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); + Log(options_.info_log, + "Move log file %s to %s -- %s\n", + fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); + } else { + Status s = env_->DeleteFile(fname); + Log(options_.info_log, "Delete %s type=%d #%lu -- %s\n", + fname.c_str(), type, (unsigned long)number, + s.ToString().c_str()); + } + } + + // Delete old info log files. + size_t old_info_log_file_count = old_info_log_files.size(); + // NOTE: Currently we only support log purge when options_.db_log_dir is + // located in `dbname` directory. 
+ if (old_info_log_file_count >= options_.keep_log_file_num && + options_.db_log_dir.empty()) { + std::sort(old_info_log_files.begin(), old_info_log_files.end()); + size_t end = old_info_log_file_count - options_.keep_log_file_num; + for (unsigned int i = 0; i <= end; i++) { + std::string& to_delete = old_info_log_files.at(i); + Log(options_.info_log, "Delete info log file %s\n", to_delete.c_str()); + Status s = env_->DeleteFile(dbname_ + "/" + to_delete); + if (!s.ok()) { + Log(options_.info_log, "Delete info log file %s FAILED -- %s\n", + to_delete.c_str(), s.ToString().c_str()); + } + } + } + PurgeObsoleteWALFiles(); + LogFlush(options_.info_log); +} + +void DBImpl::DeleteObsoleteFiles() { + mutex_.AssertHeld(); + DeletionState deletion_state; + FindObsoleteFiles(deletion_state, true); + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); + } +} + +// 1. Go through all archived files and +// a. if ttl is enabled, delete outdated files +// b. if archive size limit is enabled, delete empty files, +// compute file number and size. +// 2. If size limit is enabled: +// a. compute how many files should be deleted +// b. get sorted non-empty archived logs +// c. delete what should be deleted +void DBImpl::PurgeObsoleteWALFiles() { + bool const ttl_enabled = options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = options_.WAL_size_limit_MB > 0; + if (!ttl_enabled && !size_limit_enabled) { + return; + } + + int64_t current_time; + Status s = env_->GetCurrentTime(¤t_time); + if (!s.ok()) { + Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str()); + assert(false); + return; + } + uint64_t const now_seconds = static_cast(current_time); + uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? 
+ options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; + + if (purge_wal_files_last_run_ + time_to_check > now_seconds) { + return; + } + + purge_wal_files_last_run_ = now_seconds; + + std::string archival_dir = ArchivalDirectory(options_.wal_dir); + std::vector files; + s = env_->GetChildren(archival_dir, &files); + if (!s.ok()) { + Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str()); + assert(false); + return; + } + + size_t log_files_num = 0; + uint64_t log_file_size = 0; + + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = archival_dir + "/" + f; + if (ttl_enabled) { + uint64_t file_m_time; + Status const s = env_->GetFileModificationTime(file_path, + &file_m_time); + if (!s.ok()) { + Log(options_.info_log, "Can't get file mod time: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + if (now_seconds - file_m_time > options_.WAL_ttl_seconds) { + Status const s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + continue; + } + } + + if (size_limit_enabled) { + uint64_t file_size; + Status const s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + Log(options_.info_log, "Can't get file size: %s: %s", + file_path.c_str(), s.ToString().c_str()); + return; + } else { + if (file_size > 0) { + log_file_size = std::max(log_file_size, file_size); + ++log_files_num; + } else { + Status s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + } + } + } + } + } + + if (0 == log_files_num || !size_limit_enabled) { + return; + } + + size_t const files_keep_num = options_.WAL_size_limit_MB * + 1024 * 1024 / log_file_size; + if (log_files_num <= files_keep_num) { + return; + } + + 
size_t files_del_num = log_files_num - files_keep_num; + VectorLogPtr archived_logs; + GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + + if (files_del_num > archived_logs.size()) { + Log(options_.info_log, "Trying to delete more archived log files than " + "exist. Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + Status const s = DeleteFile(file_path); + if (!s.ok()) { + Log(options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + } +} + +Status DBImpl::Recover(bool read_only, bool error_if_log_file_exist) { + mutex_.AssertHeld(); + + assert(db_lock_ == nullptr); + if (!read_only) { + // We call CreateDirIfMissing() as the directory may already exist (if we + // are reopening a DB), when this happens we don't want creating the + // directory to cause an error. However, we need to check if creating the + // directory fails or else we may get an obscure message about the lock + // file not existing. One real-world example of this occurring is if + // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. + // when dbname_ is "dir/db" but when "dir" doesn't exist. 
+ Status s = env_->CreateDirIfMissing(dbname_); + if (!s.ok()) { + return s; + } + + s = env_->NewDirectory(dbname_, &db_directory_); + if (!s.ok()) { + return s; + } + + s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + if (!env_->FileExists(CurrentFileName(dbname_))) { + if (options_.create_if_missing) { + // TODO: add merge_operator name check + s = NewDB(); + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + dbname_, "does not exist (create_if_missing is false)"); + } + } else { + if (options_.error_if_exists) { + return Status::InvalidArgument( + dbname_, "exists (error_if_exists is true)"); + } + } + // Check for the IDENTITY file and create it if not there + if (!env_->FileExists(IdentityFileName(dbname_))) { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } + } + + Status s = versions_->Recover(); + if (options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + if (s.ok()) { + SequenceNumber max_sequence(0); + + // Recover from all newer log files than the ones named in the + // descriptor (new log files may have been added by the previous + // incarnation without registering them in the descriptor). + // + // Note that PrevLogNumber() is no longer used, but we pay + // attention to it in case we are recovering a database + // produced by an older version of rocksdb. 
+ const uint64_t min_log = versions_->LogNumber(); + const uint64_t prev_log = versions_->PrevLogNumber(); + std::vector filenames; + s = env_->GetChildren(options_.wal_dir, &filenames); + if (!s.ok()) { + return s; + } + + std::vector logs; + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) + && type == kLogFile + && ((number >= min_log) || (number == prev_log))) { + logs.push_back(number); + } + } + + if (logs.size() > 0 && error_if_log_file_exist) { + return Status::Corruption("" + "The db was opened in readonly mode with error_if_log_file_exist" + "flag but a log file already exists"); + } + + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + for (const auto& log : logs) { + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsed(log); + s = RecoverLogFile(log, &max_sequence, read_only); + } + + if (s.ok()) { + if (versions_->LastSequence() < max_sequence) { + versions_->SetLastSequence(max_sequence); + } + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + versions_->LastSequence()); + } + } + + return s; +} + +Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, + bool read_only) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + Status* status; // nullptr if options_.paranoid_checks==false or + // options_.skip_log_error_on_recovery==true + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "%s%s: dropping %d bytes; %s", + (this->status == nullptr ? 
"(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != nullptr && this->status->ok()) *this->status = s; + } + }; + + mutex_.AssertHeld(); + + VersionEdit edit; + + // Open the log file + std::string fname = LogFileName(options_.wal_dir, log_number); + unique_ptr file; + Status status = env_->NewSequentialFile(fname, &file, storage_options_); + if (!status.ok()) { + MaybeIgnoreError(&status); + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = (options_.paranoid_checks && + !options_.skip_log_error_on_recovery ? &status : nullptr); + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + Log(options_.info_log, "Recovering log #%lu", + (unsigned long) log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + bool memtable_empty = true; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + + status = WriteBatchInternal::InsertInto(&batch, mem_, &options_); + memtable_empty = false; + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } + const SequenceNumber last_seq = + WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (!read_only && mem_->ShouldFlush()) { + status = WriteLevel0TableForRecovery(mem_, &edit); + // we still want to clear memtable, even if the 
recovery failed + delete mem_->Unref(); + mem_ = new MemTable(internal_comparator_, options_); + mem_->Ref(); + memtable_empty = true; + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + return status; + } + } + } + + if (!memtable_empty && !read_only) { + status = WriteLevel0TableForRecovery(mem_, &edit); + delete mem_->Unref(); + mem_ = new MemTable(internal_comparator_, options_); + mem_->Ref(); + if (!status.ok()) { + return status; + } + } + + if (edit.NumEntries() > 0) { + // if read_only, NumEntries() will be 0 + assert(!read_only); + // writing log number in the manifest means that any log file + // with number strongly less than (log_number + 1) is already + // recovered and should be ignored on next reincarnation. + // Since we already recovered log_number, we want all logs + // with numbers `<= log_number` (includes this one) to be ignored + edit.SetLogNumber(log_number + 1); + // we must mark the next log number as used, even though it's + // not actually used. 
that is because VersionSet assumes + // VersionSet::next_file_number_ always to be strictly greater than any log + // number + versions_->MarkFileNumberUsed(log_number + 1); + status = versions_->LogAndApply(&edit, &mutex_); + } + + return status; +} + +Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + pending_outputs_.insert(meta.number); + Iterator* iter = mem->NewIterator(); + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mem->GetFirstSequenceNumber(); + Log(options_.info_log, "Level-0 table #%lu: started", + (unsigned long) meta.number); + + Status s; + { + mutex_.Unlock(); + s = BuildTable(dbname_, env_, options_, storage_options_, + table_cache_.get(), iter, &meta, internal_comparator_, + newest_snapshot, earliest_seqno_in_memtable, + GetCompressionFlush(options_)); + LogFlush(options_.info_log); + mutex_.Lock(); + } + + Log(options_.info_log, "Level-0 table #%lu: %lu bytes %s", + (unsigned long) meta.number, + (unsigned long) meta.file_size, + s.ToString().c_str()); + delete iter; + + pending_outputs_.erase(meta.number); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. 
+ int level = 0; + if (s.ok() && meta.file_size > 0) { + edit->AddFile(level, meta.number, meta.file_size, + meta.smallest, meta.largest, + meta.smallest_seqno, meta.largest_seqno); + } + + InternalStats::CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.file_size; + stats.files_out_levelnp1 = 1; + internal_stats_.AddCompactionStats(level, stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); + return s; +} + + +Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, + uint64_t* filenumber, + LogBuffer* log_buffer) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + meta.number = versions_->NewFileNumber(); + *filenumber = meta.number; + pending_outputs_.insert(meta.number); + + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mems[0]->GetFirstSequenceNumber(); + Version* base = versions_->current(); + base->Ref(); // it is likely that we do not need this reference + Status s; + { + mutex_.Unlock(); + log_buffer->FlushBufferToLog(); + std::vector memtables; + for (MemTable* m : mems) { + Log(options_.info_log, + "Flushing memtable with log file: %lu\n", + (unsigned long)m->GetLogNumber()); + memtables.push_back(m->NewIterator()); + } + Iterator* iter = NewMergingIterator( + env_, &internal_comparator_, &memtables[0], memtables.size()); + Log(options_.info_log, + "Level-0 flush table #%lu: started", + (unsigned long)meta.number); + + s = BuildTable(dbname_, env_, options_, storage_options_, + table_cache_.get(), iter, &meta, internal_comparator_, + newest_snapshot, earliest_seqno_in_memtable, + GetCompressionFlush(options_)); + LogFlush(options_.info_log); + delete iter; + Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", + (unsigned long) meta.number, + (unsigned long) meta.file_size, + s.ToString().c_str()); + if (!options_.disableDataSync) 
{ + db_directory_->Fsync(); + } + mutex_.Lock(); + } + base->Unref(); + + // re-acquire the most current version + base = versions_->current(); + + // There could be multiple threads writing to its own level-0 file. + // The pending_outputs cannot be cleared here, otherwise this newly + // created file might not be considered as a live-file by another + // compaction thread that is concurrently deleting obselete files. + // The pending_outputs can be cleared only after the new version is + // committed so that other threads can recognize this file as a + // valid one. + // pending_outputs_.erase(meta.number); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + int level = 0; + if (s.ok() && meta.file_size > 0) { + const Slice min_user_key = meta.smallest.user_key(); + const Slice max_user_key = meta.largest.user_key(); + // if we have more than 1 background thread, then we cannot + // insert files directly into higher levels because some other + // threads could be concurrently producing compacted files for + // that key range. 
+ if (base != nullptr && options_.max_background_compactions <= 1 && + options_.compaction_style == kCompactionStyleLevel) { + level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); + } + edit->AddFile(level, meta.number, meta.file_size, + meta.smallest, meta.largest, + meta.smallest_seqno, meta.largest_seqno); + } + + InternalStats::CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.file_size; + internal_stats_.AddCompactionStats(level, stats); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); + return s; +} + +Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, + DeletionState& deletion_state, + LogBuffer* log_buffer) { + mutex_.AssertHeld(); + assert(imm_.size() != 0); + assert(imm_.IsFlushPending()); + + // Save the contents of the earliest memtable as a new Table + uint64_t file_number; + autovector mems; + imm_.PickMemtablesToFlush(&mems); + if (mems.empty()) { + LogToBuffer(log_buffer, "Nothing in memstore to flush"); + return Status::OK(); + } + + // record the logfile_number_ before we release the mutex + // entries mems are (implicitly) sorted in ascending order by their created + // time. We will use the first memtable's `edit` to keep the meta info for + // this flush. + MemTable* m = mems[0]; + VersionEdit* edit = m->GetEdits(); + edit->SetPrevLogNumber(0); + // SetLogNumber(log_num) indicates logs with number smaller than log_num + // will no longer be picked up for recovery. + edit->SetLogNumber(mems.back()->GetNextLogNumber()); + + std::vector logs_to_delete; + for (auto mem : mems) { + logs_to_delete.push_back(mem->GetLogNumber()); + } + + // This will release and re-acquire the mutex. 
+ Status s = WriteLevel0Table(mems, edit, &file_number, log_buffer); + + if (s.ok() && shutting_down_.Acquire_Load()) { + s = Status::ShutdownInProgress( + "Database shutdown started during memtable compaction"); + } + + if (!s.ok()) { + imm_.RollbackMemtableFlush(mems, file_number, &pending_outputs_); + } else { + // Replace immutable memtable with the generated Table + s = imm_.InstallMemtableFlushResults( + mems, versions_.get(), &mutex_, options_.info_log.get(), file_number, + pending_outputs_, &deletion_state.memtables_to_free, + db_directory_.get(), log_buffer); + } + + if (s.ok()) { + InstallSuperVersion(deletion_state); + if (madeProgress) { + *madeProgress = 1; + } + + MaybeScheduleLogDBDeployStats(); + + if (disable_delete_obsolete_files_ == 0) { + // add to deletion state + deletion_state.log_delete_files.insert( + deletion_state.log_delete_files.end(), + logs_to_delete.begin(), + logs_to_delete.end()); + } + } + + if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks && + bg_error_.ok()) { + // if a bad error happened (not ShutdownInProgress) and paranoid_checks is + // true, mark DB read-only + bg_error_ = s; + } + return s; +} + +Status DBImpl::CompactRange(const Slice* begin, + const Slice* end, + bool reduce_level, + int target_level) { + Status s = FlushMemTable(FlushOptions()); + if (!s.ok()) { + LogFlush(options_.info_log); + return s; + } + + int max_level_with_files = 1; + { + MutexLock l(&mutex_); + Version* base = versions_->current(); + for (int level = 1; level < NumberLevels(); level++) { + if (base->OverlapInLevel(level, begin, end)) { + max_level_with_files = level; + } + } + } + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is unversal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + s = RunManualCompaction(level, level, begin, 
end); + } else { + s = RunManualCompaction(level, level + 1, begin, end); + } + if (!s.ok()) { + LogFlush(options_.info_log); + return s; + } + } + + if (reduce_level) { + s = ReFitLevel(max_level_with_files, target_level); + } + LogFlush(options_.info_log); + + return s; +} + +// return the same level if it cannot be moved +int DBImpl::FindMinimumEmptyLevelFitting(int level) { + mutex_.AssertHeld(); + Version* current = versions_->current(); + int minimum_level = level; + for (int i = level - 1; i > 0; --i) { + // stop if level i is not empty + if (current->NumLevelFiles(i) > 0) break; + // stop if level i is too small (cannot fit the level files) + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; + + minimum_level = i; + } + return minimum_level; +} + +Status DBImpl::ReFitLevel(int level, int target_level) { + assert(level < NumberLevels()); + + SuperVersion* superversion_to_free = nullptr; + SuperVersion* new_superversion = new SuperVersion(); + + mutex_.Lock(); + + // only allow one thread refitting + if (refitting_level_) { + mutex_.Unlock(); + Log(options_.info_log, "ReFitLevel: another thread is refitting"); + delete new_superversion; + return Status::NotSupported("another thread is refitting"); + } + refitting_level_ = true; + + // wait for all background threads to stop + bg_work_gate_closed_ = true; + while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { + Log(options_.info_log, + "RefitLevel: waiting for background threads to stop: %d %d", + bg_compaction_scheduled_, bg_flush_scheduled_); + bg_cv_.Wait(); + } + + // move to a smaller level + int to_level = target_level; + if (target_level < 0) { + to_level = FindMinimumEmptyLevelFitting(level); + } + + assert(to_level <= level); + + Status status; + if (to_level < level) { + Log(options_.info_log, "Before refitting:\n%s", + versions_->current()->DebugString().data()); + + VersionEdit edit; + for (const auto& f : versions_->current()->files_[level]) { + 
edit.DeleteFile(level, f->number); + edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno); + } + Log(options_.info_log, "Apply version edit:\n%s", + edit.DebugString().data()); + + status = versions_->LogAndApply(&edit, &mutex_, db_directory_.get()); + superversion_to_free = InstallSuperVersion(new_superversion); + new_superversion = nullptr; + + Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data()); + + if (status.ok()) { + Log(options_.info_log, "After refitting:\n%s", + versions_->current()->DebugString().data()); + } + } + + refitting_level_ = false; + bg_work_gate_closed_ = false; + + mutex_.Unlock(); + delete superversion_to_free; + delete new_superversion; + return status; +} + +int DBImpl::NumberLevels() { + return options_.num_levels; +} + +int DBImpl::MaxMemCompactionLevel() { + return options_.max_mem_compaction_level; +} + +int DBImpl::Level0StopWriteTrigger() { + return options_.level0_stop_writes_trigger; +} + +uint64_t DBImpl::CurrentVersionNumber() const { + return super_version_number_.load(); +} + +Status DBImpl::Flush(const FlushOptions& options) { + return FlushMemTable(options); +} + +SequenceNumber DBImpl::GetLatestSequenceNumber() const { + return versions_->LastSequence(); +} + +Status DBImpl::GetUpdatesSince( + SequenceNumber seq, unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) { + + RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS); + if (seq > versions_->LastSequence()) { + return Status::NotFound( + "Requested sequence not yet written in the db"); + } + // Get all sorted Wal Files. + // Do binary search and open files and find the seq number. 
+ + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, + read_options, storage_options_, + seq, std::move(wal_files), this)); + return (*iter)->status(); +} + +Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target) { + long start = 0; // signed to avoid overflow when target is < first file. + long end = static_cast(all_logs.size()) - 1; + // Binary Search. avoid opening all files. + while (end >= start) { + long mid = start + (end - start) / 2; // Avoid overflow. + SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + if (current_seq_num == target) { + end = mid; + break; + } else if (current_seq_num < target) { + start = mid + 1; + } else { + end = mid - 1; + } + } + size_t start_index = std::max(0l, end); // end could be -ve. + // The last wal file is always included + all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); + return Status::OK(); +} + +bool DBImpl::CheckWalFileExistsAndEmpty(const WalFileType type, + const uint64_t number) { + const std::string fname = (type == kAliveLogFile) ? + LogFileName(options_.wal_dir, number) : + ArchivedLogFileName(options_.wal_dir, number); + uint64_t file_size; + Status s = env_->GetFileSize(fname, &file_size); + return (s.ok() && (file_size == 0)); +} + +Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, + WriteBatch* const result) { + + if (type == kAliveLogFile) { + std::string fname = LogFileName(options_.wal_dir, number); + Status status = ReadFirstLine(fname, result); + if (status.ok() || env_->FileExists(fname)) { + // return OK or any error that is not caused non-existing file + return status; + } + + // check if the file got moved to archive. 
+ std::string archived_file = + ArchivedLogFileName(options_.wal_dir, number); + Status s = ReadFirstLine(archived_file, result); + if (s.ok() || env_->FileExists(archived_file)) { + return s; + } + return Status::NotFound("Log File has been deleted: " + archived_file); + } else if (type == kArchivedLogFile) { + std::string fname = ArchivedLogFileName(options_.wal_dir, number); + Status status = ReadFirstLine(fname, result); + return status; + } + return Status::NotSupported("File Type Not Known: " + std::to_string(type)); +} + +Status DBImpl::ReadFirstLine(const std::string& fname, + WriteBatch* const batch) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + + Status* status; + bool ignore_error; // true if options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "%s%s: dropping %d bytes; %s", + (this->ignore_error ? "(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status->ok()) { + // only keep the first error + *this->status = s; + } + } + }; + + unique_ptr file; + Status status = env_->NewSequentialFile(fname, &file, storage_options_); + + if (!status.ok()) { + return status; + } + + + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = &status; + reporter.ignore_error = !options_.paranoid_checks; + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + std::string scratch; + Slice record; + + if (reader.ReadRecord(&record, &scratch) && + (status.ok() || !options_.paranoid_checks)) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + // TODO read record's till the first no corrupt entry? 
+ } else { + WriteBatchInternal::SetContents(batch, record); + return Status::OK(); + } + } + + // ReadRecord returns false on EOF, which is deemed as OK() by Reader + if (status.ok()) { + status = Status::Corruption("eof reached"); + } + return status; +} + +struct CompareLogByPointer { + bool operator() (const unique_ptr& a, + const unique_ptr& b) { + LogFileImpl* a_impl = dynamic_cast(a.get()); + LogFileImpl* b_impl = dynamic_cast(b.get()); + return *a_impl < *b_impl; + } +}; + +Status DBImpl::GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(all_files.size()); + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile){ + + WriteBatch batch; + Status s = ReadFirstRecord(log_type, number, &batch); + if (!s.ok()) { + if (CheckWalFileExistsAndEmpty(log_type, number)) { + continue; + } + return s; + } + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + if (!s.ok()) { + return s; + } + + log_files.push_back(std::move(unique_ptr(new LogFileImpl( + number, log_type, WriteBatchInternal::Sequence(&batch), size_bytes)))); + } + } + CompareLogByPointer compare_log_files; + std::sort(log_files.begin(), log_files.end(), compare_log_files); + return status; +} + +Status DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); + + InternalKey begin_storage, end_storage; + + ManualCompaction manual; + manual.input_level = input_level; + manual.output_level = output_level; + manual.done = false; + manual.in_progress = false; + // For universal compaction, we enforce every manual compaction to compact + // all files. 
+ if (begin == nullptr || + options_.compaction_style == kCompactionStyleUniversal) { + manual.begin = nullptr; + } else { + begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); + manual.begin = &begin_storage; + } + if (end == nullptr || + options_.compaction_style == kCompactionStyleUniversal) { + manual.end = nullptr; + } else { + end_storage = InternalKey(*end, 0, static_cast(0)); + manual.end = &end_storage; + } + + MutexLock l(&mutex_); + + // When a manual compaction arrives, temporarily disable scheduling of + // non-manual compactions and wait until the number of scheduled compaction + // jobs drops to zero. This is needed to ensure that this manual compaction + // can compact any range of keys/files. + // + // bg_manual_only_ is non-zero when at least one thread is inside + // RunManualCompaction(), i.e. during that time no other compaction will + // get scheduled (see MaybeScheduleFlushOrCompaction). + // + // Note that the following loop doesn't stop more that one thread calling + // RunManualCompaction() from getting to the second while loop below. + // However, only one of them will actually schedule compaction, while + // others will wait on a condition variable until it completes. 
+ + ++bg_manual_only_; + while (bg_compaction_scheduled_ > 0) { + Log(options_.info_log, + "Manual compaction waiting for all other scheduled background " + "compactions to finish"); + bg_cv_.Wait(); + } + + Log(options_.info_log, "Manual compaction starting"); + + while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { + assert(bg_manual_only_ > 0); + if (manual_compaction_ != nullptr) { + // Running either this or some other manual compaction + bg_cv_.Wait(); + } else { + manual_compaction_ = &manual; + MaybeScheduleFlushOrCompaction(); + } + } + + assert(!manual.in_progress); + assert(bg_manual_only_ > 0); + --bg_manual_only_; + return manual.status; +} + +Status DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? level + : level + 1; + return RunManualCompaction(level, output_level, begin, end); +} + +Status DBImpl::FlushMemTable(const FlushOptions& options) { + // nullptr batch means just wait for earlier writes to be done + Status s = Write(WriteOptions(), nullptr); + if (s.ok() && options.wait) { + // Wait until the compaction completes + s = WaitForFlushMemTable(); + } + return s; +} + +Status DBImpl::WaitForFlushMemTable() { + Status s; + // Wait until the compaction completes + MutexLock l(&mutex_); + while (imm_.size() > 0 && bg_error_.ok()) { + bg_cv_.Wait(); + } + if (imm_.size() != 0) { + s = bg_error_; + } + return s; +} + +Status DBImpl::TEST_FlushMemTable(bool wait) { + FlushOptions fo; + fo.wait = wait; + return FlushMemTable(fo); +} + +Status DBImpl::TEST_WaitForFlushMemTable() { + return WaitForFlushMemTable(); +} + +Status DBImpl::TEST_WaitForCompact() { + // Wait until the compaction completes + + // TODO: a bug here. This function actually does not necessarily + // wait for compact. It actually waits for scheduled compaction + // OR flush to finish. 
+ + MutexLock l(&mutex_); + while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && + bg_error_.ok()) { + bg_cv_.Wait(); + } + return bg_error_; +} + +void DBImpl::MaybeScheduleFlushOrCompaction() { + mutex_.AssertHeld(); + bg_schedule_needed_ = false; + if (bg_work_gate_closed_) { + // gate closed for backgrond work + } else if (shutting_down_.Acquire_Load()) { + // DB is being deleted; no more background compactions + } else { + bool is_flush_pending = imm_.IsFlushPending(); + if (is_flush_pending) { + if (bg_flush_scheduled_ < options_.max_background_flushes) { + // memtable flush needed + bg_flush_scheduled_++; + env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); + } else if (options_.max_background_flushes > 0) { + bg_schedule_needed_ = true; + } + } + + // Schedule BGWorkCompaction if there's a compaction pending (or a memtable + // flush, but the HIGH pool is not enabled). Do it only if + // max_background_compactions hasn't been reached and, in case + // bg_manual_only_ > 0, if it's a manual compaction. 
+ if ((manual_compaction_ || + versions_->current()->NeedsCompaction() || + (is_flush_pending && (options_.max_background_flushes <= 0))) && + (!bg_manual_only_ || manual_compaction_)) { + if (bg_compaction_scheduled_ < options_.max_background_compactions) { + bg_compaction_scheduled_++; + env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); + } else { + bg_schedule_needed_ = true; + } + } + } +} + +void DBImpl::BGWorkFlush(void* db) { + reinterpret_cast(db)->BackgroundCallFlush(); +} + +void DBImpl::BGWorkCompaction(void* db) { + reinterpret_cast(db)->BackgroundCallCompaction(); +} + +Status DBImpl::BackgroundFlush(bool* madeProgress, + DeletionState& deletion_state, + LogBuffer* log_buffer) { + Status stat; + while (stat.ok() && imm_.IsFlushPending()) { + LogToBuffer(log_buffer, + "BackgroundCallFlush doing FlushMemTableToOutputFile, " + "flush slots available %d", + options_.max_background_flushes - bg_flush_scheduled_); + stat = FlushMemTableToOutputFile(madeProgress, deletion_state, log_buffer); + } + return stat; +} + +void DBImpl::BackgroundCallFlush() { + bool madeProgress = false; + DeletionState deletion_state(true); + assert(bg_flush_scheduled_); + + LogBuffer log_buffer(INFO, options_.info_log.get()); + { + MutexLock l(&mutex_); + + Status s; + if (!shutting_down_.Acquire_Load()) { + s = BackgroundFlush(&madeProgress, deletion_state, &log_buffer); + if (!s.ok()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. 
+ uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount(); + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + Log(options_.info_log, + "Waiting after background flush error: %s" + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); + log_buffer.FlushBufferToLog(); + LogFlush(options_.info_log); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + } + + // If !s.ok(), this means that Flush failed. In that case, we want + // to delete all obsolete files and we force FindObsoleteFiles() + FindObsoleteFiles(deletion_state, !s.ok()); + // delete unnecessary files if any, this is done outside the mutex + if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + // Have to flush the info logs before bg_flush_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); + } + mutex_.Lock(); + } + + bg_flush_scheduled_--; + // Any time the mutex is released After finding the work to do, another + // thread might execute MaybeScheduleFlushOrCompaction(). It is possible + // that there is a pending job but it is not scheduled because of the + // max thread limit. + if (madeProgress || bg_schedule_needed_) { + MaybeScheduleFlushOrCompaction(); + } + bg_cv_.SignalAll(); + // IMPORTANT: there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. 
+ } +} + + +void DBImpl::TEST_PurgeObsoleteteWAL() { + PurgeObsoleteWALFiles(); +} + +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + +void DBImpl::BackgroundCallCompaction() { + bool madeProgress = false; + DeletionState deletion_state(true); + + MaybeDumpStats(); + LogBuffer log_buffer(INFO, options_.info_log.get()); + { + MutexLock l(&mutex_); + // Log(options_.info_log, "XXX BG Thread %llx process new work item", + // pthread_self()); + assert(bg_compaction_scheduled_); + Status s; + if (!shutting_down_.Acquire_Load()) { + s = BackgroundCompaction(&madeProgress, deletion_state, &log_buffer); + if (!s.ok()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. + uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount(); + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + log_buffer.FlushBufferToLog(); + Log(options_.info_log, + "Waiting after background compaction error: %s, " + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); + LogFlush(options_.info_log); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + } + + // If !s.ok(), this means that Compaction failed. In that case, we want + // to delete all obsolete files we might have created and we force + // FindObsoleteFiles(). This is because deletion_state does not catch + // all created files if compaction failed. 
+ FindObsoleteFiles(deletion_state, !s.ok()); + + // delete unnecessary files if any, this is done outside the mutex + if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + // Have to flush the info logs before bg_compaction_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); + } + mutex_.Lock(); + } + + bg_compaction_scheduled_--; + + MaybeScheduleLogDBDeployStats(); + + // Previous compaction may have produced too many files in a level, + // So reschedule another compaction if we made progress in the + // last compaction. + // + // Also, any time the mutex is released After finding the work to do, + // another thread might execute MaybeScheduleFlushOrCompaction(). It is + // possible that there is a pending job but it is not scheduled because of + // the max thread limit. + if (madeProgress || bg_schedule_needed_) { + MaybeScheduleFlushOrCompaction(); + } + bg_cv_.SignalAll(); + // IMPORTANT: there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. 
+ } +} + +Status DBImpl::BackgroundCompaction(bool* madeProgress, + DeletionState& deletion_state, + LogBuffer* log_buffer) { + *madeProgress = false; + mutex_.AssertHeld(); + + bool is_manual = (manual_compaction_ != nullptr) && + (manual_compaction_->in_progress == false); + if (is_manual) { + // another thread cannot pick up the same work + manual_compaction_->in_progress = true; + } + + // TODO: remove memtable flush from formal compaction + while (imm_.IsFlushPending()) { + LogToBuffer(log_buffer, + "BackgroundCompaction doing FlushMemTableToOutputFile, " + "compaction slots " + "available %d", + options_.max_background_compactions - bg_compaction_scheduled_); + Status stat = FlushMemTableToOutputFile(madeProgress, deletion_state, + log_buffer); + if (!stat.ok()) { + if (is_manual) { + manual_compaction_->status = stat; + manual_compaction_->done = true; + manual_compaction_->in_progress = false; + manual_compaction_ = nullptr; + } + return stat; + } + } + + unique_ptr c; + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; + if (is_manual) { + ManualCompaction* m = manual_compaction_; + assert(m->in_progress); + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { + m->done = true; + } + LogToBuffer( + log_buffer, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, m->output_level, + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + ((m->done || manual_end == nullptr) + ? 
"(end)" + : manual_end->DebugString().c_str())); + } else if (!options_.disable_auto_compactions) { + c.reset(versions_->PickCompaction(log_buffer)); + } + + Status status; + if (!c) { + // Nothing to do + LogToBuffer(log_buffer, "Compaction nothing to do"); + } else if (!is_manual && c->IsTrivialMove()) { + // Move file to next level + assert(c->num_input_files(0) == 1); + FileMetaData* f = c->input(0, 0); + c->edit()->DeleteFile(c->level(), f->number); + c->edit()->AddFile(c->level() + 1, f->number, f->file_size, + f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno); + status = versions_->LogAndApply(c->edit(), &mutex_, db_directory_.get()); + InstallSuperVersion(deletion_state); + Version::LevelSummaryStorage tmp; + LogToBuffer(log_buffer, "Moved #%lld to level-%d %lld bytes %s: %s\n", + static_cast(f->number), c->level() + 1, + static_cast(f->file_size), + status.ToString().c_str(), + versions_->current()->LevelSummary(&tmp)); + versions_->ReleaseCompactionFiles(c.get(), status); + *madeProgress = true; + } else { + MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. + CompactionState* compact = new CompactionState(c.get()); + status = DoCompactionWork(compact, deletion_state, log_buffer); + CleanupCompaction(compact, status); + versions_->ReleaseCompactionFiles(c.get(), status); + c->ReleaseInputs(); + *madeProgress = true; + } + c.reset(); + + if (status.ok()) { + // Done + } else if (shutting_down_.Acquire_Load()) { + // Ignore compaction errors found during shutting down + } else { + Log(WARN, options_.info_log, "Compaction error: %s", + status.ToString().c_str()); + if (options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } + } + + if (is_manual) { + ManualCompaction* m = manual_compaction_; + if (!status.ok()) { + m->status = status; + m->done = true; + } + // For universal compaction: + // Because universal compaction always happens at level 0, so one + // compaction will pick up all overlapped files. 
No files will be + // filtered out due to size limit and left for a successive compaction. + // So we can safely conclude the current compaction. + // + // Also note that, if we don't stop here, then the current compaction + // writes a new file back to level 0, which will be used in successive + // compaction. Hence the manual compaction will never finish. + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { + m->done = true; + } + if (!m->done) { + // We only compacted part of the requested range. Update *m + // to the range that is left to be compacted. + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); + m->tmp_storage = *manual_end; + m->begin = &m->tmp_storage; + } + m->in_progress = false; // not being processed anymore + manual_compaction_ = nullptr; + } + return status; +} + +void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { + mutex_.AssertHeld(); + if (compact->builder != nullptr) { + // May happen if we get a shutdown call in the middle of compaction + compact->builder->Abandon(); + compact->builder.reset(); + } else { + assert(compact->outfile == nullptr); + } + for (size_t i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + pending_outputs_.erase(out.number); + + // If this file was inserted into the table cache then remove + // them here because this compaction was not committed. + if (!status.ok()) { + table_cache_->Evict(out.number); + } + } + delete compact; +} + +// Allocate the file numbers for the output file. We allocate as +// many output file numbers as there are files in level+1 (at least one) +// Insert them into pending_outputs so that they do not get deleted. 
+void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { + mutex_.AssertHeld(); + assert(compact != nullptr); + assert(compact->builder == nullptr); + int filesNeeded = compact->compaction->num_input_files(1); + for (int i = 0; i < std::max(filesNeeded, 1); i++) { + uint64_t file_number = versions_->NewFileNumber(); + pending_outputs_.insert(file_number); + compact->allocated_file_numbers.push_back(file_number); + } +} + +// Frees up unused file number. +void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) { + mutex_.AssertHeld(); + for (const auto file_number : compact->allocated_file_numbers) { + pending_outputs_.erase(file_number); + // Log(options_.info_log, "XXX releasing unused file num %d", file_number); + } +} + +Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { + assert(compact != nullptr); + assert(compact->builder == nullptr); + uint64_t file_number; + // If we have not yet exhausted the pre-allocated file numbers, + // then use the one from the front. Otherwise, we have to acquire + // the heavyweight lock and allocate a new file number. + if (!compact->allocated_file_numbers.empty()) { + file_number = compact->allocated_file_numbers.front(); + compact->allocated_file_numbers.pop_front(); + } else { + mutex_.Lock(); + file_number = versions_->NewFileNumber(); + pending_outputs_.insert(file_number); + mutex_.Unlock(); + } + CompactionState::Output out; + out.number = file_number; + out.smallest.Clear(); + out.largest.Clear(); + out.smallest_seqno = out.largest_seqno = 0; + compact->outputs.push_back(out); + + // Make the output file + std::string fname = TableFileName(dbname_, file_number); + Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_); + + if (s.ok()) { + // Over-estimate slightly so we don't end up just barely crossing + // the threshold. 
+ compact->outfile->SetPreallocationBlockSize( + 1.1 * versions_->MaxFileSizeForLevel(compact->compaction->output_level())); + + CompressionType compression_type = GetCompressionType( + options_, compact->compaction->output_level(), + compact->compaction->enable_compression()); + + compact->builder.reset(NewTableBuilder(options_, internal_comparator_, + compact->outfile.get(), + compression_type)); + } + LogFlush(options_.info_log); + return s; +} + +Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, + Iterator* input) { + assert(compact != nullptr); + assert(compact->outfile); + assert(compact->builder != nullptr); + + const uint64_t output_number = compact->current_output()->number; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact->builder->NumEntries(); + if (s.ok()) { + s = compact->builder->Finish(); + } else { + compact->builder->Abandon(); + } + const uint64_t current_bytes = compact->builder->FileSize(); + compact->current_output()->file_size = current_bytes; + compact->total_bytes += current_bytes; + compact->builder.reset(); + + // Finish and check for file errors + if (s.ok() && !options_.disableDataSync) { + if (options_.use_fsync) { + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS, false); + s = compact->outfile->Fsync(); + } else { + StopWatch sw(env_, options_.statistics.get(), + COMPACTION_OUTFILE_SYNC_MICROS, false); + s = compact->outfile->Sync(); + } + } + if (s.ok()) { + s = compact->outfile->Close(); + } + compact->outfile.reset(); + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + FileMetaData meta(output_number, current_bytes); + Iterator* iter = table_cache_->NewIterator(ReadOptions(), storage_options_, + internal_comparator_, meta); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(options_.info_log, + "Generated table #%lu: %lu keys, %lu bytes", + (unsigned long) 
output_number, + (unsigned long) current_entries, + (unsigned long) current_bytes); + } + } + return s; +} + + +Status DBImpl::InstallCompactionResults(CompactionState* compact, + LogBuffer* log_buffer) { + mutex_.AssertHeld(); + + // paranoia: verify that the files that we started with + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact. + if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { + Log(options_.info_log, "Compaction %d@%d + %d@%d files aborted", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->level() + 1); + return Status::Corruption("Compaction input files inconsistent"); + } + + LogToBuffer( + log_buffer, "Compacted %d@%d + %d@%d files => %lld bytes", + compact->compaction->num_input_files(0), compact->compaction->level(), + compact->compaction->num_input_files(1), compact->compaction->level() + 1, + static_cast(compact->total_bytes)); + + // Add compaction outputs + compact->compaction->AddInputDeletions(compact->compaction->edit()); + for (size_t i = 0; i < compact->outputs.size(); i++) { + const CompactionState::Output& out = compact->outputs[i]; + compact->compaction->edit()->AddFile( + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); + } + return versions_->LogAndApply(compact->compaction->edit(), &mutex_, + db_directory_.get()); +} + +// +// Given a sequence number, return the sequence number of the +// earliest snapshot that this sequence number is visible in. +// The snapshots themselves are arranged in ascending order of +// sequence numbers. +// Employ a sequential search because the total number of +// snapshots are typically small. 
+inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( + SequenceNumber in, std::vector& snapshots, + SequenceNumber* prev_snapshot) { + SequenceNumber prev __attribute__((unused)) = 0; + for (const auto cur : snapshots) { + assert(prev <= cur); + if (cur >= in) { + *prev_snapshot = prev; + return cur; + } + prev = cur; // assignment + assert(prev); + } + Log(options_.info_log, + "Looking for seqid %lu but maxseqid is %lu", + (unsigned long)in, + (unsigned long)snapshots[snapshots.size()-1]); + assert(0); + return 0; +} + +Status DBImpl::ProcessKeyValueCompaction( + SequenceNumber visible_at_tip, + SequenceNumber earliest_snapshot, + SequenceNumber latest_snapshot, + DeletionState& deletion_state, + bool bottommost_level, + int64_t& imm_micros, + Iterator* input, + CompactionState* compact, + bool is_compaction_v2, + LogBuffer* log_buffer) { + size_t combined_idx = 0; + Status status; + std::string compaction_filter_value; + ParsedInternalKey ikey; + IterKey current_user_key; + bool has_current_user_key = false; + IterKey delete_key; + SequenceNumber last_sequence_for_key __attribute__((unused)) = + kMaxSequenceNumber; + SequenceNumber visible_in_snapshot = kMaxSequenceNumber; + MergeHelper merge(user_comparator(), options_.merge_operator.get(), + options_.info_log.get(), + options_.min_partial_merge_operands, + false /* internal key corruption is expected */); + auto compaction_filter = options_.compaction_filter; + std::unique_ptr compaction_filter_from_factory = nullptr; + if (!compaction_filter) { + auto context = compact->GetFilterContextV1(); + compaction_filter_from_factory = + options_.compaction_filter_factory->CreateCompactionFilter(context); + compaction_filter = compaction_filter_from_factory.get(); + } + + for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { + // Prioritize immutable compaction work + // TODO: remove memtable flush from normal compaction work + if (imm_.imm_flush_needed.NoBarrier_Load() != nullptr) { + const uint64_t 
imm_start = env_->NowMicros(); + LogFlush(options_.info_log); + mutex_.Lock(); + if (imm_.IsFlushPending()) { + FlushMemTableToOutputFile(nullptr, deletion_state, log_buffer); + bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + } + mutex_.Unlock(); + log_buffer->FlushBufferToLog(); + imm_micros += (env_->NowMicros() - imm_start); + } + + Slice key; + Slice value; + // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch. + // This prefix batch should contain results after calling + // compaction_filter_v2. + // + // If is_compaction_v2 is off, this function will go through all the + // kv-pairs in input. + if (!is_compaction_v2) { + key = input->key(); + value = input->value(); + } else { + if (combined_idx >= compact->combined_key_buf_.size()) { + break; + } + assert(combined_idx < compact->combined_key_buf_.size()); + key = compact->combined_key_buf_[combined_idx]; + value = compact->combined_value_buf_[combined_idx]; + + ++combined_idx; + } + + if (compact->compaction->ShouldStopBefore(key) && + compact->builder != nullptr) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. + bool drop = false; + bool current_entry_is_merging = false; + if (!ParseInternalKey(key, &ikey)) { + // Do not hide error keys + // TODO: error key stays in db forever? Figure out the intention/rationale + // v10 error v8 : we cannot hide v8 even though it's pretty obvious. 
+ current_user_key.Clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + } else { + if (!has_current_user_key || + user_comparator()->Compare(ikey.user_key, + current_user_key.GetKey()) != 0) { + // First occurrence of this user key + current_user_key.SetUserKey(ikey.user_key); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + // apply the compaction filter to the first occurrence of the user key + if (compaction_filter && !is_compaction_v2 && + ikey.type == kTypeValue && + (visible_at_tip || ikey.sequence > latest_snapshot)) { + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. + // If the return value of the compaction filter is true, replace + // the entry with a delete marker. + bool value_changed = false; + compaction_filter_value.clear(); + bool to_delete = + compaction_filter->Filter(compact->compaction->level(), + ikey.user_key, value, + &compaction_filter_value, + &value_changed); + if (to_delete) { + // make a copy of the original key and convert it to a delete + delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence, + kTypeDeletion); + // anchor the key again + key = delete_key.GetKey(); + // needed because ikey is backed by key + ParseInternalKey(key, &ikey); + // no value associated with delete + value.clear(); + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER); + } else if (value_changed) { + value = compaction_filter_value; + } + } + + } + + // If there are no snapshots, then this kv affect visibility at tip. + // Otherwise, search though all existing snapshots to find + // the earlist snapshot that is affected by this kv. + SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot + SequenceNumber visible = visible_at_tip ? 
+ visible_at_tip : + findEarliestVisibleSnapshot(ikey.sequence, + compact->existing_snapshots, + &prev_snapshot); + + if (visible_in_snapshot == visible) { + // If the earliest snapshot is which this key is visible in + // is the same as the visibily of a previous instance of the + // same key, then this kv is not visible in any snapshot. + // Hidden by an newer entry for same user key + // TODO: why not > ? + assert(last_sequence_for_key >= ikey.sequence); + drop = true; // (A) + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_NEWER_ENTRY); + } else if (ikey.type == kTypeDeletion && + ikey.sequence <= earliest_snapshot && + compact->compaction->IsBaseLevelForKey(ikey.user_key)) { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + drop = true; + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_OBSOLETE); + } else if (ikey.type == kTypeMerge) { + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. Turn out this + // logic could also be nicely re-used for memtable flush purge + // optimization in BuildTable. 
+ int steps = 0; + merge.MergeUntil(input, prev_snapshot, bottommost_level, + options_.statistics.get(), &steps); + // Skip the Merge ops + combined_idx = combined_idx - 1 + steps; + + current_entry_is_merging = true; + if (merge.IsSuccess()) { + // Successfully found Put/Delete/(end-of-key-range) while merging + // Get the merge result + key = merge.key(); + ParseInternalKey(key, &ikey); + value = merge.value(); + } else { + // Did not find a Put/Delete/(end-of-key-range) while merging + // We now have some stack of merge operands to write out. + // NOTE: key,value, and ikey are now referring to old entries. + // These will be correctly set below. + assert(!merge.keys().empty()); + assert(merge.keys().size() == merge.values().size()); + + // Hack to make sure last_sequence_for_key is correct + ParseInternalKey(merge.keys().front(), &ikey); + } + } + + last_sequence_for_key = ikey.sequence; + visible_in_snapshot = visible; + } +#if 0 + Log(options_.info_log, + " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " + "%d smallest_snapshot: %d level: %d bottommost %d", + ikey.user_key.ToString().c_str(), + (int)ikey.sequence, ikey.type, kTypeValue, drop, + compact->compaction->IsBaseLevelForKey(ikey.user_key), + (int)last_sequence_for_key, (int)earliest_snapshot, + compact->compaction->level(), bottommost_level); +#endif + + if (!drop) { + // We may write a single key (e.g.: for Put/Delete or successful merge). + // Or we may instead have to write a sequence/list of keys. 
+ // We have to write a sequence iff we have an unsuccessful merge + bool has_merge_list = current_entry_is_merging && !merge.IsSuccess(); + const std::deque* keys = nullptr; + const std::deque* values = nullptr; + std::deque::const_reverse_iterator key_iter; + std::deque::const_reverse_iterator value_iter; + if (has_merge_list) { + keys = &merge.keys(); + values = &merge.values(); + key_iter = keys->rbegin(); // The back (*rbegin()) is the first key + value_iter = values->rbegin(); + + key = Slice(*key_iter); + value = Slice(*value_iter); + } + + // If we have a list of keys to write, traverse the list. + // If we have a single key to write, simply write that key. + while (true) { + // Invariant: key,value,ikey will always be the next entry to write + char* kptr = (char*)key.data(); + std::string kstr; + + // Zeroing out the sequence number leads to better compression. + // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // then we can squash the seqno to zero. 
+ if (bottommost_level && ikey.sequence < earliest_snapshot && + ikey.type != kTypeMerge) { + assert(ikey.type != kTypeDeletion); + // make a copy because updating in place would cause problems + // with the priority queue that is managing the input key iterator + kstr.assign(key.data(), key.size()); + kptr = (char *)kstr.c_str(); + UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type); + } + + Slice newkey(kptr, key.size()); + assert((key.clear(), 1)); // we do not need 'key' anymore + + // Open output file if necessary + if (compact->builder == nullptr) { + status = OpenCompactionOutputFile(compact); + if (!status.ok()) { + break; + } + } + + SequenceNumber seqno = GetInternalKeySeqno(newkey); + if (compact->builder->NumEntries() == 0) { + compact->current_output()->smallest.DecodeFrom(newkey); + compact->current_output()->smallest_seqno = seqno; + } else { + compact->current_output()->smallest_seqno = + std::min(compact->current_output()->smallest_seqno, seqno); + } + compact->current_output()->largest.DecodeFrom(newkey); + compact->builder->Add(newkey, value); + compact->current_output()->largest_seqno = + std::max(compact->current_output()->largest_seqno, seqno); + + // Close output file if it is big enough + if (compact->builder->FileSize() >= + compact->compaction->MaxOutputFileSize()) { + status = FinishCompactionOutputFile(compact, input); + if (!status.ok()) { + break; + } + } + + // If we have a list of entries, move to next element + // If we only had one entry, then break the loop. + if (has_merge_list) { + ++key_iter; + ++value_iter; + + // If at end of list + if (key_iter == keys->rend() || value_iter == values->rend()) { + // Sanity Check: if one ends, then both end + assert(key_iter == keys->rend() && value_iter == values->rend()); + break; + } + + // Otherwise not at end of list. Update key, value, and ikey. 
+ key = Slice(*key_iter); + value = Slice(*value_iter); + ParseInternalKey(key, &ikey); + + } else{ + // Only had one item to begin with (Put/Delete) + break; + } + } + } + + // MergeUntil has moved input to the next entry + if (!current_entry_is_merging) { + input->Next(); + } + } + + return status; +} + +void DBImpl::CallCompactionFilterV2(CompactionState* compact, + CompactionFilterV2* compaction_filter_v2) { + if (compact == nullptr || compaction_filter_v2 == nullptr) { + return; + } + + std::vector user_key_buf; + for (const auto& key : compact->ikey_buf_) { + user_key_buf.emplace_back(key.user_key); + } + + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. + // If the return value of the compaction filter is true, replace + // the entry with a delete marker. + compact->to_delete_buf_ = compaction_filter_v2->Filter( + compact->compaction->level(), + user_key_buf, compact->existing_value_buf_, + &compact->new_value_buf_, + &compact->value_changed_buf_); + + // new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all + // kv-pairs in this compaction run needs to be deleted. 
+ assert(compact->to_delete_buf_.size() == + compact->key_buf_.size()); + assert(compact->to_delete_buf_.size() == + compact->existing_value_buf_.size()); + assert(compact->to_delete_buf_.size() == + compact->value_changed_buf_.size()); + + int new_value_idx = 0; + for (unsigned int i = 0; i < compact->to_delete_buf_.size(); ++i) { + if (compact->to_delete_buf_[i]) { + // update the string buffer directly + // the Slice buffer points to the updated buffer + UpdateInternalKey(&compact->key_str_buf_[i][0], + compact->key_str_buf_[i].size(), + compact->ikey_buf_[i].sequence, + kTypeDeletion); + + // no value associated with delete + compact->existing_value_buf_[i].clear(); + RecordTick(options_.statistics.get(), COMPACTION_KEY_DROP_USER); + } else if (compact->value_changed_buf_[i]) { + compact->existing_value_buf_[i] = + Slice(compact->new_value_buf_[new_value_idx++]); + } + } // for +} + +Status DBImpl::DoCompactionWork(CompactionState* compact, + DeletionState& deletion_state, + LogBuffer* log_buffer) { + assert(compact); + compact->CleanupBatchBuffer(); + compact->CleanupMergedBuffer(); + bool prefix_initialized = false; + + int64_t imm_micros = 0; // Micros spent doing imm_ compactions + LogToBuffer(log_buffer, + "Compacting %d@%d + %d@%d files, score %.2f slots available %d", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->output_level(), compact->compaction->score(), + options_.max_background_compactions - bg_compaction_scheduled_); + char scratch[2345]; + compact->compaction->Summary(scratch, sizeof(scratch)); + LogToBuffer(log_buffer, "Compaction start summary: %s\n", scratch); + + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); + assert(compact->builder == nullptr); + assert(!compact->outfile); + + SequenceNumber visible_at_tip = 0; + SequenceNumber earliest_snapshot; + SequenceNumber latest_snapshot = 0; + 
snapshots_.getAll(compact->existing_snapshots); + if (compact->existing_snapshots.size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip = versions_->LastSequence(); + earliest_snapshot = visible_at_tip; + } else { + latest_snapshot = compact->existing_snapshots.back(); + // Add the current seqno as the 'latest' virtual + // snapshot to the end of this list. + compact->existing_snapshots.push_back(versions_->LastSequence()); + earliest_snapshot = compact->existing_snapshots[0]; + } + + // Is this compaction producing files at the bottommost level? + bool bottommost_level = compact->compaction->BottomMostLevel(); + + // Allocate the output file numbers before we release the lock + AllocateCompactionOutputFileNumbers(compact); + + // Release mutex while we're actually doing the compaction work + mutex_.Unlock(); + log_buffer->FlushBufferToLog(); + + const uint64_t start_micros = env_->NowMicros(); + unique_ptr input(versions_->MakeInputIterator(compact->compaction)); + input->SeekToFirst(); + shared_ptr backup_input( + versions_->MakeInputIterator(compact->compaction)); + backup_input->SeekToFirst(); + + Status status; + ParsedInternalKey ikey; + std::unique_ptr compaction_filter_from_factory_v2 + = nullptr; + auto context = compact->GetFilterContext(); + compaction_filter_from_factory_v2 = + options_.compaction_filter_factory_v2->CreateCompactionFilterV2(context); + auto compaction_filter_v2 = + compaction_filter_from_factory_v2.get(); + + // temp_backup_input always point to the start of the current buffer + // temp_backup_input = backup_input; + // iterate through input, + // 1) buffer ineligible keys and value keys into 2 separate buffers; + // 2) send value_buffer to compaction filter and alternate the values; + // 3) merge value_buffer with ineligible_value_buffer; + // 4) run the modified "compaction" using the old for loop. 
+ if (compaction_filter_v2) { + for (; backup_input->Valid() && !shutting_down_.Acquire_Load(); ) { + // Prioritize immutable compaction work + if (imm_.imm_flush_needed.NoBarrier_Load() != nullptr) { + const uint64_t imm_start = env_->NowMicros(); + LogFlush(options_.info_log); + mutex_.Lock(); + if (imm_.IsFlushPending()) { + FlushMemTableToOutputFile(nullptr, deletion_state, log_buffer); + bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + } + mutex_.Unlock(); + imm_micros += (env_->NowMicros() - imm_start); + } + + Slice key = backup_input->key(); + Slice value = backup_input->value(); + + const SliceTransform* transformer = + options_.compaction_filter_factory_v2->GetPrefixExtractor(); + std::string key_prefix = transformer->Transform(key).ToString(); + if (!prefix_initialized) { + compact->cur_prefix_ = key_prefix; + prefix_initialized = true; + } + if (!ParseInternalKey(key, &ikey)) { + // log error + Log(options_.info_log, "Failed to parse key: %s", + key.ToString().c_str()); + continue; + } else { + // If the prefix remains the same, keep buffering + if (key_prefix == compact->cur_prefix_) { + // Apply the compaction filter V2 to all the kv pairs sharing + // the same prefix + if (ikey.type == kTypeValue && + (visible_at_tip || ikey.sequence > latest_snapshot)) { + // Buffer all keys sharing the same prefix for CompactionFilterV2 + // Iterate through keys to check prefix + compact->BufferKeyValueSlices(key, value); + } else { + // buffer ineligible keys + compact->BufferOtherKeyValueSlices(key, value); + } + backup_input->Next(); + continue; + // finish changing values for eligible keys + } else { + // Now prefix changes, this batch is done. 
+ // Call compaction filter on the buffered values to change the value + if (compact->key_buf_.size() > 0) { + CallCompactionFilterV2(compact, compaction_filter_v2); + } + compact->cur_prefix_ = key_prefix; + } + } + + // Merge this batch of data (values + ineligible keys) + compact->MergeKeyValueSliceBuffer(&internal_comparator_); + + // Done buffering for the current prefix. Spit it out to disk + // Now just iterate through all the kv-pairs + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + true, + log_buffer); + + if (!status.ok()) { + break; + } + + // After writing the kv-pairs, we can safely remove the reference + // to the string buffer and clean them up + compact->CleanupBatchBuffer(); + compact->CleanupMergedBuffer(); + // Buffer the key that triggers the mismatch in prefix + if (ikey.type == kTypeValue && + (visible_at_tip || ikey.sequence > latest_snapshot)) { + compact->BufferKeyValueSlices(key, value); + } else { + compact->BufferOtherKeyValueSlices(key, value); + } + backup_input->Next(); + if (!backup_input->Valid()) { + // If this is the single last value, we need to merge it. 
+ if (compact->key_buf_.size() > 0) { + CallCompactionFilterV2(compact, compaction_filter_v2); + } + compact->MergeKeyValueSliceBuffer(&internal_comparator_); + + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + true, + log_buffer); + + compact->CleanupBatchBuffer(); + compact->CleanupMergedBuffer(); + } + } // done processing all prefix batches + // finish the last batch + if (compact->key_buf_.size() > 0) { + CallCompactionFilterV2(compact, compaction_filter_v2); + } + compact->MergeKeyValueSliceBuffer(&internal_comparator_); + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + true, + log_buffer); + } // checking for compaction filter v2 + + if (!compaction_filter_v2) { + status = ProcessKeyValueCompaction( + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + false, + log_buffer); + } + + if (status.ok() && shutting_down_.Acquire_Load()) { + status = Status::ShutdownInProgress( + "Database shutdown started during compaction"); + } + if (status.ok() && compact->builder != nullptr) { + status = FinishCompactionOutputFile(compact, input.get()); + } + if (status.ok()) { + status = input->status(); + } + input.reset(); + + if (!options_.disableDataSync) { + db_directory_->Fsync(); + } + + InternalStats::CompactionStats stats; + stats.micros = env_->NowMicros() - start_micros - imm_micros; + MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros); + stats.files_in_leveln = compact->compaction->num_input_files(0); + stats.files_in_levelnp1 = compact->compaction->num_input_files(1); + + int num_output_files = compact->outputs.size(); + if (compact->builder != nullptr) { + // An error occurred so ignore the last output. 
+ assert(num_output_files > 0); + --num_output_files; + } + stats.files_out_levelnp1 = num_output_files; + + for (int i = 0; i < compact->compaction->num_input_files(0); i++) { + stats.bytes_readn += compact->compaction->input(0, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(0, i)->file_size); + } + + for (int i = 0; i < compact->compaction->num_input_files(1); i++) { + stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size; + RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, + compact->compaction->input(1, i)->file_size); + } + + for (int i = 0; i < num_output_files; i++) { + stats.bytes_written += compact->outputs[i].file_size; + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, + compact->outputs[i].file_size); + } + + LogFlush(options_.info_log); + mutex_.Lock(); + internal_stats_.AddCompactionStats(compact->compaction->output_level(), + stats); + + // if there were any unused file number (mostly in case of + // compaction error), free up the entry from pending_putputs + ReleaseCompactionUnusedFileNumbers(compact); + + if (status.ok()) { + status = InstallCompactionResults(compact, log_buffer); + InstallSuperVersion(deletion_state); + } + Version::LevelSummaryStorage tmp; + LogToBuffer( + log_buffer, + "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " + "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " + "write-amplify(%.1f) %s\n", + versions_->current()->LevelSummary(&tmp), + (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, + stats.bytes_written / 1048576.0, + (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, + 
status.ToString().c_str()); + + return status; +} + +namespace { +struct IterState { + IterState(DBImpl* db, port::Mutex* mu, DBImpl::SuperVersion* super_version) + : db(db), mu(mu), super_version(super_version) {} + + DBImpl* db; + port::Mutex* mu; + DBImpl::SuperVersion* super_version; +}; + +static void CleanupIteratorState(void* arg1, void* arg2) { + IterState* state = reinterpret_cast(arg1); + + bool need_cleanup = state->super_version->Unref(); + if (need_cleanup) { + DBImpl::DeletionState deletion_state; + + state->mu->Lock(); + state->super_version->Cleanup(); + state->db->FindObsoleteFiles(deletion_state, false, true); + state->mu->Unlock(); + + delete state->super_version; + if (deletion_state.HaveSomethingToDelete()) { + state->db->PurgeObsoleteFiles(deletion_state); + } + } + + delete state; +} +} // namespace + +Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, + SequenceNumber* latest_snapshot) { + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); + SuperVersion* super_version = super_version_->Ref(); + mutex_.Unlock(); + + std::vector iterator_list; + // Collect iterator for mutable mem + iterator_list.push_back(super_version->mem->NewIterator(options)); + // Collect all needed child iterators for immutable memtables + super_version->imm->AddIterators(options, &iterator_list); + // Collect iterators for files in L0 - Ln + super_version->current->AddIterators(options, storage_options_, + &iterator_list); + Iterator* internal_iter = NewMergingIterator( + env_, &internal_comparator_, &iterator_list[0], iterator_list.size()); + + IterState* cleanup = new IterState(this, &mutex_, super_version); + internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); + + return internal_iter; +} + +Iterator* DBImpl::TEST_NewInternalIterator() { + SequenceNumber ignored; + ReadOptions read_options; + // Use prefix_seek to make the test function more useful. 
+ read_options.prefix_seek = true; + return NewInternalIterator(read_options, &ignored); +} + +std::pair DBImpl::GetTailingIteratorPair( + const ReadOptions& options, + uint64_t* superversion_number) { + + mutex_.Lock(); + SuperVersion* super_version = super_version_->Ref(); + if (superversion_number != nullptr) { + *superversion_number = CurrentVersionNumber(); + } + mutex_.Unlock(); + + Iterator* mutable_iter = super_version->mem->NewIterator(options); + // create a DBIter that only uses memtable content; see NewIterator() + mutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(), + mutable_iter, kMaxSequenceNumber); + + std::vector list; + super_version->imm->AddIterators(options, &list); + super_version->current->AddIterators(options, storage_options_, &list); + Iterator* immutable_iter = + NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); + + // create a DBIter that only uses memtable content; see NewIterator() + immutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(), + immutable_iter, kMaxSequenceNumber); + + // register cleanups + mutable_iter->RegisterCleanup(CleanupIteratorState, + new IterState(this, &mutex_, super_version), nullptr); + + // bump the ref one more time since it will be Unref'ed twice + immutable_iter->RegisterCleanup(CleanupIteratorState, + new IterState(this, &mutex_, super_version->Ref()), nullptr); + + return std::make_pair(mutable_iter, immutable_iter); +} + +int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { + MutexLock l(&mutex_); + return versions_->current()->MaxNextLevelOverlappingBytes(); +} + +Status DBImpl::Get(const ReadOptions& options, + const Slice& key, + std::string* value) { + return GetImpl(options, key, value); +} + +// DeletionState gets created and destructed outside of the lock -- we +// use this convinently to: +// * malloc one SuperVersion() outside of the lock -- new_superversion +// * delete SuperVersion()s outside of the lock -- superversions_to_free 
+// +// However, if InstallSuperVersion() gets called twice with the same, +// deletion_state, we can't reuse the SuperVersion() that got malloced because +// first call already used it. In that rare case, we take a hit and create a +// new SuperVersion() inside of the mutex. +void DBImpl::InstallSuperVersion(DeletionState& deletion_state) { + mutex_.AssertHeld(); + // if new_superversion == nullptr, it means somebody already used it + SuperVersion* new_superversion = + (deletion_state.new_superversion != nullptr) ? + deletion_state.new_superversion : new SuperVersion(); + SuperVersion* old_superversion = InstallSuperVersion(new_superversion); + deletion_state.new_superversion = nullptr; + deletion_state.superversions_to_free.push_back(old_superversion); + // Reset SuperVersions cached in thread local storage + if (options_.allow_thread_local) { + ResetThreadLocalSuperVersions(&deletion_state); + } +} + +DBImpl::SuperVersion* DBImpl::InstallSuperVersion( + SuperVersion* new_superversion) { + mutex_.AssertHeld(); + new_superversion->db = this; + new_superversion->Init(mem_, imm_.current(), versions_->current()); + SuperVersion* old_superversion = super_version_; + super_version_ = new_superversion; + ++super_version_number_; + super_version_->version_number = super_version_number_; + + if (old_superversion != nullptr && old_superversion->Unref()) { + old_superversion->Cleanup(); + return old_superversion; // will let caller delete outside of mutex + } + return nullptr; +} + +void DBImpl::ResetThreadLocalSuperVersions(DeletionState* deletion_state) { + mutex_.AssertHeld(); + autovector sv_ptrs; + local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete); + for (auto ptr : sv_ptrs) { + assert(ptr); + if (ptr == SuperVersion::kSVInUse) { + continue; + } + auto sv = static_cast(ptr); + if (static_cast(ptr)->Unref()) { + sv->Cleanup(); + deletion_state->superversions_to_free.push_back(sv); + } + } +} + +Status DBImpl::GetImpl(const ReadOptions& options, + const Slice& key, 
+ std::string* value, + bool* value_found) { + Status s; + + StopWatch sw(env_, options_.statistics.get(), DB_GET, false); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); + SequenceNumber snapshot; + + if (options.snapshot != nullptr) { + snapshot = reinterpret_cast(options.snapshot)->number_; + } else { + snapshot = versions_->LastSequence(); + } + + // Acquire SuperVersion + SuperVersion* sv = nullptr; + if (LIKELY(options_.allow_thread_local)) { + // The SuperVersion is cached in thread local storage to avoid acquiring + // mutex when SuperVersion does not change since the last use. When a new + // SuperVersion is installed, the compaction or flush thread cleans up + // cached SuperVersion in all existing thread local storage. To avoid + // acquiring mutex for this operation, we use atomic Swap() on the thread + // local pointer to guarantee exclusive access. If the thread local pointer + // is being used while a new SuperVersion is installed, the cached + // SuperVersion can become stale. In that case, the background thread would + // have swapped in kSVObsolete. We re-check the value at the end of + // Get, with an atomic compare and swap. The superversion will be released + // if detected to be stale. + void* ptr = local_sv_->Swap(SuperVersion::kSVInUse); + // Invariant: + // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage + // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage + // should only keep kSVInUse during a GetImpl. 
+    assert(ptr != SuperVersion::kSVInUse);
+    // NOTE(review): "static_cast(ptr)" below looks like it lost its "<...>"
+    // type argument in the extraction of this patch -- confirm upstream.
+    sv = static_cast(ptr);
+    // The cached SuperVersion is unusable if it was scraped (kSVObsolete) or
+    // a newer SuperVersion has been installed since it was cached. In either
+    // case fall back to the mutex path and re-acquire the current one.
+    if (sv == SuperVersion::kSVObsolete ||
+        sv->version_number != super_version_number_.load(
+          std::memory_order_relaxed)) {
+      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
+      SuperVersion* sv_to_delete = nullptr;
+
+      if (sv && sv->Unref()) {
+        // This thread held the last reference: Cleanup() must run under
+        // mutex_, but the delete itself is deferred until after Unlock().
+        RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
+        mutex_.Lock();
+        // TODO underlying resources held by superversion (sst files) might
+        // not be released until the next background job.
+        sv->Cleanup();
+        sv_to_delete = sv;
+      } else {
+        mutex_.Lock();
+      }
+      sv = super_version_->Ref();
+      mutex_.Unlock();
+
+      delete sv_to_delete;
+    }
+  } else {
+    // Thread-local caching disabled: always take the mutex to reference the
+    // current SuperVersion.
+    mutex_.Lock();
+    sv = super_version_->Ref();
+    mutex_.Unlock();
+  }
+
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+ LookupKey lkey(key, snapshot); + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); + if (sv->mem->Get(lkey, value, &s, merge_context, options_)) { + // Done + RecordTick(options_.statistics.get(), MEMTABLE_HIT); + } else if (sv->imm->Get(lkey, value, &s, merge_context, options_)) { + // Done + RecordTick(options_.statistics.get(), MEMTABLE_HIT); + } else { + StopWatchNano from_files_timer(env_, false); + StartPerfTimer(&from_files_timer); + + sv->current->Get(options, lkey, value, &s, &merge_context, &stats, + value_found); + have_stat_update = true; + BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); + RecordTick(options_.statistics.get(), MEMTABLE_MISS); + } + + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); + + if (!options_.disable_seek_compaction && have_stat_update) { + mutex_.Lock(); + if (sv->current->UpdateStats(stats)) { + MaybeScheduleFlushOrCompaction(); + } + mutex_.Unlock(); + } + + bool unref_sv = true; + if (LIKELY(options_.allow_thread_local)) { + // Put the SuperVersion back + void* expected = SuperVersion::kSVInUse; + if (local_sv_->CompareAndSwap(static_cast(sv), expected)) { + // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal + // storage has not been altered and no Scrape has happend. The + // SuperVersion is still current. + unref_sv = false; + } else { + // ThreadLocal scrape happened in the process of this GetImpl call (after + // thread local Swap() at the beginning and before CompareAndSwap()). + // This means the SuperVersion it holds is obsolete. 
+ assert(expected == SuperVersion::kSVObsolete); + } + } + + if (unref_sv) { + // Release SuperVersion + if (sv->Unref()) { + mutex_.Lock(); + sv->Cleanup(); + mutex_.Unlock(); + delete sv; + RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS); + } + RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_RELEASES); + } + + // Note, tickers are atomic now - no lock protection needed any more. + RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); + RecordTick(options_.statistics.get(), BYTES_READ, value->size()); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); + return s; +} + +std::vector DBImpl::MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) { + StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false); + StopWatchNano snapshot_timer(env_, false); + StartPerfTimer(&snapshot_timer); + + SequenceNumber snapshot; + + mutex_.Lock(); + if (options.snapshot != nullptr) { + snapshot = reinterpret_cast(options.snapshot)->number_; + } else { + snapshot = versions_->LastSequence(); + } + + SuperVersion* get_version = super_version_->Ref(); + mutex_.Unlock(); + + bool have_stat_update = false; + Version::GetStats stats; + + // Contain a list of merge operations if merge occurs. + MergeContext merge_context; + + // Note: this always resizes the values array + int numKeys = keys.size(); + std::vector statList(numKeys); + values->resize(numKeys); + + // Keep track of bytes that we read for statistics-recording later + uint64_t bytesRead = 0; + BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); + + // For each of the given keys, apply the entire "get" process as follows: + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. 
+ for (int i=0; imem->Get(lkey, value, &s, merge_context, options_)) { + // Done + } else if (get_version->imm->Get(lkey, value, &s, merge_context, + options_)) { + // Done + } else { + get_version->current->Get(options, lkey, value, &s, &merge_context, + &stats); + have_stat_update = true; + } + + if (s.ok()) { + bytesRead += value->size(); + } + } + + // Post processing (decrement reference counts and record statistics) + StopWatchNano post_process_timer(env_, false); + StartPerfTimer(&post_process_timer); + bool delete_get_version = false; + if (!options_.disable_seek_compaction && have_stat_update) { + mutex_.Lock(); + if (get_version->current->UpdateStats(stats)) { + MaybeScheduleFlushOrCompaction(); + } + if (get_version->Unref()) { + get_version->Cleanup(); + delete_get_version = true; + } + mutex_.Unlock(); + } else { + if (get_version->Unref()) { + mutex_.Lock(); + get_version->Cleanup(); + mutex_.Unlock(); + delete_get_version = true; + } + } + if (delete_get_version) { + delete get_version; + } + + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead); + BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); + + return statList; +} + +bool DBImpl::KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found) { + if (value_found != nullptr) { + // falsify later if key-may-exist but can't fetch value + *value_found = true; + } + ReadOptions roptions = options; + roptions.read_tier = kBlockCacheTier; // read from block cache only + auto s = GetImpl(roptions, key, value, value_found); + + // If options.block_cache != nullptr and the index block of the table didn't + // not present in block_cache, the return value will be Status::Incomplete. + // In this case, key may still exist in the table. 
+ return s.ok() || s.IsIncomplete(); +} + +Iterator* DBImpl::NewIterator(const ReadOptions& options) { + Iterator* iter; + + if (options.tailing) { + iter = new TailingIterator(this, options, user_comparator()); + } else { + SequenceNumber latest_snapshot; + iter = NewInternalIterator(options, &latest_snapshot); + + iter = NewDBIterator( + &dbname_, env_, options_, user_comparator(), iter, + (options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); + } + + if (options.prefix) { + // use extra wrapper to exclude any keys from the results which + // don't begin with the prefix + iter = new PrefixFilterIterator(iter, *options.prefix, + options_.prefix_extractor.get()); + } + return iter; +} + +const Snapshot* DBImpl::GetSnapshot() { + MutexLock l(&mutex_); + return snapshots_.New(versions_->LastSequence()); +} + +void DBImpl::ReleaseSnapshot(const Snapshot* s) { + MutexLock l(&mutex_); + snapshots_.Delete(reinterpret_cast(s)); +} + +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { + return DB::Put(o, key, val); +} + +Status DBImpl::Merge(const WriteOptions& o, const Slice& key, + const Slice& val) { + if (!options_.merge_operator) { + return Status::NotSupported("Provide a merge_operator when opening DB"); + } else { + return DB::Merge(o, key, val); + } +} + +Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { + return DB::Delete(options, key); +} + +Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { + StopWatchNano pre_post_process_timer(env_, false); + StartPerfTimer(&pre_post_process_timer); + Writer w(&mutex_); + w.batch = my_batch; + w.sync = options.sync; + w.disableWAL = options.disableWAL; + w.done = false; + + StopWatch sw(env_, options_.statistics.get(), DB_WRITE, false); + mutex_.Lock(); + writers_.push_back(&w); + while (!w.done && &w != writers_.front()) { + w.cv.Wait(); + } + + if (!options.disableWAL) { + 
RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1); + } + + if (w.done) { + mutex_.Unlock(); + RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1); + return w.status; + } else { + RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1); + } + + // May temporarily unlock and wait. + SuperVersion* superversion_to_free = nullptr; + log::Writer* old_log = nullptr; + Status status = MakeRoomForWrite(my_batch == nullptr, + &superversion_to_free, + &old_log); + uint64_t last_sequence = versions_->LastSequence(); + Writer* last_writer = &w; + if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions + autovector write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); + + // Add to log and apply to memtable. We can release the lock + // during this phase since &w is currently responsible for logging + // and protects against concurrent loggers and concurrent writes + // into mem_. + { + mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + + const SequenceNumber current_sequence = last_sequence + 1; + WriteBatchInternal::SetSequence(updates, current_sequence); + int my_batch_count = WriteBatchInternal::Count(updates); + last_sequence += my_batch_count; + // Record statistics + RecordTick(options_.statistics.get(), + NUMBER_KEYS_WRITTEN, my_batch_count); + RecordTick(options_.statistics.get(), + BYTES_WRITTEN, + WriteBatchInternal::ByteSize(updates)); + if (options.disableWAL) { + flush_on_destroy_ = true; + } + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); + + if (!options.disableWAL) { + StopWatchNano timer(env_); + StartPerfTimer(&timer); + Slice log_entry = WriteBatchInternal::Contents(updates); + status = log_->AddRecord(log_entry); + 
RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1);
+      RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size());
+      if (status.ok() && options.sync) {
+        if (options_.use_fsync) {
+          // BUGFIX: "StopWatch(env_, ..., WAL_FILE_SYNC_MICROS);" created an
+          // unnamed temporary that was destroyed at the end of that
+          // statement, *before* Fsync() ran, so the WAL_FILE_SYNC_MICROS
+          // histogram always recorded ~0. A named StopWatch lives until the
+          // end of this branch and therefore times the sync.
+          StopWatch sync_sw(env_, options_.statistics.get(),
+                            WAL_FILE_SYNC_MICROS);
+          status = log_->file()->Fsync();
+        } else {
+          // BUGFIX: same unnamed-temporary problem as the Fsync branch.
+          StopWatch sync_sw(env_, options_.statistics.get(),
+                            WAL_FILE_SYNC_MICROS);
+          status = log_->file()->Sync();
+        }
+      }
+      BumpPerfTime(&perf_context.write_wal_time, &timer);
+    }
+    if (status.ok()) {
+      // Apply the batched updates to the active memtable, timing the insert
+      // for perf_context.
+      StopWatchNano write_memtable_timer(env_, false);
+      StartPerfTimer(&write_memtable_timer);
+      status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this,
+                                              options_.filter_deletes);
+      BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer);
+      if (!status.ok()) {
+        // Panic for in-memory corruptions
+        // Note that existing logic was not sound. Any partial failure writing
+        // into the memtable would result in a state that some write ops might
+        // have succeeded in memtable but Status reports error for all writes.
+ throw std::runtime_error("In memory WriteBatch corruption!"); + } + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + last_sequence); + } + StartPerfTimer(&pre_post_process_timer); + if (updates == &tmp_batch_) tmp_batch_.Clear(); + mutex_.Lock(); + if (status.ok()) { + versions_->SetLastSequence(last_sequence); + } + } + } + if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { + bg_error_ = status; // stop compaction & fail any further writes + } + + while (true) { + Writer* ready = writers_.front(); + writers_.pop_front(); + if (ready != &w) { + ready->status = status; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + + // Notify new head of write queue + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } + mutex_.Unlock(); + delete old_log; + delete superversion_to_free; + BumpPerfTime(&perf_context.write_pre_and_post_process_time, + &pre_post_process_timer); + return status; +} + +// REQUIRES: Writer list must be non-empty +// REQUIRES: First writer must have a non-nullptr batch +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { + assert(!writers_.empty()); + Writer* first = writers_.front(); + assert(first->batch != nullptr); + + size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. + size_t max_size = 1 << 20; + if (size <= (128<<10)) { + max_size = size + (128<<10); + } + + *last_writer = first; + std::deque::iterator iter = writers_.begin(); + ++iter; // Advance past "first" + for (; iter != writers_.end(); ++iter) { + Writer* w = *iter; + if (w->sync && !first->sync) { + // Do not include a sync write into a batch handled by a non-sync write. 
+ break; + } + + if (!w->disableWAL && first->disableWAL) { + // Do not include a write that needs WAL into a batch that has + // WAL disabled. + break; + } + + if (w->batch != nullptr) { + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; + } + + write_batch_group->push_back(w->batch); + } + *last_writer = w; + } +} + +// This function computes the amount of time in microseconds by which a write +// should be delayed based on the number of level-0 files according to the +// following formula: +// if n < bottom, return 0; +// if n >= top, return 1000; +// otherwise, let r = (n - bottom) / +// (top - bottom) +// and return r^2 * 1000. +// The goal of this formula is to gradually increase the rate at which writes +// are slowed. We also tried linear delay (r * 1000), but it seemed to do +// slightly worse. There is no other particular reason for choosing quadratic. +uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { + uint64_t delay; + if (n >= top) { + delay = 1000; + } + else if (n < bottom) { + delay = 0; + } + else { + // If we are here, we know that: + // level0_start_slowdown <= n < level0_slowdown + // since the previous two conditions are false. 
+ double how_much = + (double) (n - bottom) / + (top - bottom); + delay = std::max(how_much * how_much * 1000, 100.0); + } + assert(delay <= 1000); + return delay; +} + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +Status DBImpl::MakeRoomForWrite(bool force, + SuperVersion** superversion_to_free, + log::Writer** old_log) { + mutex_.AssertHeld(); + assert(!writers_.empty()); + bool allow_delay = !force; + bool allow_hard_rate_limit_delay = !force; + bool allow_soft_rate_limit_delay = !force; + uint64_t rate_limit_delay_millis = 0; + Status s; + double score; + *superversion_to_free = nullptr; + + while (true) { + if (!bg_error_.ok()) { + // Yield previous error + s = bg_error_; + break; + } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { + // We are getting close to hitting a hard limit on the number of + // L0 files. Rather than delaying a single write by several + // seconds when we hit the hard limit, start delaying each + // individual write by 0-1ms to reduce latency variance. Also, + // this delay hands over some CPU to the compaction thread in + // case it is sharing the same core as the writer. 
+ uint64_t slowdown = + SlowdownAmount(versions_->current()->NumLevelFiles(0), + options_.level0_slowdown_writes_trigger, + options_.level0_stop_writes_trigger); + mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); + env_->SleepForMicroseconds(slowdown); + delayed = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); + internal_stats_.RecordWriteStall(InternalStats::LEVEL0_SLOWDOWN, delayed); + allow_delay = false; // Do not delay a single write more than once + mutex_.Lock(); + delayed_writes_++; + } else if (!force && !mem_->ShouldFlush()) { + // There is room in current memtable + if (allow_delay) { + DelayLoggingAndReset(); + } + break; + } else if (imm_.size() == options_.max_write_buffer_number - 1) { + // We have filled up the current memtable, but the previous + // ones are still being flushed, so we wait. + DelayLoggingAndReset(); + Log(options_.info_log, "wait for memtable flush...\n"); + MaybeScheduleFlushOrCompaction(); + uint64_t stall; + { + StopWatch sw(env_, options_.statistics.get(), + STALL_MEMTABLE_COMPACTION_COUNT); + bg_cv_.Wait(); + stall = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), + STALL_MEMTABLE_COMPACTION_MICROS, stall); + internal_stats_.RecordWriteStall(InternalStats::MEMTABLE_COMPACTION, + stall); + } else if (versions_->current()->NumLevelFiles(0) >= + options_.level0_stop_writes_trigger) { + // There are too many level-0 files. 
+ DelayLoggingAndReset(); + Log(options_.info_log, "wait for fewer level0 files...\n"); + uint64_t stall; + { + StopWatch sw(env_, options_.statistics.get(), + STALL_L0_NUM_FILES_COUNT); + bg_cv_.Wait(); + stall = sw.ElapsedMicros(); + } + RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall); + internal_stats_.RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, stall); + } else if (allow_hard_rate_limit_delay && options_.hard_rate_limit > 1.0 && + (score = versions_->current()->MaxCompactionScore()) > + options_.hard_rate_limit) { + // Delay a write when the compaction score for any level is too large. + int max_level = versions_->current()->MaxCompactionScoreLevel(); + mutex_.Unlock(); + uint64_t delayed; + { + StopWatch sw(env_, options_.statistics.get(), + HARD_RATE_LIMIT_DELAY_COUNT); + env_->SleepForMicroseconds(1000); + delayed = sw.ElapsedMicros(); + } + internal_stats_.RecordLevelNSlowdown(max_level, delayed); + // Make sure the following value doesn't round to zero. + uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); + rate_limit_delay_millis += rate_limit; + RecordTick(options_.statistics.get(), + RATE_LIMIT_DELAY_MILLIS, rate_limit); + if (options_.rate_limit_delay_max_milliseconds > 0 && + rate_limit_delay_millis >= + (unsigned)options_.rate_limit_delay_max_milliseconds) { + allow_hard_rate_limit_delay = false; + } + mutex_.Lock(); + } else if (allow_soft_rate_limit_delay && options_.soft_rate_limit > 0.0 && + (score = versions_->current()->MaxCompactionScore()) > + options_.soft_rate_limit) { + // Delay a write when the compaction score for any level is too large. 
+ // TODO: add statistics + mutex_.Unlock(); + { + StopWatch sw(env_, options_.statistics.get(), + SOFT_RATE_LIMIT_DELAY_COUNT); + env_->SleepForMicroseconds(SlowdownAmount( + score, + options_.soft_rate_limit, + options_.hard_rate_limit) + ); + rate_limit_delay_millis += sw.ElapsedMicros(); + } + allow_soft_rate_limit_delay = false; + mutex_.Lock(); + + } else { + unique_ptr lfile; + log::Writer* new_log = nullptr; + MemTable* new_mem = nullptr; + + // Attempt to switch to a new memtable and trigger flush of old. + // Do this without holding the dbmutex lock. + assert(versions_->PrevLogNumber() == 0); + uint64_t new_log_number = versions_->NewFileNumber(); + SuperVersion* new_superversion = nullptr; + mutex_.Unlock(); + { + DelayLoggingAndReset(); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, + env_->OptimizeForLogWrite(storage_options_)); + if (s.ok()) { + // Our final size should be less than write_buffer_size + // (compression, etc) but err on the side of caution. + lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); + new_log = new log::Writer(std::move(lfile)); + new_mem = new MemTable(internal_comparator_, options_); + new_superversion = new SuperVersion(); + } + Log(options_.info_log, + "New memtable created with log file: #%lu\n", + (unsigned long)new_log_number); + } + mutex_.Lock(); + if (!s.ok()) { + // Avoid chewing through file number space in a tight loop. 
+ versions_->ReuseFileNumber(new_log_number); + assert (!new_mem); + assert(new_log == nullptr); + break; + } + logfile_number_ = new_log_number; + assert(new_log != nullptr); + *old_log = log_.release(); + log_.reset(new_log); + mem_->SetNextLogNumber(logfile_number_); + imm_.Add(mem_); + if (force) { + imm_.FlushRequested(); + } + mem_ = new_mem; + mem_->Ref(); + mem_->SetLogNumber(logfile_number_); + force = false; // Do not force another compaction if have room + MaybeScheduleFlushOrCompaction(); + *superversion_to_free = InstallSuperVersion(new_superversion); + } + } + return s; +} + +Status DBImpl::GetPropertiesOfAllTables(TablePropertiesCollection* props) { + // Increment the ref count + mutex_.Lock(); + auto version = versions_->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfAllTables(props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + +const std::string& DBImpl::GetName() const { + return dbname_; +} + +Env* DBImpl::GetEnv() const { + return env_; +} + +const Options& DBImpl::GetOptions() const { + return options_; +} + +bool DBImpl::GetProperty(const Slice& property, std::string* value) { + value->clear(); + DBPropertyType property_type = GetPropertyType(property); + MutexLock l(&mutex_); + return internal_stats_.GetProperty(property_type, property, value, this); +} + +void DBImpl::GetApproximateSizes( + const Range* range, int n, + uint64_t* sizes) { + // TODO(opt): better implementation + Version* v; + { + MutexLock l(&mutex_); + versions_->current()->Ref(); + v = versions_->current(); + } + + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. 
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + uint64_t start = versions_->ApproximateOffsetOf(v, k1); + uint64_t limit = versions_->ApproximateOffsetOf(v, k2); + sizes[i] = (limit >= start ? limit - start : 0); + } + + { + MutexLock l(&mutex_); + v->Unref(); + } +} + +inline void DBImpl::DelayLoggingAndReset() { + if (delayed_writes_ > 0) { + Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); + delayed_writes_ = 0; + } +} + +Status DBImpl::DeleteFile(std::string name) { + uint64_t number; + FileType type; + WalFileType log_type; + if (!ParseFileName(name, &number, &type, &log_type) || + (type != kTableFile && type != kLogFile)) { + Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + return Status::InvalidArgument("Invalid file name"); + } + + Status status; + if (type == kLogFile) { + // Only allow deleting archived log files + if (log_type != kArchivedLogFile) { + Log(options_.info_log, "DeleteFile %s failed - not archived log.\n", + name.c_str()); + return Status::NotSupported("Delete only supported for archived logs"); + } + status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); + if (!status.ok()) { + Log(options_.info_log, "DeleteFile %s failed -- %s.\n", + name.c_str(), status.ToString().c_str()); + } + return status; + } + + int level; + FileMetaData* metadata; + int maxlevel = NumberLevels(); + VersionEdit edit; + DeletionState deletion_state(true); + { + MutexLock l(&mutex_); + status = versions_->GetMetadataForFile(number, &level, &metadata); + if (!status.ok()) { + Log(options_.info_log, "DeleteFile %s failed. File not found\n", + name.c_str()); + return Status::InvalidArgument("File not found"); + } + assert((level > 0) && (level < maxlevel)); + + // If the file is being compacted no need to delete. + if (metadata->being_compacted) { + Log(options_.info_log, + "DeleteFile %s Skipped. 
File about to be compacted\n", name.c_str()); + return Status::OK(); + } + + // Only the files in the last level can be deleted externally. + // This is to make sure that any deletion tombstones are not + // lost. Check that the level passed is the last level. + for (int i = level + 1; i < maxlevel; i++) { + if (versions_->current()->NumLevelFiles(i) != 0) { + Log(options_.info_log, + "DeleteFile %s FAILED. File not in last level\n", name.c_str()); + return Status::InvalidArgument("File not in last level"); + } + } + edit.DeleteFile(level, number); + status = versions_->LogAndApply(&edit, &mutex_, db_directory_.get()); + if (status.ok()) { + InstallSuperVersion(deletion_state); + } + FindObsoleteFiles(deletion_state, false); + } // lock released here + LogFlush(options_.info_log); + // remove files outside the db-lock + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); + } + { + MutexLock l(&mutex_); + // schedule flush if file deletion means we freed the space for flushes to + // continue + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { + MutexLock l(&mutex_); + return versions_->GetLiveFilesMetaData(metadata); +} + +Status DBImpl::CheckConsistency() { + mutex_.AssertHeld(); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + + std::string corruption_messages; + for (const auto& md : metadata) { + std::string file_path = dbname_ + md.name; + uint64_t fsize = 0; + Status s = env_->GetFileSize(file_path, &fsize); + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } else if (fsize != md.size) { + corruption_messages += "Sst file size mismatch: " + md.name + + ". 
Size recorded in manifest " + + std::to_string(md.size) + ", actual size " + + std::to_string(fsize) + "\n"; + } + } + if (corruption_messages.size() == 0) { + return Status::OK(); + } else { + return Status::Corruption(corruption_messages); + } +} + +void DBImpl::TEST_GetFilesMetaData( + std::vector>* metadata) { + MutexLock l(&mutex_); + metadata->resize(NumberLevels()); + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = + versions_->current()->files_[level]; + + (*metadata)[level].clear(); + for (const auto& f : files) { + (*metadata)[level].push_back(*f); + } + } +} + +Status DBImpl::GetDbIdentity(std::string& identity) { + std::string idfilename = IdentityFileName(dbname_); + unique_ptr idfile; + const EnvOptions soptions; + Status s = env_->NewSequentialFile(idfilename, &idfile, soptions); + if (!s.ok()) { + return s; + } + uint64_t file_size; + s = env_->GetFileSize(idfilename, &file_size); + if (!s.ok()) { + return s; + } + char buffer[file_size]; + Slice id; + s = idfile->Read(file_size, &id, buffer); + if (!s.ok()) { + return s; + } + identity.assign(id.ToString()); + // If last character is '\n' remove it from identity + if (identity.size() > 0 && identity.back() == '\n') { + identity.pop_back(); + } + return s; +} + +// Default implementations of convenience methods that subclasses of DB +// can call if they wish +Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. 
+ WriteBatch batch(key.size() + value.size() + 24); + batch.Put(key, value); + return Write(opt, &batch); +} + +Status DB::Delete(const WriteOptions& opt, const Slice& key) { + WriteBatch batch; + batch.Delete(key); + return Write(opt, &batch); +} + +Status DB::Merge(const WriteOptions& opt, const Slice& key, + const Slice& value) { + WriteBatch batch; + batch.Merge(key, value); + return Write(opt, &batch); +} + +DB::~DB() { } + +Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { + *dbptr = nullptr; + + if (options.block_cache != nullptr && options.no_block_cache) { + return Status::InvalidArgument( + "no_block_cache is true while block_cache is not nullptr"); + } + + DBImpl* impl = new DBImpl(options, dbname); + Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + if (!s.ok()) { + delete impl; + return s; + } + + s = impl->CreateArchivalDirectory(); + if (!s.ok()) { + delete impl; + return s; + } + impl->mutex_.Lock(); + s = impl->Recover(); // Handles create_if_missing, error_if_exists + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + unique_ptr lfile; + EnvOptions soptions(options); + s = impl->options_.env->NewWritableFile( + LogFileName(impl->options_.wal_dir, new_log_number), &lfile, + impl->options_.env->OptimizeForLogWrite(soptions)); + if (s.ok()) { + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); + VersionEdit edit; + edit.SetLogNumber(new_log_number); + impl->logfile_number_ = new_log_number; + impl->log_.reset(new log::Writer(std::move(lfile))); + s = impl->versions_->LogAndApply(&edit, &impl->mutex_, + impl->db_directory_.get()); + } + if (s.ok()) { + delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); + impl->mem_->SetLogNumber(impl->logfile_number_); + impl->DeleteObsoleteFiles(); + impl->MaybeScheduleFlushOrCompaction(); + impl->MaybeScheduleLogDBDeployStats(); + s = impl->db_directory_->Fsync(); + } + } + + if (s.ok() && 
impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = impl->versions_->current(); + for (int i = 1; i < impl->NumberLevels(); i++) { + int num_files = current->NumLevelFiles(i); + if (num_files > 0) { + s = Status::InvalidArgument("Not all files are at level 0. Cannot " + "open with universal compaction style."); + break; + } + } + } + + impl->mutex_.Unlock(); + + if (s.ok()) { + impl->opened_successfully_ = true; + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +Snapshot::~Snapshot() { +} + +Status DestroyDB(const std::string& dbname, const Options& options) { + const InternalKeyComparator comparator(options.comparator); + const InternalFilterPolicy filter_policy(options.filter_policy); + const Options& soptions(SanitizeOptions( + dbname, &comparator, &filter_policy, options)); + Env* env = soptions.env; + std::vector filenames; + std::vector archiveFiles; + + std::string archivedir = ArchivalDirectory(dbname); + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + + if (dbname != soptions.wal_dir) { + std::vector logfilenames; + env->GetChildren(soptions.wal_dir, &logfilenames); + filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end()); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + if (filenames.empty()) { + return Status::OK(); + } + + FileLock* lock; + const std::string lockname = LockFileName(dbname); + Status result = env->LockFile(lockname, &lock); + if (result.ok()) { + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type != kDBLockFile) { // Lock file will be deleted at end + Status del; + if (type == kMetaDatabase) { + del = DestroyDB(dbname + "/" + filenames[i], options); + } else if (type == kLogFile) { + del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]); + } else { + del = env->DeleteFile(dbname + "/" + filenames[i]); + } + if 
(result.ok() && !del.ok()) { + result = del; + } + } + } + + env->GetChildren(archivedir, &archiveFiles); + // Delete archival files. + for (size_t i = 0; i < archiveFiles.size(); ++i) { + if (ParseFileName(archiveFiles[i], &number, &type) && + type == kLogFile) { + Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + // ignore case where no archival directory is present. + env->DeleteDir(archivedir); + + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(lockname); + env->DeleteDir(dbname); // Ignore error in case dir contains other files + env->DeleteDir(soptions.wal_dir); + } + return result; +} + +// +// A global method that can dump out the build version +void DumpLeveldbBuildVersion(Logger * log) { + Log(log, "Git sha %s", rocksdb_build_git_sha); + Log(log, "Compile time %s %s", + rocksdb_build_compile_time, rocksdb_build_compile_date); +} + +} // namespace rocksdb diff --git a/db/db_impl.h b/db/db_impl.h new file mode 100644 index 00000000..bedd23f8 --- /dev/null +++ b/db/db_impl.h @@ -0,0 +1,626 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "db/version_edit.h" +#include "memtable_list.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "util/autovector.h" +#include "util/stats_logger.h" +#include "util/thread_local.h" +#include "db/internal_stats.h" + +namespace rocksdb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class CompactionFilterV2; + +class DBImpl : public DB { + public: + DBImpl(const Options& options, const std::string& dbname); + virtual ~DBImpl(); + + // Implementations of the DB interface + virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); + virtual Status Merge(const WriteOptions&, const Slice& key, + const Slice& value); + virtual Status Delete(const WriteOptions&, const Slice& key); + virtual Status Write(const WriteOptions& options, WriteBatch* updates); + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value); + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values); + + // Returns false if key doesn't exist in the database and true if it may. + // If value_found is not passed in as null, then return the value if found in + // memory. On return, if value was found, then value_found will be set to true + // , otherwise false. 
+ virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr); + virtual Iterator* NewIterator(const ReadOptions&); + virtual const Snapshot* GetSnapshot(); + virtual void ReleaseSnapshot(const Snapshot* snapshot); + virtual bool GetProperty(const Slice& property, std::string* value); + virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); + virtual Status CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1); + virtual int NumberLevels(); + virtual int MaxMemCompactionLevel(); + virtual int Level0StopWriteTrigger(); + virtual const std::string& GetName() const; + virtual Env* GetEnv() const; + virtual const Options& GetOptions() const; + virtual Status Flush(const FlushOptions& options); + virtual Status DisableFileDeletions(); + virtual Status EnableFileDeletions(bool force); + // All the returned filenames start with "/" + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true); + virtual Status GetSortedWalFiles(VectorLogPtr& files); + virtual SequenceNumber GetLatestSequenceNumber() const; + virtual Status GetUpdatesSince( + SequenceNumber seq_number, unique_ptr* iter, + const TransactionLogIterator::ReadOptions& + read_options = TransactionLogIterator::ReadOptions()); + virtual Status DeleteFile(std::string name); + + virtual void GetLiveFilesMetaData( + std::vector *metadata); + + // checks if all live files exist on file system and that their file sizes + // match to our in-memory records + virtual Status CheckConsistency(); + + virtual Status GetDbIdentity(std::string& identity); + + Status RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + + // Extra methods (for testing) that are not in the public DB interface + + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, + 
const Slice* begin, + const Slice* end); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(); + + // Wait for any compaction + Status TEST_WaitForCompact(); + + // Return an internal iterator over the current state of the database. + // The keys of this iterator are internal keys (see format.h). + // The returned iterator should be deleted when no longer needed. + Iterator* TEST_NewInternalIterator(); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes(); + + // Simulate a db crash, no elegant closing of database. + void TEST_Destroy_DBImpl(); + + // Return the current manifest file no. + uint64_t TEST_Current_Manifest_FileNo(); + + // Trigger's a background call for testing. + void TEST_PurgeObsoleteteWAL(); + + // get total level0 file size. Only for testing. + uint64_t TEST_GetLevel0TotalSize(); + + void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) + { + default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; + } + + void TEST_GetFilesMetaData(std::vector>* metadata); + + // holds references to memtable, all immutable memtables and version + struct SuperVersion { + MemTable* mem; + MemTableListVersion* imm; + Version* current; + std::atomic refs; + // We need to_delete because during Cleanup(), imm->Unref() returns + // all memtables that we need to free through this vector. 
We then + // delete all those memtables outside of mutex, during destruction + autovector to_delete; + // Version number of the current SuperVersion + uint64_t version_number; + DBImpl* db; + + // should be called outside the mutex + SuperVersion() = default; + ~SuperVersion(); + SuperVersion* Ref(); + // Returns true if this was the last reference and caller should + // call Clenaup() and delete the object + bool Unref(); + + // call these two methods with db mutex held + // Cleanup unrefs mem, imm and current. Also, it stores all memtables + // that needs to be deleted in to_delete vector. Unrefing those + // objects needs to be done in the mutex + void Cleanup(); + void Init(MemTable* new_mem, MemTableListVersion* new_imm, + Version* new_current); + + // The value of dummy is not actually used. kSVInUse takes its address as a + // mark in the thread local storage to indicate the SuperVersion is in use + // by thread. This way, the value of kSVInUse is guaranteed to have no + // conflict with SuperVersion object address and portable on different + // platform. + static int dummy; + static void* const kSVInUse; + static void* const kSVObsolete; + }; + + static void SuperVersionUnrefHandle(void* ptr) { + // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets + // destroyed. When former happens, the thread shouldn't see kSVInUse. + // When latter happens, we are in ~DBImpl(), no get should happen as well. 
+ assert(ptr != SuperVersion::kSVInUse); + DBImpl::SuperVersion* sv = static_cast(ptr); + if (sv->Unref()) { + sv->db->mutex_.Lock(); + sv->Cleanup(); + sv->db->mutex_.Unlock(); + delete sv; + } + } + + // needed for CleanupIteratorState + struct DeletionState { + inline bool HaveSomethingToDelete() const { + return candidate_files.size() || + sst_delete_files.size() || + log_delete_files.size(); + } + + // a list of all files that we'll consider deleting + // (every once in a while this is filled up with all files + // in the DB directory) + std::vector candidate_files; + + // the list of all live sst files that cannot be deleted + std::vector sst_live; + + // a list of sst files that we need to delete + std::vector sst_delete_files; + + // a list of log files that we need to delete + std::vector log_delete_files; + + // a list of memtables to be free + autovector memtables_to_free; + + autovector superversions_to_free; + + SuperVersion* new_superversion; // if nullptr no new superversion + + // the current manifest_file_number, log_number and prev_log_number + // that corresponds to the set of files in 'live'. + uint64_t manifest_file_number, pending_manifest_file_number, log_number, + prev_log_number; + + explicit DeletionState(bool create_superversion = false) { + manifest_file_number = 0; + pending_manifest_file_number = 0; + log_number = 0; + prev_log_number = 0; + new_superversion = + create_superversion ? new SuperVersion() : nullptr; + } + + ~DeletionState() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversions + for (auto s : superversions_to_free) { + delete s; + } + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; + } + }; + + // Returns the list of live files in 'live' and the list + // of all files in the filesystem in 'candidate_files'. 
+ // If force == false and the last call was less than + // options_.delete_obsolete_files_period_micros microseconds ago, + // it will not fill up the deletion_state + void FindObsoleteFiles(DeletionState& deletion_state, + bool force, + bool no_full_scan = false); + + // Diffs the files listed in filenames and those that do not + // belong to live files are posibly removed. Also, removes all the + // files in sst_delete_files and log_delete_files. + // It is not necessary to hold the mutex when invoking this method. + void PurgeObsoleteFiles(DeletionState& deletion_state); + + protected: + Env* const env_; + const std::string dbname_; + unique_ptr versions_; + const InternalKeyComparator internal_comparator_; + const Options options_; // options_.comparator == &internal_comparator_ + + const Comparator* user_comparator() const { + return internal_comparator_.user_comparator(); + } + + SuperVersion* GetSuperVersion() { + return super_version_; + } + + Iterator* NewInternalIterator(const ReadOptions&, + SequenceNumber* latest_snapshot); + + private: + friend class DB; + friend class InternalStats; + friend class TailingIterator; + friend struct SuperVersion; + struct CompactionState; + struct Writer; + + Status NewDB(); + + // Recover the descriptor from persistent storage. May do a significant + // amount of work to recover recently logged updates. + Status Recover(bool read_only = false, bool error_if_log_file_exist = false); + + void MaybeIgnoreError(Status* s) const; + + const Status CreateArchivalDirectory(); + + // Delete any unneeded files and stale in-memory entries. + void DeleteObsoleteFiles(); + + // Flush the in-memory write buffer to storage. Switches to a new + // log-file/memtable and writes a new descriptor iff successful. 
+ Status FlushMemTableToOutputFile(bool* madeProgress, + DeletionState& deletion_state, + LogBuffer* log_buffer); + + Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, + bool read_only); + + // The following two methods are used to flush a memtable to + // storage. The first one is used atdatabase RecoveryTime (when the + // database is opened) and is heavyweight because it holds the mutex + // for the entire period. The second method WriteLevel0Table supports + // concurrent flush memtables to storage. + Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit); + Status WriteLevel0Table(autovector& mems, VersionEdit* edit, + uint64_t* filenumber, + LogBuffer* log_buffer); + + uint64_t SlowdownAmount(int n, double bottom, double top); + // MakeRoomForWrite will return superversion_to_free through an arugment, + // which the caller needs to delete. We do it because caller can delete + // the superversion outside of mutex. + // old_log if not nullptr is the old log writer that should be safely + // closed whenever DB mutex is released. + Status MakeRoomForWrite(bool force /* compact even if there is room? */, + SuperVersion** superversion_to_free, + log::Writer** old_log); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); + + // Force current memtable contents to be flushed. 
+ Status FlushMemTable(const FlushOptions& options); + + // Wait for memtable flushed + Status WaitForFlushMemTable(); + + void MaybeScheduleLogDBDeployStats(); + static void BGLogDBDeployStats(void* db); + void LogDBDeployStats(); + + void MaybeScheduleFlushOrCompaction(); + static void BGWorkCompaction(void* db); + static void BGWorkFlush(void* db); + void BackgroundCallCompaction(); + void BackgroundCallFlush(); + Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state, + LogBuffer* log_buffer); + Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, + LogBuffer* log_buffer); + void CleanupCompaction(CompactionState* compact, Status status); + Status DoCompactionWork(CompactionState* compact, + DeletionState& deletion_state, + LogBuffer* log_buffer); + + // Call compaction filter if is_compaction_v2 is not true. Then iterate + // through input and compact the kv-pairs + Status ProcessKeyValueCompaction( + SequenceNumber visible_at_tip, + SequenceNumber earliest_snapshot, + SequenceNumber latest_snapshot, + DeletionState& deletion_state, + bool bottommost_level, + int64_t& imm_micros, + Iterator* input, + CompactionState* compact, + bool is_compaction_v2, + LogBuffer* log_buffer); + + // Call compaction_filter_v2->Filter() on kv-pairs in compact + void CallCompactionFilterV2(CompactionState* compact, + CompactionFilterV2* compaction_filter_v2); + + Status OpenCompactionOutputFile(CompactionState* compact); + Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); + Status InstallCompactionResults(CompactionState* compact, + LogBuffer* log_buffer); + void AllocateCompactionOutputFileNumbers(CompactionState* compact); + void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); + + void PurgeObsoleteWALFiles(); + + Status GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType type); + + // Requires: all_logs should be sorted with earliest log file first + // 
Retains all log files in all_logs which contain updates with seq no. + // Greater Than or Equal to the requested SequenceNumber. + Status RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target); + // return true if + bool CheckWalFileExistsAndEmpty(const WalFileType type, + const uint64_t number); + + Status ReadFirstRecord(const WalFileType type, const uint64_t number, + WriteBatch* const result); + + Status ReadFirstLine(const std::string& fname, WriteBatch* const batch); + + void PrintStatistics(); + + // dump rocksdb.stats to LOG + void MaybeDumpStats(); + + // Return the minimum empty level that could hold the total data in the + // input level. Return the input level, if such level could not be found. + int FindMinimumEmptyLevelFitting(int level); + + // Move the files in the input level to the target level. + // If target_level < 0, automatically calculate the minimum level that could + // hold the data set. + Status ReFitLevel(int level, int target_level = -1); + + // Returns the current SuperVersion number. + uint64_t CurrentVersionNumber() const; + + // Returns a pair of iterators (mutable-only and immutable-only) used + // internally by TailingIterator and stores CurrentVersionNumber() in + // *superversion_number. These iterators are always up-to-date, i.e. can + // be used to read new data. + std::pair GetTailingIteratorPair( + const ReadOptions& options, + uint64_t* superversion_number); + + // Constant after construction + const InternalFilterPolicy internal_filter_policy_; + bool owns_info_log_; + + // table_cache_ provides its own synchronization + unique_ptr table_cache_; + + // Lock over the persistent DB state. Non-nullptr iff successfully acquired. 
+ FileLock* db_lock_; + + // State below is protected by mutex_ + port::Mutex mutex_; + port::AtomicPointer shutting_down_; + port::CondVar bg_cv_; // Signalled when background work finishes + MemTable* mem_; + MemTableList imm_; // Memtable that are not changing + uint64_t logfile_number_; + unique_ptr log_; + + SuperVersion* super_version_; + + // An ordinal representing the current SuperVersion. Updated by + // InstallSuperVersion(), i.e. incremented every time super_version_ + // changes. + std::atomic super_version_number_; + // Thread's local copy of SuperVersion pointer + // This needs to be destructed after mutex_ + ThreadLocalPtr* local_sv_; + + std::string host_name_; + + std::unique_ptr db_directory_; + + // Queue of writers. + std::deque writers_; + WriteBatch tmp_batch_; + + SnapshotList snapshots_; + + // Set of table files to protect from deletion because they are + // part of ongoing compactions. + std::set pending_outputs_; + + // At least one compaction or flush job is pending but not yet scheduled + // because of the max background thread limit. + bool bg_schedule_needed_; + + // count how many background compactions are running or have been scheduled + int bg_compaction_scheduled_; + + // If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual + // compactions (if manual_compaction_ is not null). This mechanism enables + // manual compactions to wait until all other compactions are finished. + int bg_manual_only_; + + // number of background memtable flush jobs, submitted to the HIGH pool + int bg_flush_scheduled_; + + // Has a background stats log thread scheduled? + bool bg_logstats_scheduled_; + + // Information for a manual compaction + struct ManualCompaction { + int input_level; + int output_level; + bool done; + Status status; + bool in_progress; // compaction request being processed? 
+ const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey tmp_storage; // Used to keep track of compaction progress + }; + ManualCompaction* manual_compaction_; + + // Have we encountered a background error in paranoid mode? + Status bg_error_; + + std::unique_ptr logger_; + + int64_t volatile last_log_ts; + + // shall we disable deletion of obsolete files + // if 0 the deletion is enabled. + // if non-zero, files will not be getting deleted + // This enables two different threads to call + // EnableFileDeletions() and DisableFileDeletions() + // without any synchronization + int disable_delete_obsolete_files_; + + // last time when DeleteObsoleteFiles was invoked + uint64_t delete_obsolete_files_last_run_; + + // last time when PurgeObsoleteWALFiles ran. + uint64_t purge_wal_files_last_run_; + + // last time stats were dumped to LOG + std::atomic last_stats_dump_time_microsec_; + + // obsolete files will be deleted every this seconds if ttl deletion is + // enabled and archive size_limit is disabled. + uint64_t default_interval_to_delete_obsolete_WAL_; + + bool flush_on_destroy_; // Used when disableWAL is true. + + InternalStats internal_stats_; + + static const int KEEP_LOG_FILE_NUM = 1000; + std::string db_absolute_path_; + + // count of the number of contiguous delaying writes + int delayed_writes_; + + // The options to access storage files + const EnvOptions storage_options_; + + // A value of true temporarily disables scheduling of background work + bool bg_work_gate_closed_; + + // Guard against multiple concurrent refitting + bool refitting_level_; + + // Indicate DB was opened successfully + bool opened_successfully_; + + // No copying allowed + DBImpl(const DBImpl&); + void operator=(const DBImpl&); + + // dump the delayed_writes_ to the log file and reset counter. + void DelayLoggingAndReset(); + + // Return the earliest snapshot where seqno is visible. 
+ // Store the snapshot right before that, if any, in prev_snapshot + inline SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, + std::vector& snapshots, + SequenceNumber* prev_snapshot); + + // will return a pointer to SuperVersion* if previous SuperVersion + // if its reference count is zero and needs deletion or nullptr if not + // As argument takes a pointer to allocated SuperVersion + // Foreground threads call this function directly (they don't carry + // deletion state and have to handle their own creation and deletion + // of SuperVersion) + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion); + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function above. Background threads carry + // deletion_state which can have new_superversion already allocated. + void InstallSuperVersion(DeletionState& deletion_state); + + void ResetThreadLocalSuperVersions(DeletionState* deletion_state); + + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) + override; + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + Status GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr); +}; + +// Sanitize db options. The caller should delete result.info_log if +// it is not equal to src.info_log. +extern Options SanitizeOptions(const std::string& db, + const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, + const Options& src); + + +// Determine compression type, based on user options, level of the output +// file and whether compression is disabled. +// If enable_compression is false, then compression is always disabled no +// matter what the values of the other two parameters are. +// Otherwise, the compression type is determined based on options and level. 
+CompressionType GetCompressionType(const Options& options, int level, + const bool enable_compression); + +// Determine compression type for L0 file written by memtable flush. +CompressionType GetCompressionFlush(const Options& options); + +} // namespace rocksdb diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc new file mode 100644 index 00000000..1b67d779 --- /dev/null +++ b/db/db_impl_readonly.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "db/db_impl_readonly.h" +#include "db/db_impl.h" + +#include +#include +#include +#include +#include +#include +#include +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/merge_operator.h" +#include "port/port.h" +#include "table/block.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/build_version.h" + +namespace rocksdb { + +DBImplReadOnly::DBImplReadOnly(const Options& options, + const std::string& dbname) + : DBImpl(options, dbname) { + Log(options_.info_log, "Opening the db in read only mode"); +} + +DBImplReadOnly::~DBImplReadOnly() { +} + +// Implementations of the DB interface +Status DBImplReadOnly::Get(const ReadOptions& options, 
+ const Slice& key, + std::string* value) { + Status s; + SequenceNumber snapshot = versions_->LastSequence(); + SuperVersion* super_version = GetSuperVersion(); + MergeContext merge_context; + LookupKey lkey(key, snapshot); + if (super_version->mem->Get(lkey, value, &s, merge_context, options_)) { + } else { + Version::GetStats stats; + super_version->current->Get(options, lkey, value, &s, &merge_context, + &stats); + } + return s; +} + +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) { + SequenceNumber latest_snapshot; + Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); + return NewDBIterator( + &dbname_, env_, options_, user_comparator(),internal_iter, + (options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); +} + + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool error_if_log_file_exist) { + *dbptr = nullptr; + + DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); + impl->mutex_.Lock(); + Status s = impl->Recover(true /* read only */, error_if_log_file_exist); + if (s.ok()) { + delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); + } + impl->mutex_.Unlock(); + if (s.ok()) { + *dbptr = impl; + } else { + delete impl; + } + return s; +} + +} // namespace rocksdb diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h new file mode 100644 index 00000000..57eae0e2 --- /dev/null +++ b/db/db_impl_readonly.h @@ -0,0 +1,79 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#pragma once +#include "db/db_impl.h" + +#include +#include +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/stats_logger.h" + +namespace rocksdb { + +class DBImplReadOnly : public DBImpl { +public: + DBImplReadOnly(const Options& options, const std::string& dbname); + virtual ~DBImplReadOnly(); + + // Implementations of the DB interface + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value); + + // TODO: Implement ReadOnly MultiGet? + + virtual Iterator* NewIterator(const ReadOptions&); + + virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Merge(const WriteOptions&, const Slice& key, + const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Delete(const WriteOptions&, const Slice& key) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& options, WriteBatch* updates) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status DisableFileDeletions() { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status EnableFileDeletions(bool force) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Flush(const FlushOptions& options) { + return 
Status::NotSupported("Not supported operation in read only mode."); + } + +private: + friend class DB; + + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&); + void operator=(const DBImplReadOnly&); +}; + +} diff --git a/db/db_iter.cc b/db/db_iter.cc new file mode 100644 index 00000000..1b1aa539 --- /dev/null +++ b/db/db_iter.cc @@ -0,0 +1,484 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_iter.h" +#include +#include + +#include "db/filename.h" +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +#if 0 +static void DumpInternalIter(Iterator* iter) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey k; + if (!ParseInternalKey(iter->key(), &k)) { + fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); + } else { + fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); + } + } +} +#endif + +namespace { + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter: public Iterator { + public: + // The following is grossly complicated. 
TODO: clean it up + // Which direction is the iterator currently moving? + // (1) When moving forward, the internal iterator is positioned at + // the exact entry that yields this->key(), this->value() + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { + kForward, + kReverse + }; + + DBIter(const std::string* dbname, Env* env, const Options& options, + const Comparator* cmp, Iterator* iter, SequenceNumber s) + : dbname_(dbname), + env_(env), + logger_(options.info_log.get()), + user_comparator_(cmp), + user_merge_operator_(options.merge_operator.get()), + iter_(iter), + sequence_(s), + direction_(kForward), + valid_(false), + current_entry_is_merged_(false), + statistics_(options.statistics.get()) { + RecordTick(statistics_, NO_ITERATORS, 1); + max_skip_ = options.max_sequential_skip_in_iterations; + } + virtual ~DBIter() { + RecordTick(statistics_, NO_ITERATORS, -1); + delete iter_; + } + virtual bool Valid() const { return valid_; } + virtual Slice key() const { + assert(valid_); + return saved_key_.GetKey(); + } + virtual Slice value() const { + assert(valid_); + return (direction_ == kForward && !current_entry_is_merged_) ? 
+ iter_->value() : saved_value_; + } + virtual Status status() const { + if (status_.ok()) { + return iter_->status(); + } else { + return status_; + } + } + + virtual void Next(); + virtual void Prev(); + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + + private: + inline void FindNextUserEntry(bool skipping); + void FindNextUserEntryInternal(bool skipping); + void FindPrevUserEntry(); + bool ParseKey(ParsedInternalKey* key); + void MergeValuesNewToOld(); + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + const std::string* const dbname_; + Env* const env_; + Logger* logger_; + const Comparator* const user_comparator_; + const MergeOperator* const user_merge_operator_; + Iterator* const iter_; + SequenceNumber const sequence_; + + Status status_; + IterKey saved_key_; // == current key when direction_==kReverse + std::string saved_value_; // == current raw value when direction_==kReverse + std::string skip_key_; + Direction direction_; + bool valid_; + bool current_entry_is_merged_; + Statistics* statistics_; + uint64_t max_skip_; + + // No copying allowed + DBIter(const DBIter&); + void operator=(const DBIter&); +}; + +inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_->key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in DBIter"); + Log(logger_, "corrupted internal key in DBIter: %s", + iter_->key().ToString(true).c_str()); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + + if (direction_ == kReverse) { // Switch directions? + direction_ = kForward; + // iter_ is pointing just before the entries for this->key(), + // so advance into the range of entries for this->key() and then + // use the normal skipping code below. 
+ if (!iter_->Valid()) { + iter_->SeekToFirst(); + } else { + iter_->Next(); + } + if (!iter_->Valid()) { + valid_ = false; + saved_key_.Clear(); + return; + } + } + + // If the current value is merged, we might already hit end of iter_ + if (!iter_->Valid()) { + valid_ = false; + return; + } + FindNextUserEntry(true /* skipping the current user key */); +} + + +// PRE: saved_key_ has the current user key if skipping +// POST: saved_key_ should have the next user key if valid_, +// if the current entry is a result of merge +// current_entry_is_merged_ => true +// saved_value_ => the merged value +// +// NOTE: In between, saved_key_ can point to a user key that has +// a delete marker +inline void DBIter::FindNextUserEntry(bool skipping) { + StopWatchNano timer(env_, false); + StartPerfTimer(&timer); + FindNextUserEntryInternal(skipping); + BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); +} + +// Actual implementation of DBIter::FindNextUserEntry() +void DBIter::FindNextUserEntryInternal(bool skipping) { + // Loop until we hit an acceptable entry to yield + assert(iter_->Valid()); + assert(direction_ == kForward); + current_entry_is_merged_ = false; + uint64_t num_skipped = 0; + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { + num_skipped++; // skip this entry + BumpPerfCount(&perf_context.internal_key_skipped_count); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. 
+    // avoid too many key comparisons. We seek to the last occurrence of
+    // our current key by looking for sequence number 0.
+// PRE: iter_->key() points to the first merge type entry +// saved_key_ stores the user key +// POST: saved_value_ has the merged value for the user key +// iter_ points to the next entry (or invalid) +void DBIter::MergeValuesNewToOld() { + if (!user_merge_operator_) { + Log(logger_, "Options::merge_operator is null."); + throw std::logic_error("DBIter::MergeValuesNewToOld() with" + " Options::merge_operator null"); + } + + // Start the merge process by pushing the first operand + std::deque operands; + operands.push_front(iter_->value().ToString()); + + std::string merge_result; // Temporary string to hold merge result later + ParsedInternalKey ikey; + for (iter_->Next(); iter_->Valid(); iter_->Next()) { + if (!ParseKey(&ikey)) { + // skip corrupted key + continue; + } + + if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) { + // hit the next user key, stop right here + break; + } + + if (kTypeDeletion == ikey.type) { + // hit a delete with the same user key, stop right here + // iter_ is positioned after delete + iter_->Next(); + break; + } + + if (kTypeValue == ikey.type) { + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + // ignore corruption if there is any. + const Slice value = iter_->value(); + user_merge_operator_->FullMerge(ikey.user_key, &value, operands, + &saved_value_, logger_); + // iter_ is positioned after put + iter_->Next(); + return; + } + + if (kTypeMerge == ikey.type) { + // hit a merge, add the value as an operand and run associative merge. + // when complete, add result to operands and continue. + const Slice& value = iter_->value(); + operands.push_front(value.ToString()); + } + } + + // we either exhausted all internal keys under this user key, or hit + // a deletion marker. + // feed null as the existing value to the merge operator, such that + // client can differentiate this scenario and do things accordingly. 
+ user_merge_operator_->FullMerge(saved_key_.GetKey(), nullptr, operands, + &saved_value_, logger_); +} + +void DBIter::Prev() { + assert(valid_); + + // Throw an exception now if merge_operator is provided + // TODO: support backward iteration + if (user_merge_operator_) { + Log(logger_, "Prev not supported yet if merge_operator is provided"); + throw std::logic_error("DBIter::Prev backward iteration not supported" + " if merge_operator is provided"); + } + + if (direction_ == kForward) { // Switch directions? + // iter_ is pointing at the current entry. Scan backwards until + // the key changes so we can use the normal reverse scanning code. + assert(iter_->Valid()); // Otherwise valid_ would have been false + saved_key_.SetUserKey(ExtractUserKey(iter_->key())); + while (true) { + iter_->Prev(); + if (!iter_->Valid()) { + valid_ = false; + saved_key_.Clear(); + ClearSavedValue(); + return; + } + if (user_comparator_->Compare(ExtractUserKey(iter_->key()), + saved_key_.GetKey()) < 0) { + break; + } + } + direction_ = kReverse; + } + + FindPrevUserEntry(); +} + +void DBIter::FindPrevUserEntry() { + assert(direction_ == kReverse); + uint64_t num_skipped = 0; + + ValueType value_type = kTypeDeletion; + bool saved_key_valid = true; + if (iter_->Valid()) { + do { + ParsedInternalKey ikey; + if (ParseKey(&ikey) && ikey.sequence <= sequence_) { + if ((value_type != kTypeDeletion) && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) < 0) { + // We encountered a non-deleted value in entries for previous keys, + break; + } + value_type = ikey.type; + if (value_type == kTypeDeletion) { + saved_key_.Clear(); + ClearSavedValue(); + saved_key_valid = false; + } else { + Slice raw_value = iter_->value(); + if (saved_value_.capacity() > raw_value.size() + 1048576) { + std::string empty; + swap(empty, saved_value_); + } + saved_key_.SetUserKey(ExtractUserKey(iter_->key())); + saved_value_.assign(raw_value.data(), raw_value.size()); + } + } else { + // In the case of 
+  // now saved_key_ is used to store the internal key.
+ saved_key_.SetInternalKey(target, sequence_); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); + iter_->Seek(saved_key_.GetKey()); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); + if (iter_->Valid()) { + direction_ = kForward; + ClearSavedValue(); + FindNextUserEntry(false /*not skipping */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToFirst() { + direction_ = kForward; + ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); + iter_->SeekToFirst(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); + if (iter_->Valid()) { + FindNextUserEntry(false /* not skipping */); + } else { + valid_ = false; + } +} + +void DBIter::SeekToLast() { + // Throw an exception for now if merge_operator is provided + // TODO: support backward iteration + if (user_merge_operator_) { + Log(logger_, "SeekToLast not supported yet if merge_operator is provided"); + throw std::logic_error("DBIter::SeekToLast: backward iteration not" + " supported if merge_operator is provided"); + } + + direction_ = kReverse; + ClearSavedValue(); + StopWatchNano internal_seek_timer(env_, false); + StartPerfTimer(&internal_seek_timer); + iter_->SeekToLast(); + BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); + FindPrevUserEntry(); +} + +} // anonymous namespace + +Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Options& options, + const Comparator *user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence) { + return new DBIter(dbname, env, options, user_key_comparator, + internal_iter, sequence); +} + +} // namespace rocksdb diff --git a/db/db_iter.h b/db/db_iter.h new file mode 100644 index 00000000..b44e6745 --- /dev/null +++ b/db/db_iter.h @@ -0,0 +1,28 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/db.h" +#include "db/dbformat.h" + +namespace rocksdb { + +// Return a new iterator that converts internal keys (yielded by +// "*internal_iter") that were live at the specified "sequence" number +// into appropriate user keys. +extern Iterator* NewDBIterator( + const std::string* dbname, + Env* env, + const Options& options, + const Comparator *user_key_comparator, + Iterator* internal_iter, + const SequenceNumber& sequence); + +} // namespace rocksdb diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc new file mode 100644 index 00000000..db86865c --- /dev/null +++ b/db/db_stats_logger.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" +#include +#include +#include +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +void DBImpl::MaybeScheduleLogDBDeployStats() { + + // There is a lock in the actual logger. 
+ if (!logger_ || options_.db_stats_log_interval < 0 + || host_name_.empty()) { + return; + } + + if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) { + // Already scheduled + } else { + int64_t current_ts = 0; + Status st = env_->GetCurrentTime(¤t_ts); + if (!st.ok()) { + return; + } + if ((current_ts - last_log_ts) < options_.db_stats_log_interval) { + return; + } + last_log_ts = current_ts; + bg_logstats_scheduled_ = true; + env_->Schedule(&DBImpl::BGLogDBDeployStats, this); + } +} + +void DBImpl::BGLogDBDeployStats(void* db) { + DBImpl* db_inst = reinterpret_cast(db); + db_inst->LogDBDeployStats(); +} + +void DBImpl::LogDBDeployStats() { + mutex_.Lock(); + + if (shutting_down_.Acquire_Load()) { + bg_logstats_scheduled_ = false; + bg_cv_.SignalAll(); + mutex_.Unlock(); + return; + } + + char tmp_ver[100]; + sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion); + std::string version_info(tmp_ver); + + uint64_t file_total_size = 0; + uint32_t file_total_num = 0; + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); + file_total_size += current->NumLevelBytes(i); + } + + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); + std::string file_num_per_level(file_num_summary); + std::string data_size_per_level(file_num_summary); + + mutex_.Unlock(); + + int64_t unix_ts; + env_->GetCurrentTime(&unix_ts); + + logger_->Log_Deploy_Stats(version_info, host_name_, + db_absolute_path_, file_total_size, file_total_num, file_num_per_level, + data_size_per_level, unix_ts); + + mutex_.Lock(); + bg_logstats_scheduled_ = false; + bg_cv_.SignalAll(); + mutex_.Unlock(); +} + +} diff --git a/db/db_test.cc b/db/db_test.cc new file mode 100644 index 00000000..f6da11c9 --- /dev/null +++ b/db/db_test.cc @@ -0,0 +1,6346 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" +#include "util/hash.h" +#include "util/hash_linklist_rep.h" +#include "utilities/merge_operators.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/statistics.h" +#include "util/testharness.h" +#include "util/sync_point.h" +#include "util/testutil.h" + +namespace rocksdb { + +static bool SnappyCompressionSupported(const CompressionOptions& options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(options, in.data(), in.size(), &out); +} + +static bool ZlibCompressionSupported(const CompressionOptions& options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Zlib_Compress(options, in.data(), in.size(), &out); +} + +static bool BZip2CompressionSupported(const CompressionOptions& options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::BZip2_Compress(options, in.data(), in.size(), 
&out); +} + +static bool LZ4CompressionSupported(const CompressionOptions &options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::LZ4_Compress(options, in.data(), in.size(), &out); +} + +static bool LZ4HCCompressionSupported(const CompressionOptions &options) { + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::LZ4HC_Compress(options, in.data(), in.size(), &out); +} + +static std::string RandomString(Random *rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +namespace anon { +class AtomicCounter { + private: + port::Mutex mu_; + int count_; + public: + AtomicCounter() : count_(0) { } + void Increment() { + MutexLock l(&mu_); + count_++; + } + int Read() { + MutexLock l(&mu_); + return count_; + } + void Reset() { + MutexLock l(&mu_); + count_ = 0; + } +}; + +} + +// Special Env used to delay background operations +class SpecialEnv : public EnvWrapper { + public: + // sstable Sync() calls are blocked while this pointer is non-nullptr. + port::AtomicPointer delay_sstable_sync_; + + // Simulate no-space errors while this pointer is non-nullptr. 
+ port::AtomicPointer no_space_; + + // Simulate non-writable file system while this pointer is non-nullptr + port::AtomicPointer non_writable_; + + // Force sync of manifest files to fail while this pointer is non-nullptr + port::AtomicPointer manifest_sync_error_; + + // Force write to manifest files to fail while this pointer is non-nullptr + port::AtomicPointer manifest_write_error_; + + // Force write to log files to fail while this pointer is non-nullptr + port::AtomicPointer log_write_error_; + + bool count_random_reads_; + anon::AtomicCounter random_read_counter_; + + anon::AtomicCounter sleep_counter_; + + explicit SpecialEnv(Env* base) : EnvWrapper(base) { + delay_sstable_sync_.Release_Store(nullptr); + no_space_.Release_Store(nullptr); + non_writable_.Release_Store(nullptr); + count_random_reads_ = false; + manifest_sync_error_.Release_Store(nullptr); + manifest_write_error_.Release_Store(nullptr); + log_write_error_.Release_Store(nullptr); + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + class SSTableFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + + public: + SSTableFile(SpecialEnv* env, unique_ptr&& base) + : env_(env), + base_(std::move(base)) { + } + Status Append(const Slice& data) { + if (env_->no_space_.Acquire_Load() != nullptr) { + // Drop writes on the floor + return Status::OK(); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { + while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { + env_->SleepForMicroseconds(100000); + } + return base_->Sync(); + } + }; + class ManifestFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + public: + ManifestFile(SpecialEnv* env, unique_ptr&& b) + : env_(env), base_(std::move(b)) { } + Status Append(const Slice& data) { + if (env_->manifest_write_error_.Acquire_Load() != nullptr) { + 
return Status::IOError("simulated writer error"); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { + if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { + return Status::IOError("simulated sync error"); + } else { + return base_->Sync(); + } + } + }; + class LogFile : public WritableFile { + private: + SpecialEnv* env_; + unique_ptr base_; + public: + LogFile(SpecialEnv* env, unique_ptr&& b) + : env_(env), base_(std::move(b)) { } + Status Append(const Slice& data) { + if (env_->log_write_error_.Acquire_Load() != nullptr) { + return Status::IOError("simulated writer error"); + } else { + return base_->Append(data); + } + } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { return base_->Sync(); } + }; + + if (non_writable_.Acquire_Load() != nullptr) { + return Status::IOError("simulated write error"); + } + + Status s = target()->NewWritableFile(f, r, soptions); + if (s.ok()) { + if (strstr(f.c_str(), ".sst") != nullptr) { + r->reset(new SSTableFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "MANIFEST") != nullptr) { + r->reset(new ManifestFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "log") != nullptr) { + r->reset(new LogFile(this, std::move(*r))); + } + } + return s; + } + + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public RandomAccessFile { + private: + unique_ptr target_; + anon::AtomicCounter* counter_; + public: + CountingFile(unique_ptr&& target, + anon::AtomicCounter* counter) + : target_(std::move(target)), counter_(counter) { + } + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + counter_->Increment(); + return target_->Read(offset, n, result, scratch); + } + }; + + Status s = target()->NewRandomAccessFile(f, r, soptions); + if (s.ok() && 
count_random_reads_) { + r->reset(new CountingFile(std::move(*r), &random_read_counter_)); + } + return s; + } + + virtual void SleepForMicroseconds(int micros) { + sleep_counter_.Increment(); + target()->SleepForMicroseconds(micros); + } +}; + +class DBTest { + private: + const FilterPolicy* filter_policy_; + + protected: + // Sequence of option configurations to try + enum OptionConfig { + kDefault, + kBlockBasedTableWithPrefixHashIndex, + kBlockBasedTableWithWholeKeyHashIndex, + kPlainTableFirstBytePrefix, + kPlainTableAllBytesPrefix, + kVectorRep, + kHashLinkList, + kMergePut, + kFilter, + kUncompressed, + kNumLevel_3, + kDBLogDir, + kWalDir, + kManifestFileSize, + kCompactOnFlush, + kPerfOptions, + kDeletesFilterFirst, + kHashSkipList, + kUniversalCompaction, + kCompressedBlockCache, + kInfiniteMaxOpenFiles, + kEnd + }; + int option_config_; + + public: + std::string dbname_; + SpecialEnv* env_; + DB* db_; + + Options last_options_; + + // Skip some options, as they may not be applicable to a specific test. + // To add more skip constants, use values 4, 8, 16, etc. + enum OptionSkip { + kNoSkip = 0, + kSkipDeletesFilterFirst = 1, + kSkipUniversalCompaction = 2, + kSkipMergePut = 4, + kSkipPlainTable = 8, + kSkipHashIndex = 16 + }; + + DBTest() : option_config_(kDefault), + env_(new SpecialEnv(Env::Default())) { + last_options_.max_background_flushes = 0; + filter_policy_ = NewBloomFilterPolicy(10); + dbname_ = test::TmpDir() + "/db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~DBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + delete env_; + delete filter_policy_; + } + + // Switch to a fresh database with the next option configuration to + // test. Return false if there are no more configurations to test. 
+ bool ChangeOptions(int skip_mask = kNoSkip) { + // skip some options + for(option_config_++; option_config_ < kEnd; option_config_++) { + if ((skip_mask & kSkipDeletesFilterFirst) && + option_config_ == kDeletesFilterFirst) { + continue; + } + if ((skip_mask & kSkipUniversalCompaction) && + option_config_ == kUniversalCompaction) { + continue; + } + if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) { + continue; + } + if ((skip_mask & kSkipPlainTable) + && (option_config_ == kPlainTableAllBytesPrefix + || option_config_ == kPlainTableFirstBytePrefix)) { + continue; + } + if ((skip_mask & kSkipPlainTable) && + (option_config_ == kBlockBasedTableWithPrefixHashIndex || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) { + continue; + } + + break; + } + + if (option_config_ >= kEnd) { + Destroy(&last_options_); + return false; + } else { + DestroyAndReopen(); + return true; + } + } + + // Switch between different compaction styles (we have only 2 now). + bool ChangeCompactOptions(Options* prev_options = nullptr) { + if (option_config_ == kDefault) { + option_config_ = kUniversalCompaction; + if (prev_options == nullptr) { + prev_options = &last_options_; + } + Destroy(prev_options); + TryReopen(); + return true; + } else { + return false; + } + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.paranoid_checks = false; + options.max_background_flushes = 0; + switch (option_config_) { + case kHashSkipList: + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.memtable_factory.reset(NewHashSkipListRepFactory()); + break; + case kPlainTableFirstBytePrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; + case kPlainTableAllBytesPrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewNoopTransform()); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + break; + case kMergePut: + options.merge_operator = MergeOperators::CreatePutOperator(); + break; + case kFilter: + options.filter_policy = filter_policy_; + break; + case kUncompressed: + options.compression = kNoCompression; + break; + case kNumLevel_3: + options.num_levels = 3; + break; + case kDBLogDir: + options.db_log_dir = test::TmpDir(); + break; + case kWalDir: + options.wal_dir = "/tmp/wal"; + break; + case kManifestFileSize: + options.max_manifest_file_size = 50; // 50 bytes + case kCompactOnFlush: + options.purge_redundant_kvs_while_flush = + !options.purge_redundant_kvs_while_flush; + break; + case kPerfOptions: + options.hard_rate_limit = 2.0; + options.rate_limit_delay_max_milliseconds = 2; + // TODO -- test more options + break; + case kDeletesFilterFirst: + options.filter_deletes = true; + break; + case kVectorRep: + options.memtable_factory.reset(new VectorRepFactory(100)); + break; + case kHashLinkList: + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.memtable_factory.reset(NewHashLinkListRepFactory(4)); + break; + case kUniversalCompaction: + options.compaction_style = kCompactionStyleUniversal; + break; + case kCompressedBlockCache: + 
options.allow_mmap_writes = true; + options.block_cache_compressed = NewLRUCache(8*1024*1024); + break; + case kInfiniteMaxOpenFiles: + options.max_open_files = -1; + break; + case kBlockBasedTableWithPrefixHashIndex: { + BlockBasedTableOptions table_options; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + break; + } + case kBlockBasedTableWithWholeKeyHashIndex: { + BlockBasedTableOptions table_options; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewNoopTransform()); + break; + } + default: + break; + } + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status ReadOnlyReopen(Options* options) { + return DB::OpenForReadOnly(*options, dbname_, &db_); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { + if (kMergePut == option_config_ ) { + return db_->Merge(wo, k, v); + } else { + return db_->Put(wo, k, v); + } + } + + Status Delete(const 
std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + // Return a string that contains all key,value pairs in order, + // formatted like "(k1->v1)(k2->v2)". + std::string Contents() { + std::vector forward; + std::string result; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string s = IterStatus(iter); + result.push_back('('); + result.append(s); + result.push_back(')'); + forward.push_back(s); + } + + // Check reverse iteration results are the reverse of forward results + unsigned int matched = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_LT(matched, forward.size()); + ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); + matched++; + } + ASSERT_EQ(matched, forward.size()); + + delete iter; + return result; + } + + std::string AllEntriesFor(const Slice& user_key) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + iter->Seek(target.Encode()); + std::string result; + if (!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeMerge: + // keep it the same as kTypeValue for 
testing kMergePut + result += iter->value().ToString(); + break; + case kTypeDeletion: + result += "DEL"; + break; + default: + assert(false); + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + delete iter; + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + int TotalTableFiles() { + int result = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + result += NumTableFilesAtLevel(level); + } + return result; + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + int CountFiles() { + std::vector files; + env_->GetChildren(dbname_, &files); + + std::vector logfiles; + if (dbname_ != last_options_.wal_dir) { + env_->GetChildren(last_options_.wal_dir, &logfiles); + } + + return static_cast(files.size() + logfiles.size()); + } + + int CountLiveFiles() { + std::vector files; + uint64_t manifest_file_size; + db_->GetLiveFiles(files, &manifest_file_size); + return files.size(); + } + + uint64_t Size(const Slice& start, const Slice& limit) { + Range r(start, limit); + uint64_t size; + db_->GetApproximateSizes(&r, 1, &size); + return size; + } + + void Compact(const Slice& start, const Slice& limit) { + db_->CompactRange(&start, &limit); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. 
+ void MakeTables(int n, const std::string& small, const std::string& large) { + for (int i = 0; i < n; i++) { + Put(small, "begin"); + Put(large, "end"); + dbfull()->TEST_FlushMemTable(); + } + } + + // Prevent pushing of new sstables into deeper levels by adding + // tables that cover a specified range to all levels. + void FillLevels(const std::string& smallest, const std::string& largest) { + MakeTables(db_->NumberLevels(), smallest, largest); + } + + void DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %lld\n", + static_cast( + dbfull()->TEST_MaxNextLevelOverlappingBytes())); + for (int level = 0; level < db_->NumberLevels(); level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } + } + + std::string DumpSSTableList() { + std::string property; + db_->GetProperty("rocksdb.sstables", &property); + return property; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } + + Options OptionsForLogIterTest() { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 1000; + return options; + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + unique_ptr iter; + Status status = dbfull()->GetUpdatesSince(seq, &iter); + ASSERT_OK(status); + ASSERT_TRUE(iter->Valid()); + return std::move(iter); + } + + std::string DummyString(size_t len, char c = 'a') { + return std::string(len, c); + } + + void VerifyIterLast(std::string expected_key) { + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), expected_key); + delete iter; + } + + // Used to test InplaceUpdate + + // If previous value is nullptr or delta is > than previous value, + // sets newValue with 
delta + // If previous value is not empty, + // updates previous value with 'b' string of previous value size - 1. + static UpdateStatus + updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = *prevSize - 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } + } + + static UpdateStatus + updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + if (prevValue == nullptr) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } else { + *prevSize = 1; + std::string str_b = std::string(*prevSize, 'b'); + memcpy(prevValue, str_b.c_str(), str_b.size()); + return UpdateStatus::UPDATED_INPLACE; + } + } + + static UpdateStatus + updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + *newValue = std::string(delta.size(), 'c'); + return UpdateStatus::UPDATED; + } + + static UpdateStatus + updateInPlaceNoAction(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue) { + return UpdateStatus::UPDATE_FAILED; + } + + // Utility method to test InplaceUpdate + void validateNumberOfEntries(int numValues) { + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + int seq = numValues; + while (iter->Valid()) { + ParsedInternalKey ikey; + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + + // checks sequence number for updates + ASSERT_EQ(ikey.sequence, (unsigned)seq--); + iter->Next(); + } + delete iter; + ASSERT_EQ(0, seq); + } + + void CopyFile(const std::string& source, const std::string& destination, + uint64_t size = 0) { + const EnvOptions soptions; + unique_ptr srcfile; + 
ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions)); + unique_ptr destfile; + ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions)); + + if (size == 0) { + // default argument means copy everything + ASSERT_OK(env_->GetFileSize(source, &size)); + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + uint64_t one = std::min(uint64_t(sizeof(buffer)), size); + ASSERT_OK(srcfile->Read(one, &slice, buffer)); + ASSERT_OK(destfile->Append(slice)); + size -= slice.size(); + } + ASSERT_OK(destfile->Close()); + } + +}; + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key%06d", i); + return std::string(buf); +} + +static long TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} + +// A helper function that ensures the table properties returned in +// `GetPropertiesOfAllTablesTest` is correct. +// This test assumes entries size is differnt for each of the tables. 
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { + TablePropertiesCollection props; + ASSERT_OK(db->GetPropertiesOfAllTables(&props)); + + assert(props.size() == 4); + ASSERT_EQ(4U, props.size()); + std::unordered_set unique_entries; + + // Indirect test + uint64_t sum = 0; + for (const auto& item : props) { + unique_entries.insert(item.second->num_entries); + sum += item.second->num_entries; + } + + ASSERT_EQ(props.size(), unique_entries.size()); + ASSERT_EQ(expected_entries_size, sum); +} + +std::unordered_map GetMemoryUsage(MemTable* memtable) { + const auto& arena = memtable->TEST_GetArena(); + return {{"memtable.approximate.usage", memtable->ApproximateMemoryUsage()}, + {"arena.approximate.usage", arena.ApproximateMemoryUsage()}, + {"arena.allocated.memory", arena.MemoryAllocatedBytes()}, + {"arena.unused.bytes", arena.AllocatedAndUnused()}, + {"irregular.blocks", arena.IrregularBlockNum()}}; +} + +void PrintMemoryUsage(const std::unordered_map& usage) { + for (const auto& item : usage) { + std::cout << "\t" << item.first << ": " << item.second << std::endl; + } +} + +void AddRandomKV(MemTable* memtable, Random* rnd, size_t arena_block_size) { + memtable->Add(0, kTypeValue, RandomString(rnd, 20) /* key */, + // make sure we will be able to generate some over sized entries + RandomString(rnd, rnd->Uniform(arena_block_size / 4) * 1.15 + + 10) /* value */); +} + +TEST(DBTest, Empty) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + } while (ChangeOptions()); +} + +TEST(DBTest, ReadOnlyDB) { + 
ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + Close(); + + Options options; + ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + Iterator* iter = db_->NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } + ASSERT_EQ(count, 2); + delete iter; +} + +// Make sure that when options.block_cache is set, after a new table is +// created its index/filter blocks are added to block cache. +TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { + Options options = CurrentOptions(); + std::unique_ptr filter_policy(NewBloomFilterPolicy(20)); + options.filter_policy = filter_policy.get(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + DestroyAndReopen(&options); + + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + // Create a new talbe. + ASSERT_OK(dbfull()->Flush(FlushOptions())); + + // index/filter blocks added to block cache right after table creation. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, /* only index/filter were added */ + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + + // Make sure filter block is in cache. + std::string value; + ReadOptions ropt; + db_->KeyMayExist(ReadOptions(), "key", &value); + + // Miss count should remain the same. 
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + db_->KeyMayExist(ReadOptions(), "key", &value); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Make sure index block is in cache. + auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + value = Get("key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(index_block_hit + 1, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + value = Get("key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(index_block_hit + 2, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); +} + +TEST(DBTest, GetPropertiesOfAllTablesTest) { + Options options = CurrentOptions(); + Reopen(&options); + // Create 4 tables + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"); + } + db_->Flush(FlushOptions()); + } + + // 1. Read table properties directly from file + Reopen(&options); + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 2. Put two tables to table cache and + Reopen(&options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. + for (int i = 0; i < 2; ++i) { + Get(std::to_string(i * 100 + 0)); + } + + VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 3. Put all tables to table cache + Reopen(&options); + // fetch key from 1st and 2nd table, which will internally place that table to + // the table cache. 
+ for (int i = 0; i < 4; ++i) { + Get(std::to_string(i * 100 + 0)); + } + VerifyTableProperties(db_, 10 + 11 + 12 + 13); +} + +TEST(DBTest, LevelLimitReopen) { + Options options = CurrentOptions(); + Reopen(&options); + + const std::string value(1024 * 1024, ' '); + int i = 0; + while (NumTableFilesAtLevel(2) == 0) { + ASSERT_OK(Put(Key(i++), value)); + } + + options.num_levels = 1; + options.max_bytes_for_level_multiplier_additional.resize(1, 1); + Status s = TryReopen(&options); + ASSERT_EQ(s.IsInvalidArgument(), true); + ASSERT_EQ(s.ToString(), + "Invalid argument: db has more levels than options.num_levels"); + + options.num_levels = 10; + options.max_bytes_for_level_multiplier_additional.resize(10, 1); + ASSERT_OK(TryReopen(&options)); +} + +TEST(DBTest, Preallocation) { + const std::string src = dbname_ + "/alloc_test"; + unique_ptr srcfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. 
+ buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + +TEST(DBTest, PutDeleteGet) { + do { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(db_->Delete(WriteOptions(), "foo")); + ASSERT_EQ("NOT_FOUND", Get("foo")); + } while (ChangeOptions()); +} + + +TEST(DBTest, GetFromImmutableLayer) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put("k1", std::string(100000, 'x')); // Fill memtable + Put("k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + } while (ChangeOptions()); +} + +TEST(DBTest, GetFromVersions) { + do { + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetSnapshot) { + do { + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get(key)); + ASSERT_EQ("v1", Get(key, s1)); + db_->ReleaseSnapshot(s1); + } + } while (ChangeOptions()); +} + +TEST(DBTest, GetLevel0Ordering) { + do { + // Check that we process level-0 files in correct order. 
The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. + ASSERT_OK(Put("bar", "b")); + ASSERT_OK(Put("foo", "v1")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetOrderedByLevels) { + do { + ASSERT_OK(Put("foo", "v1")); + Compact("a", "z"); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_EQ("v2", Get("foo")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetPicksCorrectFile) { + do { + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put("a", "va")); + Compact("a", "b"); + ASSERT_OK(Put("x", "vx")); + Compact("x", "y"); + ASSERT_OK(Put("f", "vf")); + Compact("f", "g"); + ASSERT_EQ("va", Get("a")); + ASSERT_EQ("vf", Get("f")); + ASSERT_EQ("vx", Get("x")); + } while (ChangeOptions()); +} + +TEST(DBTest, GetEncountersEmptyLevel) { + do { + // Arrange for the following to happen: + // * sstable A in level 0 + // * nothing in level 1 + // * sstable B in level 2 + // Then do enough Get() calls to arrange for an automatic compaction + // of sstable A. A bug would cause the compaction to be marked as + // occuring at level 1 (instead of the correct level 0). + + // Step 1: First place sstables in levels 0 and 2 + int compaction_count = 0; + while (NumTableFilesAtLevel(0) == 0 || + NumTableFilesAtLevel(2) == 0) { + ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2"; + compaction_count++; + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + } + + // Step 2: clear level 1 if necessary. 
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + // Step 3: read a bunch of times + for (int i = 0; i < 1000; i++) { + ASSERT_EQ("NOT_FOUND", Get("missing")); + } + + // Step 4: Wait for compaction to finish + env_->SleepForMicroseconds(1000000); + + ASSERT_EQ(NumTableFilesAtLevel(0), 1); // XXX + } while (ChangeOptions(kSkipUniversalCompaction)); +} + +// KeyMayExist can lead to a few false positives, but not false negatives. +// To make test deterministic, use a much larger number of bits per key-20 than +// bits in the key, so that false positives are eliminated +TEST(DBTest, KeyMayExist) { + do { + ReadOptions ropts; + std::string value; + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + options.statistics = rocksdb::CreateDBStatistics(); + Reopen(&options); + + ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + bool value_found = false; + ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); + ASSERT_TRUE(value_found); + ASSERT_EQ("b", value); + + dbfull()->Flush(FlushOptions()); + value.clear(); + + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); + ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. 
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(db_->Delete(WriteOptions(), "a")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + dbfull()->Flush(FlushOptions()); + dbfull()->CompactRange(nullptr, nullptr); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(db_->Delete(WriteOptions(), "c")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + delete options.filter_policy; + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. + } while (ChangeOptions(kSkipPlainTable | kSkipHashIndex)); +} + +TEST(DBTest, NonBlockingIteration) { + do { + ReadOptions non_blocking_opts, regular_opts; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + non_blocking_opts.read_tier = kBlockCacheTier; + Reopen(&options); + // write one kv to the database. + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + + // scan using non-blocking iterator. We should find it because + // it is in memtable. 
+ Iterator* iter = db_->NewIterator(non_blocking_opts); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + + // flush memtable to storage. Now, the key should not be in the + // memtable neither in the block cache. + dbfull()->Flush(FlushOptions()); + + // verify that a non-blocking iterator does not find any + // kvs. Neither does it do any IOs to storage. + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get("a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + iter = db_->NewIterator(non_blocking_opts); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + delete iter; + + // This test verifies block cache behaviors, which is not used by plain + // table format. 
+ } while (ChangeOptions(kSkipPlainTable)); +} + +// A delete is skipped for key if KeyMayExist(key) returns False +// Tests Writebatch consistency and proper delete behaviour +TEST(DBTest, FilterDeletes) { + do { + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + options.filter_deletes = true; + Reopen(&options); + WriteBatch batch; + + batch.Delete("a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("a"), "[ ]"); // Delete skipped + batch.Clear(); + + batch.Put("a", "b"); + batch.Delete("a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get("a"), "NOT_FOUND"); + ASSERT_EQ(AllEntriesFor("a"), "[ DEL, b ]"); // Delete issued + batch.Clear(); + + batch.Delete("c"); + batch.Put("c", "d"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get("c"), "d"); + ASSERT_EQ(AllEntriesFor("c"), "[ d ]"); // Delete skipped + batch.Clear(); + + dbfull()->Flush(FlushOptions()); // A stray Flush + + batch.Delete("c"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("c"), "[ DEL, d ]"); // Delete issued + batch.Clear(); + + delete options.filter_policy; + } while (ChangeCompactOptions()); +} + + +TEST(DBTest, IterSeekBeforePrev) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("2", "j")); + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + iter->Prev(); + iter->Seek(Slice("a")); + iter->Prev(); + delete iter; +} + +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} + +TEST(DBTest, IterLongKeys) { + ASSERT_OK(Put(MakeLongKey(20, 0), "0")); + ASSERT_OK(Put(MakeLongKey(32, 2), "2")); + ASSERT_OK(Put("a", "b")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put(MakeLongKey(50, 1), "1")); + ASSERT_OK(Put(MakeLongKey(127, 3), "3")); + ASSERT_OK(Put(MakeLongKey(64, 4), "4")); + auto 
iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + iter->Seek(MakeLongKey(20, 0)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); + delete iter; + + iter = db_->NewIterator(ReadOptions()); + iter->Seek(MakeLongKey(50, 1)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + delete iter; +} + + +TEST(DBTest, IterNextWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("a")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + delete iter; +} + +TEST(DBTest, IterPrevWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; 
+} + +TEST(DBTest, IterPrevWithNewerSeq2) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = db_->NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + ASSERT_EQ(IterStatus(iter), "c->d"); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + + iter->Prev(); + delete iter; +} + +TEST(DBTest, IterEmpty) { + do { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterSingle) { + do { + ASSERT_OK(Put("a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterMulti) { + do { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + 
ASSERT_OK(Put("c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put("a", "va2")); + ASSERT_OK(Put("a2", "va3")); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Put("c", "vc2")); + ASSERT_OK(Delete("b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + 
iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +// Check that we can skip over a run of user keys +// by using reseek rather than sequential scan +TEST(DBTest, IterReseek) { + Options options = CurrentOptions(); + options.max_sequential_skip_in_iterations = 3; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(&options); + + // insert two keys with same userkey and verify that + // reseek is not invoked. For each of these test cases, + // verify that we can find the next key "b". + ASSERT_OK(Put("a", "one")); + ASSERT_OK(Put("a", "two")); + ASSERT_OK(Put("b", "bone")); + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->two"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of three keys with same userkey and verify + // that reseek is still not invoked. + ASSERT_OK(Put("a", "three")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->three"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of four keys with same userkey and verify + // that reseek is invoked. 
+ ASSERT_OK(Put("a", "four")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->four"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // Testing reverse iterator + // At this point, we have three versions of "a" and one version of "b". + // The reseek statistics is already at 1. + int num_reseeks = + (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); + + // Insert another version of b and assert that reseek is not invoked + ASSERT_OK(Put("b", "btwo")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->btwo"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; + + // insert two more versions of b. This makes a total of 4 versions + // of b and 4 versions of a. 
+ ASSERT_OK(Put("b", "bthree")); + ASSERT_OK(Put("b", "bfour")); + iter = db_->NewIterator(ReadOptions()); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->bfour"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); + iter->Prev(); + + // the previous Prev call should have invoked reseek + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; +} + +TEST(DBTest, IterSmallAndLargeMix) { + do { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", std::string(100000, 'b'))); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Put("d", std::string(100000, 'd'))); + ASSERT_OK(Put("e", std::string(100000, 'e'))); + + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IterMultiWithDelete) { + do { + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Delete("b")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->Seek("c"); + ASSERT_EQ(IterStatus(iter), 
"c->vc"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + } + delete iter; + } while (ChangeOptions()); +} + +TEST(DBTest, IterPrevMaxSkip) { + do { + for (int i = 0; i < 2; i++) { + db_->Put(WriteOptions(), "key1", "v1"); + db_->Put(WriteOptions(), "key2", "v2"); + db_->Put(WriteOptions(), "key3", "v3"); + db_->Put(WriteOptions(), "key4", "v4"); + db_->Put(WriteOptions(), "key5", "v5"); + } + + VerifyIterLast("key5->v5"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key5")); + VerifyIterLast("key4->v4"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key4")); + VerifyIterLast("key3->v3"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key3")); + VerifyIterLast("key2->v2"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key2")); + VerifyIterLast("key1->v1"); + + ASSERT_OK(db_->Delete(WriteOptions(), "key1")); + VerifyIterLast("(invalid)"); + } while (ChangeOptions(kSkipMergePut)); +} + +TEST(DBTest, IterWithSnapshot) { + do { + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Put("key3", "val3")); + ASSERT_OK(Put("key4", "val4")); + ASSERT_OK(Put("key5", "val5")); + + const Snapshot *snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = db_->NewIterator(options); + + // Put more values after the snapshot + ASSERT_OK(Put("key100", "val100")); + ASSERT_OK(Put("key101", "val101")); + + iter->Seek("key5"); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + db_->ReleaseSnapshot(snapshot); + delete 
iter; + } while (ChangeOptions()); +} + +TEST(DBTest, Recover) { + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v5", Get("baz")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + + Reopen(); + ASSERT_EQ("v3", Get("foo")); + ASSERT_OK(Put("foo", "v4")); + ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v5", Get("baz")); + } while (ChangeOptions()); +} + +TEST(DBTest, RecoverWithTableHandle) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.write_buffer_size = 100; + options.disable_auto_compactions = true; + DestroyAndReopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Put("bar", "v4")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("big", std::string(100, 'a'))); + Reopen(); + + std::vector<std::vector<FileMetaData>> files; + dbfull()->TEST_GetFilesMetaData(&files); + int total_files = 0; + for (const auto& level : files) { + total_files += level.size(); + } + ASSERT_EQ(total_files, 3); + for (const auto& level : files) { + for (const auto& file : level) { + if (kInfiniteMaxOpenFiles == option_config_) { + ASSERT_TRUE(file.table_reader_handle != nullptr); + } else { + ASSERT_TRUE(file.table_reader_handle == nullptr); + } + } + } + } while (ChangeOptions()); +} + +TEST(DBTest, IgnoreRecoveredLog) { + std::string backup_logs = dbname_ + "/backup_logs"; + + // delete old files in backup_logs directory + env_->CreateDirIfMissing(backup_logs); + std::vector<std::string> old_files; + env_->GetChildren(backup_logs, &old_files); + for (auto& file : old_files) { + if (file != "."
&& file != "..") { + env_->DeleteFile(backup_logs + "/" + file); + } + } + + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.wal_dir = dbname_ + "/logs"; + DestroyAndReopen(&options); + + // fill up the DB + std::string one, two; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one))); + + // copy the logs to backup + std::vector<std::string> logs; + env_->GetChildren(options.wal_dir, &logs); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); + } + } + + // recover the DB + Reopen(&options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + + // copy the logs from backup back to wal dir + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // this should ignore the log files, recovery should not happen again + // if the recovery happens, the same merge operator would be called twice, + // leading to incorrect results + Reopen(&options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + Destroy(&options); + + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".."
&& log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + env_->DeleteFile(backup_logs + "/" + log); + } + } + // assert that we successfully recovered only from logs, even though we + // destroyed the DB + Reopen(&options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + } while (ChangeOptions()); +} + +TEST(DBTest, RollLog) { + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("baz", "v5")); + + Reopen(); + for (int i = 0; i < 10; i++) { + Reopen(); + } + ASSERT_OK(Put("foo", "v4")); + for (int i = 0; i < 10; i++) { + Reopen(); + } + } while (ChangeOptions()); +} + +TEST(DBTest, WAL) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v1", Get("bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2")); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2")); + + Reopen(); + // Both value's should be present. + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v2", Get("foo")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3")); + + Reopen(); + // again both values should be present. 
+ ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v3", Get("bar")); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CheckLock) { + do { + DB* localdb; + Options options = CurrentOptions(); + ASSERT_OK(TryReopen(&options)); + + // second open should fail + ASSERT_TRUE(!(PureReopen(&options, &localdb)).ok()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, FlushMultipleMemtable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + Reopen(&options); + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v1", Get("bar")); + dbfull()->Flush(FlushOptions()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, NumImmutableMemTable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.write_buffer_size = 1000000; + Reopen(&options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + SetPerfLevel(kEnableTime);; + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + 
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cur-size-active-mem-table", + &num)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get("k2"); + ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k3"); + ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); + perf_context.Reset(); + Get("k1"); + ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); + + dbfull()->Flush(FlushOptions()); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cur-size-active-mem-table", + &num)); + // "208" is the size of the metadata of an empty skiplist, this would + // break if we change the default skiplist implementation + ASSERT_EQ(num, "208"); + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +class SleepingBackgroundTask { + public: + SleepingBackgroundTask() : bg_cv_(&mutex_), should_sleep_(true) {} + void DoSleep() { + MutexLock l(&mutex_); + while (should_sleep_) { + bg_cv_.Wait(); + } + } + void WakeUp() { + MutexLock l(&mutex_); + should_sleep_ = false; + bg_cv_.SignalAll(); + } + + static void DoSleepTask(void* arg) { + reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep(); + } + + private: + port::Mutex mutex_; + port::CondVar bg_cv_; // Signalled when background work finishes + bool should_sleep_; +}; + +TEST(DBTest, GetProperty) { + // Set sizes to both background thread pool to be 1 and block them.
+ env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high, + Env::Priority::HIGH); + + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = 1; + options.compaction_options_universal.size_ratio = 50; + options.max_background_compactions = 1; + options.max_background_flushes = 1; + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 1; + options.write_buffer_size = 1000000; + Reopen(&options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + SetPerfLevel(kEnableTime); + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + perf_context.Reset(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + + sleeping_task_high.WakeUp(); + dbfull()->TEST_WaitForFlushMemTable(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); + 
ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value)); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "1"); + sleeping_task_low.WakeUp(); +} + +TEST(DBTest, FLUSH) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime);; + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + // this will now also flush the last 2 writes + dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + + perf_context.Reset(); + Get("foo"); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + + Reopen(); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v1", Get("bar")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2")); + dbfull()->Flush(FlushOptions()); + + Reopen(); + ASSERT_EQ("v2", Get("bar")); + perf_context.Reset(); + ASSERT_EQ("v2", Get("foo")); + ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3")); + dbfull()->Flush(FlushOptions()); + + Reopen(); + // 'foo' should be there because its put + // has WAL enabled. + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v3", Get("bar")); + + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, RecoveryWithEmptyLog) { + do { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + Reopen(); + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); + } while (ChangeOptions()); +} + +// Check that writes done during a memtable compaction are recovered +// if the database is shutdown during the memtable compaction. 
+TEST(DBTest, RecoverDuringMemtableCompaction) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 1000000; + Reopen(&options); + + // Trigger a long memtable compaction and reopen the database during it + ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file + ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable + ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction + ASSERT_OK(Put("bar", "v2")); // Goes to new log file + + Reopen(&options); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); + ASSERT_EQ(std::string(1000, 'y'), Get("big2")); + } while (ChangeOptions()); +} + +TEST(DBTest, MinorCompactionsHappen) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 10000; + Reopen(&options); + + const int N = 500; + + int starting_num_tables = TotalTableFiles(); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = TotalTableFiles(); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + + Reopen(); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, ManifestRollOver) { + do { + Options options = CurrentOptions(); + options.max_manifest_file_size = 10 ; // 10 bytes + Reopen(&options); + { + ASSERT_OK(Put("manifest_key1", std::string(1000, '1'))); + ASSERT_OK(Put("manifest_key2", std::string(1000, '2'))); + ASSERT_OK(Put("manifest_key3", std::string(1000, '3'))); + uint64_t manifest_before_flush = + dbfull()->TEST_Current_Manifest_FileNo(); + dbfull()->Flush(FlushOptions()); // This should trigger LogAndApply. 
+ uint64_t manifest_after_flush = + dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_GT(manifest_after_flush, manifest_before_flush); + Reopen(&options); + ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), + manifest_after_flush); + // check if a new manifest file got inserted or not. + ASSERT_EQ(std::string(1000, '1'), Get("manifest_key1")); + ASSERT_EQ(std::string(1000, '2'), Get("manifest_key2")); + ASSERT_EQ(std::string(1000, '3'), Get("manifest_key3")); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, IdentityAcrossRestarts) { + do { + std::string id1; + ASSERT_OK(db_->GetDbIdentity(id1)); + + Options options = CurrentOptions(); + Reopen(&options); + std::string id2; + ASSERT_OK(db_->GetDbIdentity(id2)); + // id1 should match id2 because identity was not regenerated + ASSERT_EQ(id1.compare(id2), 0); + + std::string idfilename = IdentityFileName(dbname_); + ASSERT_OK(env_->DeleteFile(idfilename)); + Reopen(&options); + std::string id3; + ASSERT_OK(db_->GetDbIdentity(id3)); + // id1 should NOT match id3 because identity was regenerated + ASSERT_NE(id1.compare(id3), 0); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, RecoverWithLargeLog) { + do { + { + Options options = CurrentOptions(); + Reopen(&options); + ASSERT_OK(Put("big1", std::string(200000, '1'))); + ASSERT_OK(Put("big2", std::string(200000, '2'))); + ASSERT_OK(Put("small3", std::string(10, '3'))); + ASSERT_OK(Put("small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + } + + // Make sure that if we re-open with a small write buffer size that + // we flush table files in the middle of a large log file. 
+ Options options = CurrentOptions(); + options.write_buffer_size = 100000; + Reopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + ASSERT_EQ(std::string(200000, '1'), Get("big1")); + ASSERT_EQ(std::string(200000, '2'), Get("big2")); + ASSERT_EQ(std::string(10, '3'), Get("small3")); + ASSERT_EQ(std::string(10, '4'), Get("small4")); + ASSERT_GT(NumTableFilesAtLevel(0), 1); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CompactionsGenerateMultipleFiles) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector<std::string> values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to level-0 + Reopen(&options); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST(DBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; + num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector<std::string> values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector<std::string> values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + 
dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +// This is a static filter used for filtering +// kvs during the compaction process. +static int cfilter_count; +static std::string NEW_VALUE = "NewValue"; + +class KeepFilter : public CompactionFilter { + public: + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, bool* value_changed) const + override { + cfilter_count++; + return false; + } + + virtual const char* Name() const override { return "KeepFilter"; } +}; + +class DeleteFilter : public CompactionFilter { + public: + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, bool* value_changed) const + override { + cfilter_count++; + return true; + } + + virtual const char* Name() const override { return "DeleteFilter"; } +}; + +class ChangeFilter : public CompactionFilter { + public: + explicit ChangeFilter() {} + + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, bool* value_changed) const + override { + assert(new_value != nullptr); + *new_value = NEW_VALUE; + *value_changed = true; + return false; + } + + virtual const char* Name() const override { return "ChangeFilter"; } +}; + +class KeepFilterFactory : public CompactionFilterFactory { + public: + explicit KeepFilterFactory(bool check_context = false) + : check_context_(check_context) {} + + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (check_context_) { + ASSERT_EQ(expect_full_compaction_.load(), context.is_full_compaction); + ASSERT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + } + return std::unique_ptr<CompactionFilter>(new KeepFilter()); + } + + virtual const char* Name() const override { return "KeepFilterFactory"; } + bool check_context_; + std::atomic_bool expect_full_compaction_; + std::atomic_bool expect_manual_compaction_; 
+}; + +class DeleteFilterFactory : public CompactionFilterFactory { + public: + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (context.is_manual_compaction) { + return std::unique_ptr<CompactionFilter>(new DeleteFilter()); + } else { + return std::unique_ptr<CompactionFilter>(nullptr); + } + } + + virtual const char* Name() const override { return "DeleteFilterFactory"; } +}; + +class ChangeFilterFactory : public CompactionFilterFactory { + public: + explicit ChangeFilterFactory() {} + + virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr<CompactionFilter>(new ChangeFilter()); + } + + virtual const char* Name() const override { return "ChangeFilterFactory"; } +}; + +// TODO(kailiu) The tests on UniversalCompaction has some issues: +// 1. A lot of magic numbers ("11" or "12"). +// 2. Made assumption on the memtable flush conidtions, which may change from +// time to time. +TEST(DBTest, UniversalCompactionTrigger) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + // trigger compaction if there are >= 4 files + options.level0_file_num_compaction_trigger = 4; + KeepFilterFactory* filter = new KeepFilterFactory(true); + filter->expect_manual_compaction_.store(false); + options.compaction_filter_factory.reset(filter); + + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + filter->expect_full_compaction_.store(true); + // Stage 1: + // Generate a set of files at level 0, but don't trigger level-0 + // compaction. 
+ for (int num = 0; + num < options.level0_file_num_compaction_trigger-1; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Suppose each file flushed from mem table has size 1. Now we compact + // (level0_file_num_compaction_trigger+1)=4 files and should have a big + // file of size 4. + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 2: + // Now we have one file at level 0, with size 4. We also have some data in + // mem table. Let's continue generating new files at level 0, but don't + // trigger level-0 compaction. + // First, clean up memtable before inserting new data. This will generate + // a level-0 file, with size around 0.4 (according to previously written + // data amount). + filter->expect_full_compaction_.store(false); + dbfull()->Flush(FlushOptions()); + for (int num = 0; + num < options.level0_file_num_compaction_trigger-3; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. 
+ // After comapction, we should have 2 files, with size 4, 2.4. + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 3: + // Now we have 2 files at level 0, with size 4 and 2.4. Continue + // generating new files at level 0. + for (int num = 0; + num < options.level0_file_num_compaction_trigger-3; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. + // After comapction, we should have 3 files, with size 4, 2.4, 2. + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 4: + // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a + // new file of size 1. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumTableFilesAtLevel(0), 4); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // Stage 5: + // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate + // a new file of size 1. + filter->expect_full_compaction_.store(true); + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // All files at level 0 will be compacted into a single one. 
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } +} + +TEST(DBTest, UniversalCompactionSizeAmplification) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 3; + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal. + max_size_amplification_percent = 110; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // Generate two files in Level 0. Both files are approx the same size. + for (int num = 0; + num < options.level0_file_num_compaction_trigger-1; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + + // Flush whatever is remaining in memtable. This is typically + // small, which should not trigger size ratio based compaction + // but will instead trigger size amplification. 
+ dbfull()->Flush(FlushOptions()); + + dbfull()->TEST_WaitForCompact(); + + // Verify that size amplification did occur + ASSERT_EQ(NumTableFilesAtLevel(0), 1); +} + +TEST(DBTest, UniversalCompactionOptions) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = -1; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + for (int num = 0; + num < options.level0_file_num_compaction_trigger; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + + if (num < options.level0_file_num_compaction_trigger - 1) { + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + } + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + for (int i = 1; i < options.num_levels ; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } +} + +TEST(DBTest, UniversalCompactionStopStyleSimilarSize) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + // trigger compaction if there are >= 4 files + options.level0_file_num_compaction_trigger = 4; + options.compaction_options_universal.size_ratio = 10; + options.compaction_options_universal.stop_style = kCompactionStopStyleSimilarSize; + options.num_levels=1; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // Stage 1: + // Generate a set of files at level 0, but don't trigger level-0 + // compaction. 
+ for (int num = 0; + num < options.level0_file_num_compaction_trigger-1; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Suppose each file flushed from mem table has size 1. Now we compact + // (level0_file_num_compaction_trigger+1)=4 files and should have a big + // file of size 4. + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + + // Stage 2: + // Now we have one file at level 0, with size 4. We also have some data in + // mem table. Let's continue generating new files at level 0, but don't + // trigger level-0 compaction. + // First, clean up memtable before inserting new data. This will generate + // a level-0 file, with size around 0.4 (according to previously written + // data amount). + dbfull()->Flush(FlushOptions()); + for (int num = 0; + num < options.level0_file_num_compaction_trigger-3; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. + // After compaction, we should have 3 files, with size 4, 0.4, 2. + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + // Stage 3: + // Now we have 3 files at level 0, with size 4, 0.4, 2. 
Generate one + // more file at level-0, which should trigger level-0 compaction. + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForCompact(); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumTableFilesAtLevel(0), 4); +} + +#if defined(SNAPPY) && defined(ZLIB) && defined(BZIP2) +TEST(DBTest, CompressedCache) { + int num_iter = 80; + + // Run this test three iterations. + // Iteration 1: only a uncompressed block cache + // Iteration 2: only a compressed block cache + // Iteration 3: both block cache and compressed cache + for (int iter = 0; iter < 3; iter++) { + Options options = CurrentOptions(); + options.write_buffer_size = 64*1024; // small write buffer + options.statistics = rocksdb::CreateDBStatistics(); + + switch (iter) { + case 0: + // only uncompressed block cache + options.block_cache = NewLRUCache(8*1024); + options.block_cache_compressed = nullptr; + break; + case 1: + // no block cache, only compressed cache + options.no_block_cache = true; + options.block_cache = nullptr; + options.block_cache_compressed = NewLRUCache(8*1024); + break; + case 2: + // both compressed and uncompressed block cache + options.block_cache = NewLRUCache(1024); + options.block_cache_compressed = NewLRUCache(8*1024); + break; + default: + ASSERT_TRUE(false); + } + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector<std::string> values; + std::string str; + for (int i = 0; i < num_iter; i++) { + if (i % 4 == 0) { // high compression ratio + str = RandomString(&rnd, 1000); + } + values.push_back(str); + ASSERT_OK(Put(Key(i), values[i])); + } + + // flush all data from memtable so that reads are from block cache + dbfull()->Flush(FlushOptions()); + + for (int i = 0; i < num_iter; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + + // check that we triggered the appropriate code paths in the cache 
+ switch (iter) { + case 0: + // only uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 1: + // no block cache, only compressed cache + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + case 2: + // both compressed and uncompressed block cache + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); + break; + default: + ASSERT_TRUE(false); + } + } +} + +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; +} + +TEST(DBTest, UniversalCompactionCompressRatio1) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = 70; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // The first compaction (2) is compressed. 
+ for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 2 * 0.9); + + // The second compaction (4) is compressed + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 4 * 0.9); + + // The third compaction (2 4) is compressed since this time it is + // (1 1 3.2) and 3.2/5.2 doesn't reach ratio. + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), 110000 * 6 * 0.9); + + // When we start for the compaction up to (2 4 8), the latest + // compressed is not compressed. 
+ for (int num = 0; num < 8; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT((int)dbfull()->TEST_GetLevel0TotalSize(), + 110000 * 11 * 0.8 + 110000 * 2); +} + +TEST(DBTest, UniversalCompactionCompressRatio2) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options.compaction_options_universal.compression_size_percent = 95; + Reopen(&options); + + Random rnd(301); + int key_idx = 0; + + // When we start for the compaction up to (2 4 8), the latest + // compressed is compressed given the size ratio to compress. + for (int num = 0; num < 14; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT((int)dbfull()->TEST_GetLevel0TotalSize(), + 120000 * 12 * 0.8 + 120000 * 2); +} +#endif + +TEST(DBTest, ConvertCompactionStyle) { + Random rnd(301); + int max_key_level_insert = 200; + int max_key_universal_insert = 600; + + // Stage 1: generate a db with level compaction + Options options = CurrentOptions(); + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_bytes_for_level_base = 500<<10; // 500KB + options.max_bytes_for_level_multiplier = 1; + options.target_file_size_base = 200<<10; // 200KB + options.target_file_size_multiplier = 1; + Reopen(&options); + + for (int i = 0; i <= max_key_level_insert; i++) { + // each value is 10K + ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000))); + } + dbfull()->Flush(FlushOptions()); + 
dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(TotalTableFiles(), 1); + int non_level0_num_files = 0; + for (int i = 1; i < dbfull()->NumberLevels(); i++) { + non_level0_num_files += NumTableFilesAtLevel(i); + } + ASSERT_GT(non_level0_num_files, 0); + + // Stage 2: reopen with universal compaction - should fail + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + Status s = TryReopen(&options); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Stage 3: compact into a single file and move the file to level 0 + options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = INT_MAX; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = INT_MAX; + options.max_bytes_for_level_multiplier = 1; + Reopen(&options); + + dbfull()->CompactRange(nullptr, nullptr, + true /* reduce level */, + 0 /* reduce to level 0 */); + + for (int i = 0; i < dbfull()->NumberLevels(); i++) { + int num = NumTableFilesAtLevel(i); + if (i == 0) { + ASSERT_EQ(num, 1); + } else { + ASSERT_EQ(num, 0); + } + } + + // Stage 4: re-open in universal compaction style and do some db operations + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100<<10; //100KB + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000))); + } + dbfull()->Flush(FlushOptions()); + dbfull()->TEST_WaitForCompact(); + + for (int i = 1; i < dbfull()->NumberLevels(); i++) { + ASSERT_EQ(NumTableFilesAtLevel(i), 0); + } + + // verify keys inserted in both level compaction style and universal + // compaction style + std::string keys_in_db; + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_in_db.append(iter->key().ToString()); + keys_in_db.push_back(','); + } + 
delete iter; + + std::string expected_keys; + for (int i = 0; i <= max_key_universal_insert; i++) { + expected_keys.append(Key(i)); + expected_keys.push_back(','); + } + + ASSERT_EQ(keys_in_db, expected_keys); +} + +void MinLevelHelper(DBTest* self, Options& options) { + Random rnd(301); + + for (int num = 0; + num < options.level0_file_num_compaction_trigger - 1; + num++) + { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(Key(i), values[i])); + } + self->dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(Key(i), values[i])); + } + self->dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); + ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); +} + +// returns false if the calling-Test should be skipped +bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, + int lev, int strategy) { + fprintf(stderr, "Test with compression options : window_bits = %d, level = %d, strategy = %d}\n", wbits, lev, strategy); + options.write_buffer_size = 100<<10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + options.create_if_missing = true; + + if (SnappyCompressionSupported(CompressionOptions(wbits, lev, strategy))) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (ZlibCompressionSupported( + CompressionOptions(wbits, lev, strategy))) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2CompressionSupported( + CompressionOptions(wbits, lev, strategy))) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else if 
(LZ4CompressionSupported( + CompressionOptions(wbits, lev, strategy))) { + type = kLZ4Compression; + fprintf(stderr, "using lz4\n"); + } else if (LZ4HCCompressionSupported( + CompressionOptions(wbits, lev, strategy))) { + type = kLZ4HCCompression; + fprintf(stderr, "using lz4hc\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return false; + } + options.compression_per_level.resize(options.num_levels); + + // do not compress L0 + for (int i = 0; i < 1; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 1; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + return true; +} + +TEST(DBTest, MinLevelToCompress1) { + Options options = CurrentOptions(); + CompressionType type; + if (!MinLevelToCompress(type, options, -14, -1, 0)) { + return; + } + Reopen(&options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(&options); + MinLevelHelper(this, options); +} + +TEST(DBTest, MinLevelToCompress2) { + Options options = CurrentOptions(); + CompressionType type; + if (!MinLevelToCompress(type, options, 15, -1, 0)) { + return; + } + Reopen(&options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(&options); + MinLevelHelper(this, options); +} + +TEST(DBTest, RepeatedWritesToSameKey) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + Reopen(&options); + + // We must have at most one file per level except for level-0, + // which may have up to kL0_StopWritesTrigger files. 
+ const int kMaxFiles = dbfull()->NumberLevels() + + dbfull()->Level0StopWriteTrigger(); + + Random rnd(301); + std::string value = RandomString(&rnd, 2 * options.write_buffer_size); + for (int i = 0; i < 5 * kMaxFiles; i++) { + Put("key", value); + ASSERT_LE(TotalTableFiles(), kMaxFiles); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdate) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + Reopen(&options); + + // Update key with values of smaller size + int numValues = 10; + for (int i = numValues; i > 0; i--) { + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put("key", value)); + ASSERT_EQ(value, Get("key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + Reopen(&options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put("key", value)); + ASSERT_EQ(value, Get("key")); + } + + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues); + + } while (ChangeCompactOptions()); +} + + +TEST(DBTest, InPlaceUpdateCallbackSmallerSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceSmallerSize; + Reopen(&options); + + // Update key with values of smaller size + int numValues = 10; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), 
Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i - 1, 'b'), Get("key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceSmallerVarintSize; + Reopen(&options); + + // Update key with values of smaller varint size + int numValues = 265; + ASSERT_OK(Put("key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(1, 'b'), Get("key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceLargerSize; + Reopen(&options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + ASSERT_OK(Put("key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i, 'c'), Get("key")); + } + + // No inplace updates. 
All updates are puts with new seq number + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, InPlaceUpdateCallbackNoAction) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + rocksdb::DBTest::updateInPlaceNoAction; + Reopen(&options); + + // Callback function requests no actions from db + ASSERT_OK(Put("key", DummyString(1, 'a'))); + ASSERT_EQ(Get("key"), "NOT_FOUND"); + + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CompactionFilter) { + Options options = CurrentOptions(); + options.max_open_files = -1; + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.compaction_filter_factory = std::make_shared(); + Reopen(&options); + + // Write 100K keys, these are written to a few files in L0. + const std::string value(10, 'x'); + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + + // Push all files to the highest level L2. Verify that + // the compaction is each level invokes the filter for + // all the keys in that level. + cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_NE(NumTableFilesAtLevel(2), 0); + cfilter_count = 0; + + // All the files are in the lowest level. + // Verify that all but the 100001st record + // has sequence number zero. The 100001st record + // is at the tip of this snapshot and cannot + // be zeroed out. 
+ // TODO: figure out sequence number squashtoo + int count = 0; + int total = 0; + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); + } + ASSERT_EQ(total, 100000); + ASSERT_EQ(count, 1); + delete iter; + + // overwrite all the 100K keys once again. + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + + // push all files to the highest level L2. This + // means that all keys should pass at least once + // via the compaction filter + cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_NE(NumTableFilesAtLevel(2), 0); + + // create a new database with the compaction + // filter in such a way that it deletes all keys + options.compaction_filter_factory = std::make_shared(); + options.create_if_missing = true; + DestroyAndReopen(&options); + + // write all the keys once again. + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + ASSERT_NE(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + + // Push all files to the highest level L2. This + // triggers the compaction filter to delete all keys, + // verify that at the end of the compaction process, + // nothing is left. 
+ cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(cfilter_count, 0); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Scan the entire database to ensure that nothing is left + iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); + delete iter; + + // The sequence number of the remaining record + // is not zeroed out even though it is at the + // level Lmax because this record is at the tip + // TODO: remove the following or design a different + // test + count = 0; + iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); + delete iter; +} + +TEST(DBTest, CompactionFilterWithValueChange) { + do { + Options options = CurrentOptions(); + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.compaction_filter_factory = + std::make_shared(); + Reopen(&options); + + // Write 100K+1 keys, these are written to a few files + // in L0. We do this so that the current snapshot points + // to the 100001 key.The compaction filter is not invoked + // on keys that are visible via a snapshot because we + // anyways cannot delete it. 
+ const std::string value(10, 'x'); + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + + // push all files to lower levels + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + + // re-write all data again + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + + // push all files to lower levels. This should + // invoke the compaction filter for all 100000 keys. + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + + // verify that all keys now have the new value that + // was set by the compaction process. + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + std::string newvalue = Get(key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CompactionFilterContextManual) { + KeepFilterFactory* filter = new KeepFilterFactory(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_filter_factory.reset(filter); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 8; + Reopen(&options); + int num_keys_per_file = 400; + for (int j = 0; j < 3; j++) { + // Write several keys. + const std::string value(10, 'x'); + for (int i = 0; i < num_keys_per_file; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%02d", i, j); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + // Make sure next file is much smaller so automatic compaction will not + // be triggered. 
+ num_keys_per_file /= 2; + } + + // Force a manual compaction + cfilter_count = 0; + filter->expect_manual_compaction_.store(true); + filter->expect_full_compaction_.store(false); // Manual compaction always + // set this flag. + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(cfilter_count, 700); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + + // Verify total number of keys is correct after manual compaction. + int count = 0; + int total = 0; + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); + } + ASSERT_EQ(total, 700); + ASSERT_EQ(count, 1); + delete iter; +} + +class KeepFilterV2 : public CompactionFilterV2 { + public: + virtual std::vector Filter(int level, + const SliceVector& keys, + const SliceVector& existing_values, + std::vector* new_values, + std::vector* values_changed) + const override { + cfilter_count++; + std::vector ret; + new_values->clear(); + values_changed->clear(); + for (unsigned int i = 0; i < keys.size(); ++i) { + values_changed->push_back(false); + ret.push_back(false); + } + return ret; + } + + virtual const char* Name() const override { + return "KeepFilterV2"; + } +}; + +class DeleteFilterV2 : public CompactionFilterV2 { + public: + virtual std::vector Filter(int level, + const SliceVector& keys, + const SliceVector& existing_values, + std::vector* new_values, + std::vector* values_changed) + const override { + cfilter_count++; + new_values->clear(); + values_changed->clear(); + std::vector ret; + for (unsigned int i = 0; i < keys.size(); ++i) { + values_changed->push_back(false); + ret.push_back(true); + } + return ret; + } + + virtual const char* Name() const override { + return "DeleteFilterV2"; + } +}; + +class ChangeFilterV2 : public 
CompactionFilterV2 { + public: + virtual std::vector Filter(int level, + const SliceVector& keys, + const SliceVector& existing_values, + std::vector* new_values, + std::vector* values_changed) + const override { + std::vector ret; + new_values->clear(); + values_changed->clear(); + for (unsigned int i = 0; i < keys.size(); ++i) { + values_changed->push_back(true); + new_values->push_back(NEW_VALUE); + ret.push_back(false); + } + return ret; + } + + virtual const char* Name() const override { + return "ChangeFilterV2"; + } +}; + +class KeepFilterFactoryV2 : public CompactionFilterFactoryV2 { + public: + explicit KeepFilterFactoryV2(const SliceTransform* prefix_extractor) + : CompactionFilterFactoryV2(prefix_extractor) { } + + virtual std::unique_ptr + CreateCompactionFilterV2( + const CompactionFilterContext& context) override { + return std::unique_ptr(new KeepFilterV2()); + } + + virtual const char* Name() const override { + return "KeepFilterFactoryV2"; + } +}; + +class DeleteFilterFactoryV2 : public CompactionFilterFactoryV2 { + public: + explicit DeleteFilterFactoryV2(const SliceTransform* prefix_extractor) + : CompactionFilterFactoryV2(prefix_extractor) { } + + virtual std::unique_ptr + CreateCompactionFilterV2( + const CompactionFilterContext& context) override { + return std::unique_ptr(new DeleteFilterV2()); + } + + virtual const char* Name() const override { + return "DeleteFilterFactoryV2"; + } +}; + +class ChangeFilterFactoryV2 : public CompactionFilterFactoryV2 { + public: + explicit ChangeFilterFactoryV2(const SliceTransform* prefix_extractor) + : CompactionFilterFactoryV2(prefix_extractor) { } + + virtual std::unique_ptr + CreateCompactionFilterV2( + const CompactionFilterContext& context) override { + return std::unique_ptr(new ChangeFilterV2()); + } + + virtual const char* Name() const override { + return "ChangeFilterFactoryV2"; + } +}; + +TEST(DBTest, CompactionFilterV2) { + Options options = CurrentOptions(); + options.num_levels = 3; + 
options.max_mem_compaction_level = 0; + // extract prefix + std::unique_ptr prefix_extractor; + prefix_extractor.reset(NewFixedPrefixTransform(8)); + + options.compaction_filter_factory_v2 + = std::make_shared(prefix_extractor.get()); + // In a testing environment, we can only flush the application + // compaction filter buffer using universal compaction + option_config_ = kUniversalCompaction; + options.compaction_style = (rocksdb::CompactionStyle)1; + Reopen(&options); + + // Write 100K keys, these are written to a few files in L0. + const std::string value(10, 'x'); + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%010d", i , i); + Put(key, value); + } + + dbfull()->TEST_FlushMemTable(); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + + // All the files are in the lowest level. + int count = 0; + int total = 0; + Iterator* iter = dbfull()->TEST_NewInternalIterator(); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); + } + + ASSERT_EQ(total, 100000); + // 1 snapshot only. Since we are using universal compacton, + // the sequence no is cleared for better compression + ASSERT_EQ(count, 1); + delete iter; + + // create a new database with the compaction + // filter in such a way that it deletes all keys + options.compaction_filter_factory_v2 = + std::make_shared(prefix_extractor.get()); + options.create_if_missing = true; + DestroyAndReopen(&options); + + // write all the keys once again. 
+ for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%010d", i, i); + Put(key, value); + } + + dbfull()->TEST_FlushMemTable(); + ASSERT_NE(NumTableFilesAtLevel(0), 0); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Scan the entire database to ensure that nothing is left + iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + + ASSERT_EQ(count, 0); + delete iter; +} + +TEST(DBTest, CompactionFilterV2WithValueChange) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.max_mem_compaction_level = 0; + std::unique_ptr prefix_extractor; + prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.compaction_filter_factory_v2 = + std::make_shared(prefix_extractor.get()); + // In a testing environment, we can only flush the application + // compaction filter buffer using universal compaction + option_config_ = kUniversalCompaction; + options.compaction_style = (rocksdb::CompactionStyle)1; + Reopen(&options); + + // Write 100K+1 keys, these are written to a few files + // in L0. We do this so that the current snapshot points + // to the 100001 key.The compaction filter is not invoked + // on keys that are visible via a snapshot because we + // anyways cannot delete it. + const std::string value(10, 'x'); + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%010d", i, i); + Put(key, value); + } + + // push all files to lower levels + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + + // verify that all keys now have the new value that + // was set by the compaction process. 
+ for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%010d", i, i); + std::string newvalue = Get(key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + } +} + +TEST(DBTest, CompactionFilterV2NULLPrefix) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.max_mem_compaction_level = 0; + std::unique_ptr prefix_extractor; + prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.compaction_filter_factory_v2 = + std::make_shared(prefix_extractor.get()); + // In a testing environment, we can only flush the application + // compaction filter buffer using universal compaction + option_config_ = kUniversalCompaction; + options.compaction_style = (rocksdb::CompactionStyle)1; + Reopen(&options); + + // Write 100K+1 keys, these are written to a few files + // in L0. We do this so that the current snapshot points + // to the 100001 key.The compaction filter is not invoked + // on keys that are visible via a snapshot because we + // anyways cannot delete it. + const std::string value(10, 'x'); + char first_key[100]; + snprintf(first_key, sizeof(first_key), "%s0000%010d", "NULL", 1); + Put(first_key, value); + for (int i = 1; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "%08d%010d", i, i); + Put(key, value); + } + + char last_key[100]; + snprintf(last_key, sizeof(last_key), "%s0000%010d", "NULL", 2); + Put(last_key, value); + + // push all files to lower levels + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + + // verify that all keys now have the new value that + // was set by the compaction process. 
+ std::string newvalue = Get(first_key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + newvalue = Get(last_key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + for (int i = 1; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "%08d%010d", i, i); + std::string newvalue = Get(key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + } +} + +TEST(DBTest, SparseMerge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + Reopen(&options); + + FillLevels("A", "Z"); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. + const std::string value(1000, 'x'); + Put("A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(key, value); + } + Put("C", "vc"); + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + + // Make sparse update + Put("A", "va2"); + Put("B100", "bvalue2"); + Put("C", "vc2"); + dbfull()->TEST_FlushMemTable(); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. 
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + } while (ChangeCompactOptions()); +} + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST(DBTest, ApproximateSizes) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + DestroyAndReopen(); + + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + Reopen(&options); + ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + const int N = 80; + static const int S1 = 100000; + static const int S2 = 105000; // Allow some expansion from metadata + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, S1))); + } + + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i)); + ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1))); + ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10)); + } + ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50)); + ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50)); + + std::string cstart_str = 
Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend); + } + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); +} + +TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + Reopen(); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(2), big1)); + ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(4), big1)); + ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + Reopen(&options); + + ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. 
+ } while (ChangeOptions(kSkipPlainTable)); +} + +TEST(DBTest, IteratorPinsRef) { + do { + Put("foo", "hello"); + + // Get iterator that will yield the current contents of the DB. + Iterator* iter = db_->NewIterator(ReadOptions()); + + // Write to force compactions + Put("foo", "newvalue1"); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values + } + Put("foo", "newvalue2"); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("hello", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + delete iter; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, Snapshot) { + do { + Put("foo", "v1"); + const Snapshot* s1 = db_->GetSnapshot(); + Put("foo", "v2"); + const Snapshot* s2 = db_->GetSnapshot(); + Put("foo", "v3"); + const Snapshot* s3 = db_->GetSnapshot(); + + Put("foo", "v4"); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v3", Get("foo", s3)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s3); + ASSERT_EQ("v1", Get("foo", s1)); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s1); + ASSERT_EQ("v2", Get("foo", s2)); + ASSERT_EQ("v4", Get("foo")); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ("v4", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, HiddenValuesAreRemoved) { + do { + Random rnd(301); + FillLevels("a", "z"); + + std::string big = RandomString(&rnd, 50000); + Put("foo", big); + Put("pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "tiny"); + Put("pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + + ASSERT_EQ(big, Get("foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + Slice x("x"); + 
dbfull()->TEST_CompactRange(0, nullptr, &x); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GE(NumTableFilesAtLevel(1), 1); + dbfull()->TEST_CompactRange(1, nullptr, &x); + ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + + ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). + } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); +} + +TEST(DBTest, CompactBetweenSnapshots) { + do { + Random rnd(301); + FillLevels("a", "z"); + + Put("foo", "first"); + const Snapshot* snapshot1 = db_->GetSnapshot(); + Put("foo", "second"); + Put("foo", "third"); + Put("foo", "fourth"); + const Snapshot* snapshot2 = db_->GetSnapshot(); + Put("foo", "fifth"); + Put("foo", "sixth"); + + // All entries (including duplicates) exist + // before any compaction is triggered. + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ("fourth", Get("foo", snapshot2)); + ASSERT_EQ("first", Get("foo", snapshot1)); + ASSERT_EQ(AllEntriesFor("foo"), + "[ sixth, fifth, fourth, third, second, first ]"); + + // After a compaction, "second", "third" and "fifth" should + // be removed + FillLevels("a", "z"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ("fourth", Get("foo", snapshot2)); + ASSERT_EQ("first", Get("foo", snapshot1)); + ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth, first ]"); + + // after we release the snapshot1, only two values left + db_->ReleaseSnapshot(snapshot1); + FillLevels("a", "z"); + dbfull()->CompactRange(nullptr, nullptr); + + // We have only one valid snapshot snapshot2. Since snapshot1 is + // not valid anymore, "first" should be removed by a compaction. 
+ ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ("fourth", Get("foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth ]"); + + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ("sixth", Get("foo")); + ASSERT_EQ(AllEntriesFor("foo"), "[ sixth ]"); + + } while (ChangeOptions()); +} + +TEST(DBTest, DeletionMarkers1) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + const int last = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + + // Place a table at level last-1 to prevent merging with preceding mutation + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); + ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + + Delete("foo"); + Put("foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Moves to level last-2 + if (CurrentOptions().purge_redundant_kvs_while_flush) { + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + } else { + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + } + Slice z("z"); + dbfull()->TEST_CompactRange(last-2, nullptr, &z); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(last-1, nullptr, nullptr); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). 
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); +} + +TEST(DBTest, DeletionMarkers2) { + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + const int last = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + + // Place a table at level last-1 to prevent merging with preceding mutation + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); + ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last-2, nullptr, nullptr); + // DEL kept: "last" file overlaps + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last-1, nullptr, nullptr); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); +} + +TEST(DBTest, OverlapInLevel0) { + do { + int tmp = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(tmp, 2) << "Fix test to match config"; + + //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. + ASSERT_OK(Put("100", "v100")); + ASSERT_OK(Put("999", "v999")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Delete("100")); + ASSERT_OK(Delete("999")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("0,1,1", FilesPerLevel()); + + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. 
+ ASSERT_OK(Put("300", "v300")); + ASSERT_OK(Put("500", "v500")); + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put("200", "v200")); + ASSERT_OK(Put("600", "v600")); + ASSERT_OK(Put("900", "v900")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("2,1,1", FilesPerLevel()); + + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + dbfull()->TEST_CompactRange(2, nullptr, nullptr); + ASSERT_EQ("2", FilesPerLevel()); + + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. + ASSERT_OK(Delete("600")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_EQ("NOT_FOUND", Get("600")); + } while (ChangeOptions(kSkipUniversalCompaction)); +} + +TEST(DBTest, L0_CompactionBug_Issue44_a) { + do { + Reopen(); + ASSERT_OK(Put("b", "v")); + Reopen(); + ASSERT_OK(Delete("b")); + ASSERT_OK(Delete("a")); + Reopen(); + ASSERT_OK(Delete("a")); + Reopen(); + ASSERT_OK(Put("a", "v")); + Reopen(); + Reopen(); + ASSERT_EQ("(a->v)", Contents()); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(a->v)", Contents()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, L0_CompactionBug_Issue44_b) { + do { + Reopen(); + Put("",""); + Reopen(); + Delete("e"); + Put("",""); + Reopen(); + Put("c", "cv"); + Reopen(); + Put("",""); + Reopen(); + Put("",""); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + Reopen(); + Put("d","dv"); + Reopen(); + Put("",""); + Reopen(); + Delete("d"); + Delete("b"); + Reopen(); + ASSERT_EQ("(->)(c->cv)", Contents()); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(->)(c->cv)", Contents()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + virtual const char* Name() const { return 
"rocksdb.NewComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + Options new_options; + NewComparator cmp; + do { + new_options = CurrentOptions(); + new_options.comparator = &cmp; + Status s = TryReopen(&new_options); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); + } while (ChangeCompactOptions(&new_options)); +} + +TEST(DBTest, CustomComparator) { + class NumberComparator : public Comparator { + public: + virtual const char* Name() const { return "test.NumberComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return ToNumber(a) - ToNumber(b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + ToNumber(*s); // Check format + ToNumber(l); // Check format + } + virtual void FindShortSuccessor(std::string* key) const { + ToNumber(*key); // Check format + } + private: + static int ToNumber(const Slice& x) { + // Check that there are no extra characters. 
+ ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']') + << EscapeString(x); + int val; + char ignored; + ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) + << EscapeString(x); + return val; + } + }; + Options new_options; + NumberComparator cmp; + do { + new_options = CurrentOptions(); + new_options.create_if_missing = true; + new_options.comparator = &cmp; + new_options.filter_policy = nullptr; // Cannot use bloom filters + new_options.write_buffer_size = 1000; // Compact more often + DestroyAndReopen(&new_options); + ASSERT_OK(Put("[10]", "ten")); + ASSERT_OK(Put("[0x14]", "twenty")); + for (int i = 0; i < 2; i++) { + ASSERT_EQ("ten", Get("[10]")); + ASSERT_EQ("ten", Get("[0xa]")); + ASSERT_EQ("twenty", Get("[20]")); + ASSERT_EQ("twenty", Get("[0x14]")); + ASSERT_EQ("NOT_FOUND", Get("[15]")); + ASSERT_EQ("NOT_FOUND", Get("[0xf]")); + Compact("[0]", "[9999]"); + } + + for (int run = 0; run < 2; run++) { + for (int i = 0; i < 1000; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "[%d]", i*10); + ASSERT_OK(Put(buf, buf)); + } + Compact("[0]", "[1000000]"); + } + } while (ChangeCompactOptions(&new_options)); +} + +TEST(DBTest, ManualCompaction) { + ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) + << "Need to update this test to match kMaxMemCompactLevel"; + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); + + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact 
all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } + +} + +TEST(DBTest, DBOpen_Options) { + std::string dbname = test::TmpDir() + "/db_options_test"; + ASSERT_OK(DestroyDB(dbname, Options())); + + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + Options opts; + opts.create_if_missing = false; + Status s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does not exist, and create_if_missing == true: OK + opts.create_if_missing = true; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + // Does exist, and error_if_exists == true: error + opts.create_if_missing = false; + opts.error_if_exists = true; + s = DB::Open(opts, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does exist, and error_if_exists == false: OK + opts.create_if_missing = true; + opts.error_if_exists = false; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; +} + +TEST(DBTest, DBOpen_Change_NumLevels) { + std::string dbname = test::TmpDir() + "/db_change_num_levels"; + ASSERT_OK(DestroyDB(dbname, Options())); + Options opts; + Status s; + DB* db = nullptr; + opts.create_if_missing = true; + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + db->Put(WriteOptions(), "a", "123"); + db->Put(WriteOptions(), "b", "234"); + db->CompactRange(nullptr, nullptr); + delete db; + db = nullptr; + + opts.create_if_missing = false; + opts.num_levels = 2; + s = DB::Open(opts, dbname, &db); + 
ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); + ASSERT_TRUE(db == nullptr); +} + +TEST(DBTest, DestroyDBMetaDatabase) { + std::string dbname = test::TmpDir() + "/db_meta"; + std::string metadbname = MetaDatabaseName(dbname, 0); + std::string metametadbname = MetaDatabaseName(metadbname, 0); + + // Destroy previous versions if they exist. Using the long way. + ASSERT_OK(DestroyDB(metametadbname, Options())); + ASSERT_OK(DestroyDB(metadbname, Options())); + ASSERT_OK(DestroyDB(dbname, Options())); + + // Setup databases + Options opts; + opts.create_if_missing = true; + DB* db = nullptr; + ASSERT_OK(DB::Open(opts, dbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(opts, metadbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(opts, metametadbname, &db)); + delete db; + db = nullptr; + + // Delete databases + ASSERT_OK(DestroyDB(dbname, Options())); + + // Check if deletion worked. + opts.create_if_missing = false; + ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok()); +} + +// Check that number of files does not grow when we are out of space +TEST(DBTest, NoSpace) { + do { + Options options = CurrentOptions(); + options.env = env_; + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + const int num_files = CountFiles(); + env_->no_space_.Release_Store(env_); // Force out-of-space errors + env_->sleep_counter_.Reset(); + for (int i = 0; i < 5; i++) { + for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { + dbfull()->TEST_CompactRange(level, nullptr, nullptr); + } + } + + std::string property_value; + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("5", property_value); + + env_->no_space_.Release_Store(nullptr); + ASSERT_LT(CountFiles(), num_files + 3); + + // Check that compaction attempts slept 
after errors + ASSERT_GE(env_->sleep_counter_.Read(), 5); + } while (ChangeCompactOptions()); +} + +// Check background error counter bumped on flush failures. +TEST(DBTest, NoSpaceFlush) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.max_background_flushes = 1; + Reopen(&options); + + ASSERT_OK(Put("foo", "v1")); + env_->no_space_.Release_Store(env_); // Force out-of-space errors + + std::string property_value; + // Background error count is 0 now. + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("0", property_value); + + dbfull()->TEST_FlushMemTable(false); + + // Wait 300 milliseconds or background-errors turned 1 from 0. + int time_to_sleep_limit = 300000; + while (time_to_sleep_limit > 0) { + int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit; + time_to_sleep_limit -= to_sleep; + env_->SleepForMicroseconds(to_sleep); + + ASSERT_TRUE( + db_->GetProperty("rocksdb.background-errors", &property_value)); + if (property_value == "1") { + break; + } + } + ASSERT_EQ("1", property_value); + + env_->no_space_.Release_Store(nullptr); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, NonWritableFileSystem) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 1000; + options.env = env_; + Reopen(&options); + ASSERT_OK(Put("foo", "v1")); + env_->non_writable_.Release_Store(env_); // Force errors for new files + std::string big(100000, 'x'); + int errors = 0; + for (int i = 0; i < 20; i++) { + if (!Put("foo", big).ok()) { + errors++; + env_->SleepForMicroseconds(100000); + } + } + ASSERT_GT(errors, 0); + env_->non_writable_.Release_Store(nullptr); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, ManifestWriteError) { + // Test for the following problem: + // (a) Compaction produces file F + // (b) Log record containing F is written to MANIFEST file, but Sync() fails + // (c) GC deletes F + // (d) After reopening DB, reads fail since deleted F is 
named in log record + + // We iterate twice. In the second iteration, everything is the + // same except the log record never makes it to the MANIFEST file. + for (int iter = 0; iter < 2; iter++) { + port::AtomicPointer* error_type = (iter == 0) + ? &env_->manifest_sync_error_ + : &env_->manifest_write_error_; + + // Insert foo=>bar mapping + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + DestroyAndReopen(&options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); + + // Memtable compaction (will succeed) + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("bar", Get("foo")); + const int last = dbfull()->MaxMemCompactionLevel(); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level + + // Merging compaction (will fail) + error_type->Release_Store(env_); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); + + // Recovery: should not lose data + error_type->Release_Store(nullptr); + Reopen(&options); + ASSERT_EQ("bar", Get("foo")); + } +} + +TEST(DBTest, PutFailsParanoid) { + // Test the following: + // (a) A random put fails in paranoid mode (simulate by sync fail) + // (b) All other puts have to fail, even if writes would succeed + // (c) All of that should happen ONLY if paranoid_checks = true + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(&options); + Status s; + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + // simulate error + env_->log_write_error_.Release_Store(env_); + s = Put("foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.Release_Store(nullptr); + s = Put("foo3", "bar3"); + // the next put should fail, too + ASSERT_TRUE(!s.ok()); + // but we're still able to read + ASSERT_EQ("bar", Get("foo")); + + // do the same thing 
with paranoid checks off + options.paranoid_checks = false; + DestroyAndReopen(&options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + // simulate error + env_->log_write_error_.Release_Store(env_); + s = Put("foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.Release_Store(nullptr); + s = Put("foo3", "bar3"); + // the next put should NOT fail + ASSERT_TRUE(s.ok()); +} + +TEST(DBTest, FilesDeletedAfterCompaction) { + do { + ASSERT_OK(Put("foo", "v2")); + Compact("a", "z"); + const int num_files = CountLiveFiles(); + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("foo", "v2")); + Compact("a", "z"); + } + ASSERT_EQ(CountLiveFiles(), num_files); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, BloomFilter) { + do { + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.no_block_cache = true; + options.filter_policy = NewBloomFilterPolicy(10); + Reopen(&options); + + // Populate multiple layers + const int N = 10000; + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + Compact("a", "z"); + for (int i = 0; i < N; i += 100) { + ASSERT_OK(Put(Key(i), Key(i))); + } + dbfull()->TEST_FlushMemTable(); + + // Prevent auto compactions triggered by seeks + env_->delay_sstable_sync_.Release_Store(env_); + + // Lookup present keys. Should rarely read from small sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i), Get(Key(i))); + } + int reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d present => %d reads\n", N, reads); + ASSERT_GE(reads, N); + ASSERT_LE(reads, N + 2*N/100); + + // Lookup present keys. Should rarely read from either sstable. 
+ env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing")); + } + reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d missing => %d reads\n", N, reads); + ASSERT_LE(reads, 3*N/100); + + env_->delay_sstable_sync_.Release_Store(nullptr); + Close(); + delete options.filter_policy; + } while (ChangeCompactOptions()); +} + +TEST(DBTest, SnapshotFiles) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + Reopen(&options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // assert that nothing makes it to disk yet. + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // get a file snapshot + uint64_t manifest_number = 0; + uint64_t manifest_size = 0; + std::vector files; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(files, &manifest_size); + + // CURRENT, MANIFEST, *.sst files + ASSERT_EQ(files.size(), 3U); + + uint64_t number = 0; + FileType type; + + // copy these files to a new snapshot directory + std::string snapdir = dbname_ + ".snapdir/"; + std::string mkdir = "mkdir -p " + snapdir; + ASSERT_EQ(system(mkdir.c_str()), 0); + + for (unsigned int i = 0; i < files.size(); i++) { + // our clients require that GetLiveFiles returns + // files with "/" as first character! 
+ ASSERT_EQ(files[i][0], '/'); + std::string src = dbname_ + files[i]; + std::string dest = snapdir + files[i]; + + uint64_t size; + ASSERT_OK(env_->GetFileSize(src, &size)); + + // record the number and the size of the + // latest manifest file + if (ParseFileName(files[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > manifest_number) { + manifest_number = number; + ASSERT_GE(size, manifest_size); + size = manifest_size; // copy only valid MANIFEST data + } + } + } + CopyFile(src, dest, size); + } + + // release file snapshot + dbfull()->DisableFileDeletions(); + + // overwrite one key, this key should not appear in the snapshot + std::vector extras; + for (unsigned int i = 0; i < 1; i++) { + extras.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(Key(i), extras[i])); + } + + // verify that data in the snapshot are correct + Options opts; + DB* snapdb; + opts.create_if_missing = false; + Status stat = DB::Open(opts, snapdir, &snapdb); + ASSERT_OK(stat); + + ReadOptions roptions; + std::string val; + for (unsigned int i = 0; i < 80; i++) { + stat = snapdb->Get(roptions, Key(i), &val); + ASSERT_EQ(values[i].compare(val), 0); + } + delete snapdb; + + // look at the new live files after we added an 'extra' key + // and after we took the first snapshot. + uint64_t new_manifest_number = 0; + uint64_t new_manifest_size = 0; + std::vector newfiles; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(newfiles, &new_manifest_size); + + // find the new manifest file. assert that this manifest file is + // the same one as in the previous snapshot. But its size should be + // larger because we added an extra key after taking the + // previous shapshot. 
+ for (unsigned int i = 0; i < newfiles.size(); i++) { + std::string src = dbname_ + "/" + newfiles[i]; + // record the lognumber and the size of the + // latest manifest file + if (ParseFileName(newfiles[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > new_manifest_number) { + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); + } + } + } + } + ASSERT_EQ(manifest_number, new_manifest_number); + ASSERT_GT(new_manifest_size, manifest_size); + + // release file snapshot + dbfull()->DisableFileDeletions(); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, CompactOnFlush) { + do { + Options options = CurrentOptions(); + options.purge_redundant_kvs_while_flush = true; + options.disable_auto_compactions = true; + Reopen(&options); + + Put("foo", "v1"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v1 ]"); + + // Write two new keys + Put("a", "begin"); + Put("z", "end"); + dbfull()->TEST_FlushMemTable(); + + // Case1: Delete followed by a put + Delete("foo"); + Put("foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + + // After the current memtable is flushed, the DEL should + // have been removed + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); + + // Case 2: Delete followed by another delete + Delete("foo"); + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, DEL, v2 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v2 ]"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 3: Put followed by a delete + Put("foo", "v3"); + Delete("foo"); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v3 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ DEL ]"); + 
dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 4: Put followed by another Put + Put("foo", "v4"); + Put("foo", "v5"); + ASSERT_EQ(AllEntriesFor("foo"), "[ v5, v4 ]"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]"); + + // clear database + Delete("foo"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 5: Put followed by snapshot followed by another Put + // Both puts should remain. + Put("foo", "v6"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put("foo", "v7"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v7, v6 ]"); + db_->ReleaseSnapshot(snapshot); + + // clear database + Delete("foo"); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + + // Case 5: snapshot followed by a put followed by another Put + // Only the last put should remain. + const Snapshot* snapshot1 = db_->GetSnapshot(); + Put("foo", "v8"); + Put("foo", "v9"); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ(AllEntriesFor("foo"), "[ v9 ]"); + db_->ReleaseSnapshot(snapshot1); + } while (ChangeCompactOptions()); +} + +std::vector ListLogFiles(Env* env, const std::string& path) { + std::vector files; + std::vector log_files; + env->GetChildren(path, &files); + uint64_t number; + FileType type; + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == kLogFile) { + log_files.push_back(number); + } + } + } + return std::move(log_files); +} + +TEST(DBTest, WALArchivalTtl) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 1000; + DestroyAndReopen(&options); + + // TEST : Create DB with a ttl and no size limit. + // Put some keys. 
Count the log files present in the DB just after insert. + // Re-open db. Causes deletion/archival to take place. + // Assert that the files moved under "/archive". + // Reopen db with small ttl. + // Assert that archive was removed. + + std::string archiveDir = ArchivalDirectory(dbname_); + + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < 10; ++j) { + ASSERT_OK(Put(Key(10 * i + j), DummyString(1024))); + } + + std::vector log_files = ListLogFiles(env_, dbname_); + + options.create_if_missing = false; + Reopen(&options); + + std::vector logs = ListLogFiles(env_, archiveDir); + std::set archivedFiles(logs.begin(), logs.end()); + + for (auto& log : log_files) { + ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end()); + } + } + + std::vector log_files = ListLogFiles(env_, archiveDir); + ASSERT_TRUE(log_files.size() > 0); + + options.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(2 * 1000 * 1000); + Reopen(&options); + + log_files = ListLogFiles(env_, archiveDir); + ASSERT_TRUE(log_files.empty()); + } while (ChangeCompactOptions()); +} + +uint64_t GetLogDirSize(std::string dir_path, SpecialEnv* env) { + uint64_t dir_size = 0; + std::vector files; + env->GetChildren(dir_path, &files); + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = dir_path + "/" + f; + uint64_t file_size; + env->GetFileSize(file_path, &file_size); + dir_size += file_size; + } + } + return dir_size; +} + +TEST(DBTest, WALArchivalSizeLimit) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.WAL_ttl_seconds = 0; + options.WAL_size_limit_MB = 1000; + + // TEST : Create DB with huge size limit and no ttl. + // Put some keys. Count the archived log files present in the DB + // just after insert. Assert that there are many enough. + // Change size limit. Re-open db. + // Assert that archive is not greater than WAL_size_limit_MB. 
+ // Set ttl and time_to_check_ to small values. Re-open db. + // Assert that there are no archived logs left. + + DestroyAndReopen(&options); + for (int i = 0; i < 128 * 128; ++i) { + ASSERT_OK(Put(Key(i), DummyString(1024))); + } + Reopen(&options); + + std::string archive_dir = ArchivalDirectory(dbname_); + std::vector log_files = ListLogFiles(env_, archive_dir); + ASSERT_TRUE(log_files.size() > 2); + + options.WAL_size_limit_MB = 8; + Reopen(&options); + dbfull()->TEST_PurgeObsoleteteWAL(); + + uint64_t archive_size = GetLogDirSize(archive_dir, env_); + ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024); + + options.WAL_ttl_seconds = 1; + dbfull()->TEST_SetDefaultTimeToCheck(1); + env_->SleepForMicroseconds(2 * 1000 * 1000); + Reopen(&options); + dbfull()->TEST_PurgeObsoleteteWAL(); + + log_files = ListLogFiles(env_, archive_dir); + ASSERT_TRUE(log_files.empty()); + } while (ChangeCompactOptions()); +} + +SequenceNumber ReadRecords( + std::unique_ptr& iter, + int& count) { + count = 0; + SequenceNumber lastSequence = 0; + BatchResult res; + while (iter->Valid()) { + res = iter->GetBatch(); + ASSERT_TRUE(res.sequence > lastSequence); + ++count; + lastSequence = res.sequence; + ASSERT_OK(iter->status()); + iter->Next(); + } + return res.sequence; +} + +void ExpectRecords( + const int expected_no_records, + std::unique_ptr& iter) { + int num_records; + ReadRecords(iter, num_records); + ASSERT_EQ(num_records, expected_no_records); +} + +TEST(DBTest, TransactionLogIterator) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + Put("key1", DummyString(1024)); + Put("key2", DummyString(1024)); + Put("key2", DummyString(1024)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U); + { + auto iter = OpenTransactionLogIter(0); + ExpectRecords(3, iter); + } + Reopen(&options); + env_->SleepForMicroseconds(2 * 1000 * 1000);{ + Put("key4", DummyString(1024)); + Put("key5", DummyString(1024)); + Put("key6", 
DummyString(1024)); + } + { + auto iter = OpenTransactionLogIter(0); + ExpectRecords(6, iter); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorRace) { + // Setup sync point dependency to reproduce the race condition of + // a log file moved to archived dir, in the middle of GetSortedWalFiles + rocksdb::SyncPoint::GetInstance()->LoadDependency( + { { "DBImpl::GetSortedWalFiles:1", "DBImpl::PurgeObsoleteFiles:1" }, + { "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalFiles:2" }, + }); + + do { + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + Put("key1", DummyString(1024)); + dbfull()->Flush(FlushOptions()); + Put("key2", DummyString(1024)); + dbfull()->Flush(FlushOptions()); + Put("key3", DummyString(1024)); + dbfull()->Flush(FlushOptions()); + Put("key4", DummyString(1024)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U); + + { + auto iter = OpenTransactionLogIter(0); + ExpectRecords(4, iter); + } + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // trigger async flush, and log move. Well, log move will + // wait until the GetSortedWalFiles:1 to reproduce the race + // condition + FlushOptions flush_options; + flush_options.wait = false; + dbfull()->Flush(flush_options); + + // "key5" would be written in a new memtable and log + Put("key5", DummyString(1024)); + { + // this iter would miss "key4" if not fixed + auto iter = OpenTransactionLogIter(0); + ExpectRecords(5, iter); + } + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + // Do a plain Reopen. + Put("key1", DummyString(1024)); + // Two reopens should create a zero record WAL file. 
+ Reopen(&options); + Reopen(&options); + + Put("key2", DummyString(1024)); + + auto iter = OpenTransactionLogIter(0); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +// TODO(kailiu) disable the in non-linux platforms to temporarily solve +// // the unit test failure. +#ifdef OS_LINUX +TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + Put("key1", DummyString(1024)); + auto iter = OpenTransactionLogIter(0); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + Put("key2", DummyString(1024)); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + } while (ChangeCompactOptions()); +} +#endif + +TEST(DBTest, TransactionLogIteratorJustEmptyFile) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + unique_ptr iter; + Status status = dbfull()->GetUpdatesSince(0, &iter); + // Check that an empty iterator is returned + ASSERT_TRUE(!iter->Valid()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + Put("key1", DummyString(1024)); + Put("key2", DummyString(1023)); + dbfull()->Flush(FlushOptions()); + Reopen(&options); + auto iter = OpenTransactionLogIter(0); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorCorruptedLog) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + for (int i = 0; i < 1024; i++) { + Put("key"+std::to_string(i), DummyString(10)); + } + dbfull()->Flush(FlushOptions()); + // Corrupt this log to create a gap + rocksdb::VectorLogPtr wal_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + const auto logfilePath = dbname_ + "/" + wal_files.front()->PathName(); + ASSERT_EQ( + 0, + 
truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2)); + // Insert a new entry to a new log file + Put("key1025", DummyString(10)); + // Try to read from the beginning. Should stop before the gap and read less + // than 1025 entries + auto iter = OpenTransactionLogIter(0); + int count; + int last_sequence_read = ReadRecords(iter, count); + ASSERT_LT(last_sequence_read, 1025); + // Try to read past the gap, should be able to seek to key1025 + auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); + ExpectRecords(1, iter2); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorBatchOperations) { + do { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + WriteBatch batch; + batch.Put("key1", DummyString(1024)); + batch.Put("key2", DummyString(1024)); + batch.Put("key3", DummyString(1024)); + batch.Delete("key2"); + dbfull()->Write(WriteOptions(), &batch); + dbfull()->Flush(FlushOptions()); + Reopen(&options); + Put("key4", DummyString(1024)); + auto iter = OpenTransactionLogIter(3); + ExpectRecords(2, iter); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, TransactionLogIteratorBlobs) { + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + { + WriteBatch batch; + batch.Put("key1", DummyString(1024)); + batch.Put("key2", DummyString(1024)); + batch.PutLogData(Slice("blob1")); + batch.Put("key3", DummyString(1024)); + batch.PutLogData(Slice("blob2")); + batch.Delete("key2"); + dbfull()->Write(WriteOptions(), &batch); + Reopen(&options); + } + + auto res = OpenTransactionLogIter(0)->GetBatch(); + struct Handler : public WriteBatch::Handler { + std::string seen; + virtual void Put(const Slice& key, const Slice& value) { + seen += "Put(" + key.ToString() + ", " + std::to_string(value.size()) + + ")"; + } + virtual void Merge(const Slice& key, const Slice& value) { + seen += "Merge(" + key.ToString() + ", " + std::to_string(value.size()) + + ")"; + } + virtual void 
LogData(const Slice& blob) { + seen += "LogData(" + blob.ToString() + ")"; + } + virtual void Delete(const Slice& key) { + seen += "Delete(" + key.ToString() + ")"; + } + } handler; + res.writeBatchPtr->Iterate(&handler); + ASSERT_EQ("Put(key1, 1024)" + "Put(key2, 1024)" + "LogData(blob1)" + "Put(key3, 1024)" + "LogData(blob2)" + "Delete(key2)", handler.seen); +} + +TEST(DBTest, ReadCompaction) { + std::string value(4096, '4'); // a string of size 4K + { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_open_files = 20; // only 10 file in file-cache + options.target_file_size_base = 512; + options.write_buffer_size = 64 * 1024; + options.filter_policy = nullptr; + options.block_size = 4096; + options.no_block_cache = true; + options.disable_seek_compaction = false; + + Reopen(&options); + + // Write 8MB (2000 values, each 4K) + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + std::vector values; + for (int i = 0; i < 2000; i++) { + ASSERT_OK(Put(Key(i), value)); + } + + // clear level 0 and 1 if necessary. 
+ dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // write some new keys into level 0 + for (int i = 0; i < 2000; i = i + 16) { + ASSERT_OK(Put(Key(i), value)); + } + dbfull()->Flush(FlushOptions()); + + // Wait for any write compaction to finish + dbfull()->TEST_WaitForCompact(); + + // remember number of files in each level + int l1 = NumTableFilesAtLevel(0); + int l2 = NumTableFilesAtLevel(1); + int l3 = NumTableFilesAtLevel(3); + ASSERT_NE(NumTableFilesAtLevel(0), 0); + ASSERT_NE(NumTableFilesAtLevel(1), 0); + ASSERT_NE(NumTableFilesAtLevel(2), 0); + + // read a bunch of times, trigger read compaction + for (int j = 0; j < 100; j++) { + for (int i = 0; i < 2000; i++) { + Get(Key(i)); + } + } + // wait for read compaction to finish + env_->SleepForMicroseconds(1000000); + + // verify that the number of files have decreased + // in some level, indicating that there was a compaction + ASSERT_TRUE(NumTableFilesAtLevel(0) < l1 || + NumTableFilesAtLevel(1) < l2 || + NumTableFilesAtLevel(2) < l3); + } +} + +// Multi-threaded test: +namespace { + +static const int kNumThreads = 4; +static const int kTestSeconds = 10; +static const int kNumKeys = 1000; + +struct MTState { + DBTest* test; + port::AtomicPointer stop; + port::AtomicPointer counter[kNumThreads]; + port::AtomicPointer thread_done[kNumThreads]; +}; + +struct MTThread { + MTState* state; + int id; +}; + +static void MTThreadBody(void* arg) { + MTThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->state->test->db_; + uintptr_t counter = 0; + fprintf(stderr, "... 
starting thread %d\n", id); + Random rnd(1000 + id); + std::string value; + char valbuf[1500]; + while (t->state->stop.Acquire_Load() == nullptr) { + t->state->counter[id].Release_Store(reinterpret_cast(counter)); + + int key = rnd.Uniform(kNumKeys); + char keybuf[20]; + snprintf(keybuf, sizeof(keybuf), "%016d", key); + + if (rnd.OneIn(2)) { + // Write values of the form . + // We add some padding for force compactions. + snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d", + key, id, static_cast(counter)); + ASSERT_OK(t->state->test->Put(Slice(keybuf), Slice(valbuf))); + } else { + // Read a value and verify that it matches the pattern written above. + Status s = db->Get(ReadOptions(), Slice(keybuf), &value); + if (s.IsNotFound()) { + // Key has not yet been written + } else { + // Check that the writer thread counter is >= the counter in the value + ASSERT_OK(s); + int k, w, c; + ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value; + ASSERT_EQ(k, key); + ASSERT_GE(w, 0); + ASSERT_LT(w, kNumThreads); + ASSERT_LE((unsigned int)c, reinterpret_cast( + t->state->counter[w].Acquire_Load())); + } + } + counter++; + } + t->state->thread_done[id].Release_Store(t); + fprintf(stderr, "... 
stopping thread %d after %d ops\n", id, int(counter)); +} + +} // namespace + +TEST(DBTest, MultiThreaded) { + do { + // Initialize state + MTState mt; + mt.test = this; + mt.stop.Release_Store(0); + for (int id = 0; id < kNumThreads; id++) { + mt.counter[id].Release_Store(0); + mt.thread_done[id].Release_Store(0); + } + + // Start threads + MTThread thread[kNumThreads]; + for (int id = 0; id < kNumThreads; id++) { + thread[id].state = &mt; + thread[id].id = id; + env_->StartThread(MTThreadBody, &thread[id]); + } + + // Let them run for a while + env_->SleepForMicroseconds(kTestSeconds * 1000000); + + // Stop the threads and wait for them to finish + mt.stop.Release_Store(&mt); + for (int id = 0; id < kNumThreads; id++) { + while (mt.thread_done[id].Acquire_Load() == nullptr) { + env_->SleepForMicroseconds(100000); + } + } + } while (ChangeOptions()); +} + +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + 
itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + delete itr; + + } while (ChangeOptions()); +} + +namespace { +typedef std::map KVMap; +} + +class ModelDB: public DB { + public: + class ModelSnapshot : public Snapshot { + public: + KVMap map_; + }; + + explicit ModelDB(const Options& options): options_(options) { } + virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { + return DB::Put(o, k, v); + } + virtual Status Merge(const WriteOptions& o, const Slice& k, const Slice& v) { + return DB::Merge(o, k, v); + } + virtual Status Delete(const WriteOptions& o, const Slice& key) { + return DB::Delete(o, key); + } + virtual Status Get(const ReadOptions& options, + const Slice& key, std::string* value) { + return Status::NotSupported(key); + } + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) { + std::vector s(keys.size(), + Status::NotSupported("Not implemented.")); + return s; + } + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { + return Status(); + } + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) { + if (value_found != nullptr) { + *value_found = false; + } + return true; // Not Supported directly + } + virtual Iterator* NewIterator(const ReadOptions& options) { + if (options.snapshot == nullptr) { + KVMap* saved = new KVMap; + *saved = map_; + return new ModelIter(saved, true); + } else { + const KVMap* snapshot_state = + &(reinterpret_cast(options.snapshot)->map_); + return new ModelIter(snapshot_state, false); + } + } + virtual const Snapshot* GetSnapshot() { + ModelSnapshot* snapshot = new ModelSnapshot; + snapshot->map_ = map_; + return snapshot; + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) { + 
delete reinterpret_cast(snapshot); + } + virtual Status Write(const WriteOptions& options, WriteBatch* batch) { + class Handler : public WriteBatch::Handler { + public: + KVMap* map_; + virtual void Put(const Slice& key, const Slice& value) { + (*map_)[key.ToString()] = value.ToString(); + } + virtual void Merge(const Slice& key, const Slice& value) { + // ignore merge for now + //(*map_)[key.ToString()] = value.ToString(); + } + virtual void Delete(const Slice& key) { + map_->erase(key.ToString()); + } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); + } + + virtual bool GetProperty(const Slice& property, std::string* value) { + return false; + } + virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + } + virtual Status CompactRange(const Slice* start, const Slice* end, + bool reduce_level, int target_level) { + return Status::NotSupported("Not supported operation."); + } + + virtual int NumberLevels() + { + return 1; + } + + virtual int MaxMemCompactionLevel() + { + return 1; + } + + virtual int Level0StopWriteTrigger() + { + return -1; + } + + virtual const std::string& GetName() const { + return name_; + } + + virtual Env* GetEnv() const { + return nullptr; + } + + virtual const Options& GetOptions() const { + return options_; + } + + virtual Status Flush(const rocksdb::FlushOptions& options) { + Status ret; + return ret; + } + + virtual Status DisableFileDeletions() { + return Status::OK(); + } + virtual Status EnableFileDeletions(bool force) { + return Status::OK(); + } + virtual Status GetLiveFiles(std::vector&, uint64_t* size, + bool flush_memtable = true) { + return Status::OK(); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) { + return Status::OK(); + } + + virtual Status DeleteFile(std::string name) { + return Status::OK(); + } + + virtual Status GetDbIdentity(std::string& identity) { + return Status::OK(); + } + + virtual 
SequenceNumber GetLatestSequenceNumber() const { + return 0; + } + virtual Status GetUpdatesSince( + rocksdb::SequenceNumber, unique_ptr*, + const TransactionLogIterator::ReadOptions& + read_options = TransactionLogIterator::ReadOptions()) { + return Status::NotSupported("Not supported in Model DB"); + } + + private: + class ModelIter: public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) { + } + ~ModelIter() { + if (owned_) delete map_; + } + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { + iter_ = map_->lower_bound(k.ToString()); + } + virtual void Next() { ++iter_; } + virtual void Prev() { --iter_; } + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + std::string name_ = ""; +}; + +static std::string RandomKey(Random* rnd, int minimum = 0) { + int len; + do { + len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? 
rnd->Skewed(10) : rnd->Uniform(10))); + } while (len < minimum); + return test::RandomKey(rnd, len); +} + +static bool CompareIterators(int step, + DB* model, + DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); + miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, + EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; + } + } + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. 
%d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } + } + delete miter; + delete dbiter; + return ok; +} + +TEST(DBTest, Randomized) { + Random rnd(test::RandomSeed()); + do { + ModelDB model(CurrentOptions()); + const int N = 10000; + const Snapshot* model_snap = nullptr; + const Snapshot* db_snap = nullptr; + std::string k, v; + for (int step = 0; step < N; step++) { + // TODO(sanjay): Test Get() works + int p = rnd.Uniform(100); + int minimum = 0; + if (option_config_ == kHashSkipList || + option_config_ == kHashLinkList || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex || + option_config_ == kBlockBasedTableWithPrefixHashIndex) { + minimum = 1; + } + if (p < 45) { // Put + k = RandomKey(&rnd, minimum); + v = RandomString(&rnd, + rnd.OneIn(20) + ? 100 + rnd.Uniform(100) + : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + + } else if (p < 90) { // Delete + k = RandomKey(&rnd, minimum); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + + + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd, minimum); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + // For DB instances that use the hash index + block-based table, the + // iterator will be invalid right when seeking a non-existent key, right + // than return a key that is close to it. 
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && + option_config_ != kBlockBasedTableWithPrefixHashIndex) { + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + } + + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + + Reopen(); + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } + } + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + } while (ChangeOptions(kSkipDeletesFilterFirst)); +} + +TEST(DBTest, MultiGetSimple) { + do { + ASSERT_OK(db_->Put(WriteOptions(),"k1","v1")); + ASSERT_OK(db_->Put(WriteOptions(),"k2","v2")); + ASSERT_OK(db_->Put(WriteOptions(),"k3","v3")); + ASSERT_OK(db_->Put(WriteOptions(),"k4","v4")); + ASSERT_OK(db_->Delete(WriteOptions(),"k4")); + ASSERT_OK(db_->Put(WriteOptions(),"k5","v5")); + ASSERT_OK(db_->Delete(WriteOptions(),"no_key")); + + std::vector keys(6); + keys[0] = "k1"; + keys[1] = "k2"; + keys[2] = "k3"; + keys[3] = "k4"; + keys[4] = "k5"; + keys[5] = "no_key"; + + std::vector values(20,"Temporary data to be overwritten"); + + std::vector s = db_->MultiGet(ReadOptions(),keys,&values); + ASSERT_EQ(values.size(),keys.size()); + ASSERT_EQ(values[0], "v1"); + ASSERT_EQ(values[1], "v2"); + ASSERT_EQ(values[2], "v3"); + ASSERT_EQ(values[4], "v5"); + + ASSERT_OK(s[0]); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_TRUE(s[3].IsNotFound()); + ASSERT_OK(s[4]); + ASSERT_TRUE(s[5].IsNotFound()); + } while (ChangeCompactOptions()); +} + +TEST(DBTest, MultiGetEmpty) { + do { + // Empty Key Set + std::vector keys; + std::vector values; + std::vector 
s = db_->MultiGet(ReadOptions(),keys,&values); + ASSERT_EQ((int)s.size(),0); + + // Empty Database, Empty Key Set + DestroyAndReopen(); + s = db_->MultiGet(ReadOptions(), keys, &values); + ASSERT_EQ((int)s.size(),0); + + // Empty Database, Search for Keys + keys.resize(2); + keys[0] = "a"; + keys[1] = "b"; + s = db_->MultiGet(ReadOptions(),keys,&values); + ASSERT_EQ((int)s.size(), 2); + ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); + } while (ChangeCompactOptions()); +} + +void PrefixScanInit(DBTest *dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_FlushMemTable(); + dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_FlushMemTable(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + std::string keystr; + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", + small_range_sstfiles+i+1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->dbfull()->TEST_FlushMemTable(); + } +} + +TEST(DBTest, PrefixScan) { + ReadOptions ro = ReadOptions(); + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.no_block_cache = true; + options.filter_policy = NewBloomFilterPolicy(10); + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.whole_key_filtering = false; + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.disable_seek_compaction = true; + // Tricky: options.prefix_extractor will be released by + // NewHashSkipListRepFactory after use. 
+ options.memtable_factory.reset(NewHashSkipListRepFactory()); + + // prefix specified, with blooms: 2 RAND I/Os + // SeekToFirst + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // prefix specified, with blooms: 2 RAND I/Os + // Seek + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + ro.prefix = &prefix; + iter = db_->NewIterator(ro); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + assert(iter->key().starts_with(prefix)); + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + + // no prefix specified: 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! 
iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 11); + Close(); + delete options.filter_policy; +} + +std::string MakeKey(unsigned int num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%016u", num); + return std::string(buf); +} + +void BM_LogAndApply(int iters, int num_base_files) { + std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark"; + ASSERT_OK(DestroyDB(dbname, Options())); + + DB* db = nullptr; + Options opts; + opts.create_if_missing = true; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + Env* env = Env::Default(); + + port::Mutex mu; + MutexLock l(&mu); + + InternalKeyComparator cmp(BytewiseComparator()); + Options options; + EnvOptions sopt; + VersionSet vset(dbname, &options, sopt, nullptr, &cmp); + ASSERT_OK(vset.Recover()); + VersionEdit vbase; + uint64_t fnum = 1; + for (int i = 0; i < num_base_files; i++) { + InternalKey start(MakeKey(2*fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vbase.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1); + } + ASSERT_OK(vset.LogAndApply(&vbase, &mu)); + + uint64_t start_micros = env->NowMicros(); + + for (int i = 0; i < iters; i++) { + VersionEdit vedit; + vedit.DeleteFile(2, fnum); + InternalKey start(MakeKey(2*fnum), 1, kTypeValue); + InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); + vedit.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1); + vset.LogAndApply(&vedit, &mu); + } + uint64_t stop_micros = env->NowMicros(); + unsigned int us = stop_micros - start_micros; + char buf[16]; + snprintf(buf, sizeof(buf), "%d", num_base_files); + fprintf(stderr, + "BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n", + buf, iters, us, ((float)us) / iters); +} + +TEST(DBTest, TailingIteratorSingle) { + ReadOptions read_options; + 
read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + ASSERT_TRUE(!iter->Valid()); + + // add a record and check that iter can see it + ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "mirko"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + +TEST(DBTest, TailingIteratorKeepAdding) { + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + std::string value(1024, 'a'); + + const int num_records = 10000; + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%016d", i); + + Slice key(buf, 16); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST(DBTest, TailingIteratorDeletes) { + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + + // write a single record, read it using the iterator, then delete it + ASSERT_OK(db_->Put(WriteOptions(), "0test", "test")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0test"); + ASSERT_OK(db_->Delete(WriteOptions(), "0test")); + + // write many more records + const int num_records = 10000; + std::string value(1024, 'A'); + + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "1%015d", i); + + Slice key(buf, 16); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + } + + // force a flush to make sure that no records are read from memtable + dbfull()->TEST_FlushMemTable(); + + // skip "0test" + iter->Next(); + + // make sure we can read all new records using the existing iterator + int count = 0; + for (; iter->Valid(); iter->Next(), ++count) ; + + ASSERT_EQ(count, num_records); +} + +TEST(DBTest, TailingIteratorPrefixSeek) { + ReadOptions 
read_options; + read_options.tailing = true; + read_options.prefix_seek = true; + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_factory.reset(NewHashSkipListRepFactory()); + DestroyAndReopen(&options); + + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(db_->Put(WriteOptions(), "0101", "test")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(db_->Put(WriteOptions(), "0202", "test")); + + // Seek(0102) shouldn't find any records since 0202 has a different prefix + iter->Seek("0102"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("0202"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0202"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + + +} // namespace rocksdb + +int main(int argc, char** argv) { + if (argc > 1 && std::string(argv[1]) == "--benchmark") { + rocksdb::BM_LogAndApply(1000, 1); + rocksdb::BM_LogAndApply(1000, 100); + rocksdb::BM_LogAndApply(1000, 10000); + rocksdb::BM_LogAndApply(100, 100000); + return 0; + } + + return rocksdb::test::RunAllTests(); +} diff --git a/db/dbformat.cc b/db/dbformat.cc new file mode 100644 index 00000000..2d35d042 --- /dev/null +++ b/db/dbformat.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/dbformat.h" + +#include +#include "port/port.h" +#include "util/coding.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(t <= kValueTypeForSeek); + return (seq << 8) | t; +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +std::string ParsedInternalKey::DebugString(bool hex) const { + char buf[50]; + snprintf(buf, sizeof(buf), "' @ %llu : %d", + (unsigned long long) sequence, + int(type)); + std::string result = "'"; + result += user_key.ToString(hex); + result += buf; + return result; +} + +std::string InternalKey::DebugString(bool hex) const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed)) { + result = parsed.DebugString(hex); + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + return result; +} + +const char* InternalKeyComparator::Name() const { + return name_.c_str(); +} + +int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + BumpPerfCount(&perf_context.user_key_comparison_count); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // 
decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_->Compare(a.user_key, b.user_key); + BumpPerfCount(&perf_context.user_key_comparison_count); + if (r == 0) { + if (a.sequence > b.sequence) { + r = -1; + } else if (a.sequence < b.sequence) { + r = +1; + } else if (a.type > b.type) { + r = -1; + } else if (a.type < b.type) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + user_comparator_->FindShortestSeparator(&tmp, user_limit); + if (tmp.size() < user_start.size() && + user_comparator_->Compare(user_start, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. + PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*start, tmp) < 0); + assert(this->Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void InternalKeyComparator::FindShortSuccessor(std::string* key) const { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + user_comparator_->FindShortSuccessor(&tmp); + if (tmp.size() < user_key.size() && + user_comparator_->Compare(user_key, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. 
+ PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + assert(this->Compare(*key, tmp) < 0); + key->swap(tmp); + } +} + +const char* InternalFilterPolicy::Name() const { + return user_policy_->Name(); +} + +void InternalFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We rely on the fact that the code in table.cc does not mind us + // adjusting keys[]. + Slice* mkey = const_cast(keys); + for (int i = 0; i < n; i++) { + mkey[i] = ExtractUserKey(keys[i]); + // TODO(sanjay): Suppress dups? + } + user_policy_->CreateFilter(keys, n, dst); +} + +bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const { + return user_policy_->KeyMayMatch(ExtractUserKey(key), f); +} + +LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { + size_t usize = user_key.size(); + size_t needed = usize + 13; // A conservative estimate + char* dst; + if (needed <= sizeof(space_)) { + dst = space_; + } else { + dst = new char[needed]; + } + start_ = dst; + dst = EncodeVarint32(dst, usize + 8); + kstart_ = dst; + memcpy(dst, user_key.data(), usize); + dst += usize; + EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); + dst += 8; + end_ = dst; +} + +} // namespace rocksdb diff --git a/db/dbformat.h b/db/dbformat.h new file mode 100644 index 00000000..2bddd3f5 --- /dev/null +++ b/db/dbformat.h @@ -0,0 +1,335 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/types.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace rocksdb { + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +// The highest bit of the value type needs to be reserved to SST tables +// for them to do more flexible encoding. +enum ValueType : unsigned char { + kTypeDeletion = 0x0, + kTypeValue = 0x1, + kTypeMerge = 0x2, + kTypeLogData = 0x3, + kMaxValue = 0x7F +}; + +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +static const ValueType kValueTypeForSeek = kTypeMerge; + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = + ((0x1ull << 56) - 1); + +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() { } // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) { } + std::string DebugString(bool hex = false) const; +}; + +// Return the length of the encoding of "key". 
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { + return key.user_key.size() + 8; +} + +extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t); + +// Append the serialization of "key" to *result. +extern void AppendInternalKey(std::string* result, + const ParsedInternalKey& key); + +// Attempt to parse an internal key from "internal_key". On success, +// stores the parsed data in "*result", and returns true. +// +// On error, returns false, leaves "*result" in an undefined state. +extern bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result); + +// Returns the user key portion of an internal key. +inline Slice ExtractUserKey(const Slice& internal_key) { + assert(internal_key.size() >= 8); + return Slice(internal_key.data(), internal_key.size() - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + assert(internal_key.size() >= 8); + const size_t n = internal_key.size(); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + return static_cast(c); +} + +// A comparator for internal keys that uses a specified comparator for +// the user key portion and breaks ties by decreasing sequence number. 
+class InternalKeyComparator : public Comparator { + private: + const Comparator* user_comparator_; + std::string name_; + public: + explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c), + name_("rocksdb.InternalKeyComparator:" + + std::string(user_comparator_->Name())) { + } + virtual ~InternalKeyComparator() {} + + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const InternalKey& a, const InternalKey& b) const; + int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; +}; + +// Filter policy wrapper that converts from internal keys to user keys +class InternalFilterPolicy : public FilterPolicy { + private: + const FilterPolicy* const user_policy_; + public: + explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } + virtual const char* Name() const; + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const; + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; +}; + +// Modules in this directory should keep internal keys wrapped inside +// the following class instead of plain strings so that we do not +// incorrectly use string comparisons instead of an InternalKeyComparator. 
+class InternalKey { + private: + std::string rep_; + public: + InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + } + + void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } + Slice Encode() const { + assert(!rep_.empty()); + return rep_; + } + + Slice user_key() const { return ExtractUserKey(rep_); } + + void SetFrom(const ParsedInternalKey& p) { + rep_.clear(); + AppendInternalKey(&rep_, p); + } + + void Clear() { rep_.clear(); } + + std::string DebugString(bool hex = false) const; +}; + +inline int InternalKeyComparator::Compare( + const InternalKey& a, const InternalKey& b) const { + return Compare(a.Encode(), b.Encode()); +} + +inline bool ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result) { + const size_t n = internal_key.size(); + if (n < 8) return false; + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + unsigned char c = num & 0xff; + result->sequence = num >> 8; + result->type = static_cast(c); + assert(result->type <= ValueType::kMaxValue); + result->user_key = Slice(internal_key.data(), n - 8); + return (c <= static_cast(kValueTypeForSeek)); +} + +// Update the sequence number in the internal key +inline void UpdateInternalKey(char* internal_key, + const size_t internal_key_size, + uint64_t seq, ValueType t) { + assert(internal_key_size >= 8); + char* seqtype = internal_key + internal_key_size - 8; + uint64_t newval = (seq << 8) | t; + EncodeFixed64(seqtype, newval); +} + +// Get the sequence number from the internal key +inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { + const size_t n = internal_key.size(); + assert(n >= 8); + uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + return num >> 8; +} + + +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a 
snapshot with + // the specified sequence number. + LookupKey(const Slice& user_key, SequenceNumber sequence); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { return Slice(start_, end_ - start_); } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + + // Return the user key + Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. + const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + +class IterKey { + public: + IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {} + + ~IterKey() { ResetBuffer(); } + + Slice GetKey() const { return Slice(key_, key_size_); } + + void Clear() { key_size_ = 0; } + + void SetUserKey(const Slice& user_key) { + size_t size = user_key.size(); + EnlargeBufferIfNeeded(size); + memcpy(key_, user_key.data(), size); + key_size_ = size; + } + + void SetInternalKey(const Slice& user_key, SequenceNumber s, + ValueType value_type = kValueTypeForSeek) { + size_t usize = user_key.size(); + EnlargeBufferIfNeeded(usize + sizeof(uint64_t)); + memcpy(key_, user_key.data(), usize); + EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type)); + key_size_ = usize + sizeof(uint64_t); + } + + void SetInternalKey(const ParsedInternalKey& parsed_key) { + SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type); + } + + private: + char* key_; + 
size_t buf_size_; + size_t key_size_; + char space_[32]; // Avoid allocation for short keys + + void ResetBuffer() { + if (key_ != nullptr && key_ != space_) { + delete[] key_; + } + key_ = space_; + buf_size_ = sizeof(buf_size_); + key_size_ = 0; + } + + // Enlarge the buffer size if needed based on key_size. + // By default, static allocated buffer is used. Once there is a key + // larger than the static allocated buffer, another buffer is dynamically + // allocated, until a larger key buffer is requested. In that case, we + // reallocate buffer and delete the old one. + void EnlargeBufferIfNeeded(size_t key_size) { + // If size is smaller than buffer size, continue using current buffer, + // or the static allocated one, as default + if (key_size > buf_size_) { + // Need to enlarge the buffer. + ResetBuffer(); + key_ = new char[key_size]; + buf_size_ = key_size; + } + } + + // No copying allowed + IterKey(const IterKey&) = delete; + void operator=(const IterKey&) = delete; +}; + +class InternalKeySliceTransform : public SliceTransform { + public: + explicit InternalKeySliceTransform(const SliceTransform* transform) + : transform_(transform) {} + + virtual const char* Name() const { return transform_->Name(); } + + virtual Slice Transform(const Slice& src) const { + auto user_key = ExtractUserKey(src); + return transform_->Transform(user_key); + } + + virtual bool InDomain(const Slice& src) const { + auto user_key = ExtractUserKey(src); + return transform_->InDomain(user_key); + } + + virtual bool InRange(const Slice& dst) const { + auto user_key = ExtractUserKey(dst); + return transform_->InRange(user_key); + } + + const SliceTransform* user_prefix_extractor() const { return transform_; } + + private: + // Like comparator, InternalKeySliceTransform will not take care of the + // deletion of transform_ + const SliceTransform* const transform_; +}; + +} // namespace rocksdb diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc new file mode 100644 index 
00000000..b520f3c4 --- /dev/null +++ b/db/dbformat_test.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/dbformat.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +static std::string IKey(const std::string& user_key, + uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); + return result; +} + +static void TestKey(const std::string& key, + uint64_t seq, + ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); +} + +class FormatTest { }; + +TEST(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; + const uint64_t seq[] = { + 1, 2, 3, + (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, + (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, + (1ull 
<< 32) - 1, 1ull << 32, (1ull << 32) + 1 + }; + for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 101, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeDeletion))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), + IKey("hello", 200, kTypeValue))); + + // When start user key is prefix of limit user key + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ(IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), + IKey("foo", 200, kTypeValue))); +} + +TEST(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc new file mode 100644 index 00000000..14f0324c 
--- /dev/null +++ b/db/deletefile_test.cc @@ -0,0 +1,295 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/db.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "rocksdb/env.h" +#include "rocksdb/transaction_log.h" +#include +#include +#include +#include + +namespace rocksdb { + +class DeleteFileTest { + public: + std::string dbname_; + Options options_; + DB* db_; + Env* env_; + int numlevels_; + + DeleteFileTest() { + db_ = nullptr; + env_ = Env::Default(); + options_.write_buffer_size = 1024*1024*1000; + options_.target_file_size_base = 1024*1024*1000; + options_.max_bytes_for_level_base = 1024*1024*1000; + options_.WAL_ttl_seconds = 300; // Used to test log files + options_.WAL_size_limit_MB = 1024; // Used to test log files + dbname_ = test::TmpDir() + "/deletefile_test"; + options_.wal_dir = dbname_ + "/wal_files"; + + // clean up all the files that might have been there before + std::vector old_files; + env_->GetChildren(dbname_, &old_files); + for (auto file : old_files) { + env_->DeleteFile(dbname_ + "/" + file); + } + env_->GetChildren(options_.wal_dir, &old_files); + for (auto file : old_files) { + env_->DeleteFile(options_.wal_dir + "/" + file); + } + + DestroyDB(dbname_, options_); + numlevels_ = 7; + ASSERT_OK(ReopenDB(true)); + } + + Status ReopenDB(bool create) { + delete db_; + if (create) { + DestroyDB(dbname_, options_); + } + db_ 
= nullptr; + options_.create_if_missing = create; + return DB::Open(options_, dbname_, &db_); + } + + void CloseDB() { + delete db_; + } + + void AddKeys(int numkeys, int startkey = 0) { + WriteOptions options; + options.sync = false; + ReadOptions roptions; + for (int i = startkey; i < (numkeys + startkey) ; i++) { + std::string temp = std::to_string(i); + Slice key(temp); + Slice value(temp); + ASSERT_OK(db_->Put(options, key, value)); + } + } + + int numKeysInLevels( + std::vector &metadata, + std::vector *keysperlevel = nullptr) { + + if (keysperlevel != nullptr) { + keysperlevel->resize(numlevels_); + } + + int numKeys = 0; + for (size_t i = 0; i < metadata.size(); i++) { + int startkey = atoi(metadata[i].smallestkey.c_str()); + int endkey = atoi(metadata[i].largestkey.c_str()); + int numkeysinfile = (endkey - startkey + 1); + numKeys += numkeysinfile; + if (keysperlevel != nullptr) { + (*keysperlevel)[(int)metadata[i].level] += numkeysinfile; + } + fprintf(stderr, "level %d name %s smallest %s largest %s\n", + metadata[i].level, metadata[i].name.c_str(), + metadata[i].smallestkey.c_str(), + metadata[i].largestkey.c_str()); + } + return numKeys; + } + + void CreateTwoLevels() { + AddKeys(50000, 10000); + DBImpl* dbi = reinterpret_cast(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + + AddKeys(50000, 10000); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + } + + void CheckFileTypeCounts(std::string& dir, + int required_log, + int required_sst, + int required_manifest) { + std::vector filenames; + env_->GetChildren(dir, &filenames); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kLogFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(required_log, log_cnt); + ASSERT_EQ(required_sst, 
sst_cnt); + ASSERT_EQ(required_manifest, manifest_cnt); + } + +}; + +TEST(DeleteFileTest, AddKeysAndQueryLevels) { + CreateTwoLevels(); + std::vector metadata; + std::vector keysinlevel; + db_->GetLiveFilesMetaData(&metadata); + + std::string level1file = ""; + int level1keycount = 0; + std::string level2file = ""; + int level2keycount = 0; + int level1index = 0; + int level2index = 1; + + ASSERT_EQ((int)metadata.size(), 2); + if (metadata[0].level == 2) { + level1index = 1; + level2index = 0; + } + + level1file = metadata[level1index].name; + int startkey = atoi(metadata[level1index].smallestkey.c_str()); + int endkey = atoi(metadata[level1index].largestkey.c_str()); + level1keycount = (endkey - startkey + 1); + level2file = metadata[level2index].name; + startkey = atoi(metadata[level2index].smallestkey.c_str()); + endkey = atoi(metadata[level2index].largestkey.c_str()); + level2keycount = (endkey - startkey + 1); + + // COntrolled setup. Levels 1 and 2 should both have 50K files. + // This is a little fragile as it depends on the current + // compaction heuristics. + ASSERT_EQ(level1keycount, 50000); + ASSERT_EQ(level2keycount, 50000); + + Status status = db_->DeleteFile("0.sst"); + ASSERT_TRUE(status.IsInvalidArgument()); + + // intermediate level files cannot be deleted. + status = db_->DeleteFile(level1file); + ASSERT_TRUE(status.IsInvalidArgument()); + + // Lowest level file deletion should succeed. 
+ ASSERT_OK(db_->DeleteFile(level2file)); + + CloseDB(); +} + +TEST(DeleteFileTest, PurgeObsoleteFilesTest) { + CreateTwoLevels(); + // there should be only one (empty) log file because CreateTwoLevels() + // flushes the memtables to disk + CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + // 2 ssts, 1 manifest + CheckFileTypeCounts(dbname_, 0, 2, 1); + std::string first("0"), last("999999"); + Slice first_slice(first), last_slice(last); + db_->CompactRange(&first_slice, &last_slice, true, 2); + // 1 sst after compaction + CheckFileTypeCounts(dbname_, 0, 1, 1); + + // this time, we keep an iterator alive + ReopenDB(true); + Iterator *itr = 0; + CreateTwoLevels(); + itr = db_->NewIterator(ReadOptions()); + db_->CompactRange(&first_slice, &last_slice, true, 2); + // 3 sst after compaction with live iterator + CheckFileTypeCounts(dbname_, 0, 3, 1); + delete itr; + // 1 sst after iterator deletion + CheckFileTypeCounts(dbname_, 0, 1, 1); + + CloseDB(); +} + +TEST(DeleteFileTest, DeleteFileWithIterator) { + CreateTwoLevels(); + ReadOptions options; + Iterator* it = db_->NewIterator(options); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + + std::string level2file = ""; + + ASSERT_EQ((int)metadata.size(), 2); + if (metadata[0].level == 1) { + level2file = metadata[1].name; + } else { + level2file = metadata[0].name; + } + + Status status = db_->DeleteFile(level2file); + fprintf(stdout, "Deletion status %s: %s\n", + level2file.c_str(), status.ToString().c_str()); + ASSERT_TRUE(status.ok()); + it->SeekToFirst(); + int numKeysIterated = 0; + while(it->Valid()) { + numKeysIterated++; + it->Next(); + } + ASSERT_EQ(numKeysIterated, 50000); + delete it; + CloseDB(); +} + +TEST(DeleteFileTest, DeleteLogFiles) { + AddKeys(10, 0); + VectorLogPtr logfiles; + db_->GetSortedWalFiles(logfiles); + ASSERT_GT(logfiles.size(), 0UL); + // Take the last log file which is expected to be alive and try to delete it + // Should not succeed because live logs are not allowed to 
be deleted + std::unique_ptr alive_log = std::move(logfiles.back()); + ASSERT_EQ(alive_log->Type(), kAliveLogFile); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + fprintf(stdout, "Deleting alive log file %s\n", + alive_log->PathName().c_str()); + ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + logfiles.clear(); + + // Call Flush to bring about a new working log file and add more keys + // Call Flush again to flush out memtable and move alive log to archived log + // and try to delete the archived log file + FlushOptions fopts; + db_->Flush(fopts); + AddKeys(10, 0); + db_->Flush(fopts); + db_->GetSortedWalFiles(logfiles); + ASSERT_GT(logfiles.size(), 0UL); + std::unique_ptr archived_log = std::move(logfiles.front()); + ASSERT_EQ(archived_log->Type(), kArchivedLogFile); + ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + + archived_log->PathName())); + fprintf(stdout, "Deleting archived log file %s\n", + archived_log->PathName().c_str()); + ASSERT_OK(db_->DeleteFile(archived_log->PathName())); + ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" + + archived_log->PathName())); + CloseDB(); +} + +} //namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} + diff --git a/db/filename.cc b/db/filename.cc new file mode 100644 index 00000000..cdbd1bc7 --- /dev/null +++ b/db/filename.cc @@ -0,0 +1,266 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "db/filename.h" + +#include +#include +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "util/logging.h" + +namespace rocksdb { + +// Given a path, flatten the path name by replacing all chars not in +// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end. +// Return the number of chars stored in dest not including the trailing '\0'. +static int FlattenPath(const std::string& path, char* dest, int len) { + int write_idx = 0; + int i = 0; + int src_len = path.size(); + + while (i < src_len && write_idx < len - 1) { + if ((path[i] >= 'a' && path[i] <= 'z') || + (path[i] >= '0' && path[i] <= '9') || + (path[i] >= 'A' && path[i] <= 'Z') || + path[i] == '-' || + path[i] == '.' || + path[i] == '_'){ + dest[write_idx++] = path[i]; + } else { + if (i > 0) + dest[write_idx++] = '_'; + } + i++; + } + + dest[write_idx] = '\0'; + return write_idx; +} + +// A utility routine: write "data" to the named file and Sync() it. +extern Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname); + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "/%06llu.%s", + static_cast(number), + suffix); + return name + buf; +} + +std::string LogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "log"); +} + +std::string ArchivalDirectory(const std::string& dir) { + return dir + "/" + ARCHIVAL_DIR; +} +std::string ArchivedLogFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log"); +} + +std::string TableFileName(const std::string& name, uint64_t number) { + assert(number > 0); + return MakeFileName(name, number, "sst"); +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + assert(number > 0); + char buf[100]; + snprintf(buf, 
sizeof(buf), "/MANIFEST-%06llu", + static_cast(number)); + return dbname + buf; +} + +std::string CurrentFileName(const std::string& dbname) { + return dbname + "/CURRENT"; +} + +std::string LockFileName(const std::string& dbname) { + return dbname + "/LOCK"; +} + +std::string TempFileName(const std::string& dbname, uint64_t number) { + assert(number >= 0); + return MakeFileName(dbname, number, "dbtmp"); +} + +std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path, const std::string& log_dir) { + if (log_dir.empty()) + return dbname + "/LOG"; + + char flatten_db_path[256]; + FlattenPath(db_path, flatten_db_path, 256); + return log_dir + "/" + flatten_db_path + "_LOG"; +} + +// Return the name of the old info log file for "dbname". +std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path, const std::string& log_dir) { + char buf[50]; + snprintf(buf, sizeof(buf), "%llu", static_cast(ts)); + + if (log_dir.empty()) + return dbname + "/LOG.old." + buf; + + char flatten_db_path[256]; + FlattenPath(db_path, flatten_db_path, 256); + return log_dir + "/" + flatten_db_path + "_LOG.old." 
+ buf; +} + +std::string MetaDatabaseName(const std::string& dbname, uint64_t number) { + char buf[100]; + snprintf(buf, sizeof(buf), "/METADB-%llu", + static_cast(number)); + return dbname + buf; +} + +std::string IdentityFileName(const std::string& dbname) { + return dbname + "/IDENTITY"; +} + +// Owned filenames have the form: +// dbname/IDENTITY +// dbname/CURRENT +// dbname/LOCK +// dbname/LOG +// dbname/LOG.old.[0-9]+ +// dbname/MANIFEST-[0-9]+ +// dbname/[0-9]+.(log|sst) +// dbname/METADB-[0-9]+ +// Disregards / at the beginning +bool ParseFileName(const std::string& fname, + uint64_t* number, + FileType* type, + WalFileType* log_type) { + Slice rest(fname); + if (fname.length() > 1 && fname[0] == '/') { + rest.remove_prefix(1); + } + if (rest == "IDENTITY") { + *number = 0; + *type = kIdentityFile; + } else if (rest == "CURRENT") { + *number = 0; + *type = kCurrentFile; + } else if (rest == "LOCK") { + *number = 0; + *type = kDBLockFile; + } else if (rest == "LOG" || rest == "LOG.old") { + *number = 0; + *type = kInfoLogFile; + } else if (rest.starts_with("LOG.old.")) { + uint64_t ts_suffix; + // sizeof also counts the trailing '\0'. 
+ rest.remove_prefix(sizeof("LOG.old.") - 1); + if (!ConsumeDecimalNumber(&rest, &ts_suffix)) { + return false; + } + *number = ts_suffix; + *type = kInfoLogFile; + } else if (rest.starts_with("MANIFEST-")) { + rest.remove_prefix(strlen("MANIFEST-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kDescriptorFile; + *number = num; + } else if (rest.starts_with("METADB-")) { + rest.remove_prefix(strlen("METADB-")); + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + if (!rest.empty()) { + return false; + } + *type = kMetaDatabase; + *number = num; + } else { + // Avoid strtoull() to keep filename format independent of the + // current locale + bool archive_dir_found = false; + if (rest.starts_with(ARCHIVAL_DIR)) { + if (rest.size() <= ARCHIVAL_DIR.size()) { + return false; + } + rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also + if (log_type) { + *log_type = kArchivedLogFile; + } + archive_dir_found = true; + } + uint64_t num; + if (!ConsumeDecimalNumber(&rest, &num)) { + return false; + } + Slice suffix = rest; + if (suffix == Slice(".log")) { + *type = kLogFile; + if (log_type && !archive_dir_found) { + *log_type = kAliveLogFile; + } + } else if (archive_dir_found) { + return false; // Archive dir can contain only log files + } else if (suffix == Slice(".sst")) { + *type = kTableFile; + } else if (suffix == Slice(".dbtmp")) { + *type = kTempFile; + } else { + return false; + } + *number = num; + } + return true; +} + +Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number) { + // Remove leading "dbname/" and add newline to manifest file name + std::string manifest = DescriptorFileName(dbname, descriptor_number); + Slice contents = manifest; + assert(contents.starts_with(dbname + "/")); + contents.remove_prefix(dbname.size() + 1); + std::string tmp = TempFileName(dbname, descriptor_number); + 
Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp); + if (s.ok()) { + s = env->RenameFile(tmp, CurrentFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +Status SetIdentityFile(Env* env, const std::string& dbname) { + std::string id = env->GenerateUniqueId(); + assert(!id.empty()); + // Reserve the filename dbname/000000.dbtmp for the temporary identity file + std::string tmp = TempFileName(dbname, 0); + Status s = WriteStringToFileSync(env, id, tmp); + if (s.ok()) { + s = env->RenameFile(tmp, IdentityFileName(dbname)); + } + if (!s.ok()) { + env->DeleteFile(tmp); + } + return s; +} + +} // namespace rocksdb diff --git a/db/filename.h b/db/filename.h new file mode 100644 index 00000000..8e55f113 --- /dev/null +++ b/db/filename.h @@ -0,0 +1,108 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// File names used by DB code + +#pragma once +#include +#include +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/transaction_log.h" +#include "port/port.h" + +namespace rocksdb { + +class Env; + +enum FileType { + kLogFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile +}; + +// Return the name of the log file with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". 
+extern std::string LogFileName(const std::string& dbname, uint64_t number); + +static const std::string ARCHIVAL_DIR = "archive"; + +extern std::string ArchivalDirectory(const std::string& dbname); + +// Return the name of the archived log file with the specified number +// in the db named by "dbname". The result will be prefixed with "dbname". +extern std::string ArchivedLogFileName(const std::string& dbname, + uint64_t num); + +// Return the name of the sstable with the specified number +// in the db named by "dbname". The result will be prefixed with +// "dbname". +extern std::string TableFileName(const std::string& dbname, uint64_t number); + +// Return the name of the descriptor file for the db named by +// "dbname" and the specified incarnation number. The result will be +// prefixed with "dbname". +extern std::string DescriptorFileName(const std::string& dbname, + uint64_t number); + +// Return the name of the current file. This file contains the name +// of the current manifest file. The result will be prefixed with +// "dbname". +extern std::string CurrentFileName(const std::string& dbname); + +// Return the name of the lock file for the db named by +// "dbname". The result will be prefixed with "dbname". +extern std::string LockFileName(const std::string& dbname); + +// Return the name of a temporary file owned by the db named "dbname". +// The result will be prefixed with "dbname". +extern std::string TempFileName(const std::string& dbname, uint64_t number); + +// Return the name of the info log file for "dbname". +extern std::string InfoLogFileName(const std::string& dbname, + const std::string& db_path="", const std::string& log_dir=""); + +// Return the name of the old info log file for "dbname". +extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, + const std::string& db_path="", const std::string& log_dir=""); + +// Return the name to use for a metadatabase. The result will be prefixed with +// "dbname". 
+extern std::string MetaDatabaseName(const std::string& dbname, + uint64_t number); + +// Return the name of the Identity file which stores a unique number for the db +// that will get regenerated if the db loses all its data and is recreated fresh +// either from a backup-image or empty +extern std::string IdentityFileName(const std::string& dbname); + +// If filename is a rocksdb file, store the type of the file in *type. +// The number encoded in the filename is stored in *number. If the +// filename was successfully parsed, returns true. Else return false. +extern bool ParseFileName(const std::string& filename, + uint64_t* number, + FileType* type, + WalFileType* log_type = nullptr); + +// Make the CURRENT file point to the descriptor file with the +// specified number. +extern Status SetCurrentFile(Env* env, const std::string& dbname, + uint64_t descriptor_number); + +// Make the IDENTITY file for the db +extern Status SetIdentityFile(Env* env, const std::string& dbname); + +} // namespace rocksdb diff --git a/db/filename_test.cc b/db/filename_test.cc new file mode 100644 index 00000000..0baa7fda --- /dev/null +++ b/db/filename_test.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/filename.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +class FileNameTest { }; + +TEST(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + FileType type; + } cases[] = { + { "100.log", 100, kLogFile }, + { "0.log", 0, kLogFile }, + { "0.sst", 0, kTableFile }, + { "CURRENT", 0, kCurrentFile }, + { "LOCK", 0, kDBLockFile }, + { "MANIFEST-2", 2, kDescriptorFile }, + { "MANIFEST-7", 7, kDescriptorFile }, + { "METADB-2", 2, kMetaDatabase }, + { "METADB-7", 7, kMetaDatabase }, + { "LOG", 0, kInfoLogFile }, + { "LOG.old", 0, kInfoLogFile }, + { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, + }; + for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + std::string f = cases[i].fname; + ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; + ASSERT_EQ(cases[i].type, type) << f; + ASSERT_EQ(cases[i].number, number) << f; + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "META", + "METADB", + "METADB-", + "XMETADB-3", + "METADB-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop" + }; + for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; + }; +} + +TEST(FileNameTest, Construction) { + uint64_t number; + FileType type; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + 
ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(192U, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName("bar", 200); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(200U, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(999U, number); + ASSERT_EQ(kTempFile, type); + + fname = MetaDatabaseName("met", 100); + ASSERT_EQ("met/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kMetaDatabase, type); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/internal_stats.cc b/db/internal_stats.cc new file mode 100644 index 00000000..4cc04996 --- /dev/null +++ b/db/internal_stats.cc @@ -0,0 +1,361 @@ +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/internal_stats.h" +#include "db/db_impl.h" +#include "db/memtable_list.h" + +#include + +namespace rocksdb { + +DBPropertyType GetPropertyType(const Slice& property) { + Slice in = property; + Slice prefix("rocksdb."); + if (!in.starts_with(prefix)) return kUnknown; + in.remove_prefix(prefix.size()); + + if (in.starts_with("num-files-at-level")) { + return kNumFilesAtLevel; + } else if (in == "levelstats") { + return kLevelStats; + } else if (in == "stats") { + return kStats; + } else if (in == "sstables") { + return kSsTables; + } else if (in == "num-immutable-mem-table") { + return kNumImmutableMemTable; + } else if (in == "mem-table-flush-pending") { + return kMemtableFlushPending; + } else if (in == "compaction-pending") { + return kCompactionPending; + } else if (in == "background-errors") { + return kBackgroundErrors; + } else if (in == "cur-size-active-mem-table") { + return kCurSizeActiveMemTable; + } + return kUnknown; +} + +bool InternalStats::GetProperty(DBPropertyType property_type, + const Slice& property, std::string* value, + DBImpl* db) { + VersionSet* version_set = db->versions_.get(); + Version* current = version_set->current(); + const MemTableList& imm = db->imm_; + Slice in = property; + + switch (property_type) { + case kNumFilesAtLevel: { + in.remove_prefix(strlen("rocksdb.num-files-at-level")); + uint64_t level; + bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); + if (!ok || (int)level >= number_levels_) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", + current->NumLevelFiles(static_cast(level))); + *value = buf; + return true; + } + } + case kLevelStats: { + char buf[1000]; + snprintf(buf, sizeof(buf), + "Level Files Size(MB)\n" + "--------------------\n"); + value->append(buf); + + for (int level = 0; level < number_levels_; level++) { + snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, + current->NumLevelFiles(level), + current->NumLevelBytes(level) / 1048576.0); + 
value->append(buf); + } + return true; + } + case kStats: { + char buf[1000]; + + uint64_t wal_bytes = 0; + uint64_t wal_synced = 0; + uint64_t user_bytes_written = 0; + uint64_t write_other = 0; + uint64_t write_self = 0; + uint64_t write_with_wal = 0; + uint64_t total_bytes_written = 0; + uint64_t total_bytes_read = 0; + uint64_t micros_up = env_->NowMicros() - started_at_; + // Add "+1" to make sure seconds_up is > 0 and avoid NaN later + double seconds_up = (micros_up + 1) / 1000000.0; + uint64_t total_slowdown = 0; + uint64_t total_slowdown_count = 0; + uint64_t interval_bytes_written = 0; + uint64_t interval_bytes_read = 0; + uint64_t interval_bytes_new = 0; + double interval_seconds_up = 0; + + if (statistics_) { + wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES); + wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED); + user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN); + write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER); + write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF); + write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL); + } + + // Pardon the long line but I think it is easier to read this way. 
+ snprintf( + buf, sizeof(buf), + " Compactions\n" + "Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) " + " " + "Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn " + "Rnp1 " + " Wnp1 NewW Count msComp msStall Ln-stall Stall-cnt\n" + "--------------------------------------------------------------------" + "--" + "--------------------------------------------------------------------" + "--" + "----------------------------------------------------------------\n"); + value->append(buf); + for (int level = 0; level < number_levels_; level++) { + int files = current->NumLevelFiles(level); + if (compaction_stats_[level].micros > 0 || files > 0) { + int64_t bytes_read = compaction_stats_[level].bytes_readn + + compaction_stats_[level].bytes_readnp1; + int64_t bytes_new = compaction_stats_[level].bytes_written - + compaction_stats_[level].bytes_readnp1; + double amplify = + (compaction_stats_[level].bytes_readn == 0) + ? 0.0 + : (compaction_stats_[level].bytes_written + + compaction_stats_[level].bytes_readnp1 + + compaction_stats_[level].bytes_readn) / + (double)compaction_stats_[level].bytes_readn; + + total_bytes_read += bytes_read; + total_bytes_written += compaction_stats_[level].bytes_written; + + uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] + + stall_counts_[LEVEL0_NUM_FILES] + + stall_counts_[MEMTABLE_COMPACTION]) + : stall_leveln_slowdown_count_[level]; + + double stall_us = level == 0 ? 
(stall_micros_[LEVEL0_SLOWDOWN] + + stall_micros_[LEVEL0_NUM_FILES] + + stall_micros_[MEMTABLE_COMPACTION]) + : stall_leveln_slowdown_[level]; + + snprintf(buf, sizeof(buf), + "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f " + "%10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %8d %9.1f %9.1f " + "%9lu\n", + level, files, current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / + version_set->MaxBytesForLevel(level), + compaction_stats_[level].micros / 1e6, + bytes_read / 1048576.0, + compaction_stats_[level].bytes_written / 1048576.0, + compaction_stats_[level].bytes_readn / 1048576.0, + compaction_stats_[level].bytes_readnp1 / 1048576.0, + bytes_new / 1048576.0, amplify, + // +1 to avoid division by 0 + (bytes_read / 1048576.0) / + ((compaction_stats_[level].micros + 1) / 1000000.0), + (compaction_stats_[level].bytes_written / 1048576.0) / + ((compaction_stats_[level].micros + 1) / 1000000.0), + compaction_stats_[level].files_in_leveln, + compaction_stats_[level].files_in_levelnp1, + compaction_stats_[level].files_out_levelnp1, + compaction_stats_[level].files_out_levelnp1 - + compaction_stats_[level].files_in_levelnp1, + compaction_stats_[level].count, + (int)((double)compaction_stats_[level].micros / 1000.0 / + (compaction_stats_[level].count + 1)), + (double)stall_us / 1000.0 / (stalls + 1), + stall_us / 1000000.0, (unsigned long)stalls); + total_slowdown += stall_leveln_slowdown_[level]; + total_slowdown_count += stall_leveln_slowdown_count_[level]; + value->append(buf); + } + } + + interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_; + interval_bytes_read = + total_bytes_read - last_stats_.compaction_bytes_read_; + interval_bytes_written = + total_bytes_written - last_stats_.compaction_bytes_written_; + interval_seconds_up = seconds_up - last_stats_.seconds_up_; + + snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", + seconds_up, interval_seconds_up); + value->append(buf); + + snprintf(buf, 
sizeof(buf), + "Writes cumulative: %llu total, %llu batches, " + "%.1f per batch, %.2f ingest GB\n", + (unsigned long long)(write_other + write_self), + (unsigned long long)write_self, + (write_other + write_self) / (double)(write_self + 1), + user_bytes_written / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "WAL cumulative: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f GB written\n", + (unsigned long long)write_with_wal, + (unsigned long long)wal_synced, + write_with_wal / (double)(wal_synced + 1), + wal_bytes / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO cumulative (GB): " + "%.2f new, %.2f read, %.2f write, %.2f read+write\n", + user_bytes_written / (1048576.0 * 1024), + total_bytes_read / (1048576.0 * 1024), + total_bytes_written / (1048576.0 * 1024), + (total_bytes_read + total_bytes_written) / (1048576.0 * 1024)); + value->append(buf); + + snprintf( + buf, sizeof(buf), + "Compaction IO cumulative (MB/sec): " + "%.1f new, %.1f read, %.1f write, %.1f read+write\n", + user_bytes_written / 1048576.0 / seconds_up, + total_bytes_read / 1048576.0 / seconds_up, + total_bytes_written / 1048576.0 / seconds_up, + (total_bytes_read + total_bytes_written) / 1048576.0 / seconds_up); + value->append(buf); + + // +1 to avoid divide by 0 and NaN + snprintf( + buf, sizeof(buf), + "Amplification cumulative: %.1f write, %.1f compaction\n", + (double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1), + (double)(total_bytes_written + total_bytes_read + wal_bytes) / + (user_bytes_written + 1)); + value->append(buf); + + uint64_t interval_write_other = write_other - last_stats_.write_other_; + uint64_t interval_write_self = write_self - last_stats_.write_self_; + + snprintf(buf, sizeof(buf), + "Writes interval: %llu total, %llu batches, " + "%.1f per batch, %.1f ingest MB\n", + (unsigned long long)(interval_write_other + interval_write_self), + (unsigned long 
long)interval_write_self, + (double)(interval_write_other + interval_write_self) / + (interval_write_self + 1), + (user_bytes_written - last_stats_.ingest_bytes_) / 1048576.0); + value->append(buf); + + uint64_t interval_write_with_wal = + write_with_wal - last_stats_.write_with_wal_; + + uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_; + uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_; + + snprintf(buf, sizeof(buf), + "WAL interval: %llu WAL writes, %llu WAL syncs, " + "%.2f writes per sync, %.2f MB written\n", + (unsigned long long)interval_write_with_wal, + (unsigned long long)interval_wal_synced, + interval_write_with_wal / (double)(interval_wal_synced + 1), + interval_wal_bytes / (1048576.0 * 1024)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO interval (MB): " + "%.2f new, %.2f read, %.2f write, %.2f read+write\n", + interval_bytes_new / 1048576.0, interval_bytes_read / 1048576.0, + interval_bytes_written / 1048576.0, + (interval_bytes_read + interval_bytes_written) / 1048576.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Compaction IO interval (MB/sec): " + "%.1f new, %.1f read, %.1f write, %.1f read+write\n", + interval_bytes_new / 1048576.0 / interval_seconds_up, + interval_bytes_read / 1048576.0 / interval_seconds_up, + interval_bytes_written / 1048576.0 / interval_seconds_up, + (interval_bytes_read + interval_bytes_written) / 1048576.0 / + interval_seconds_up); + value->append(buf); + + // +1 to avoid divide by 0 and NaN + snprintf( + buf, sizeof(buf), + "Amplification interval: %.1f write, %.1f compaction\n", + (double)(interval_bytes_written + wal_bytes) / + (interval_bytes_new + 1), + (double)(interval_bytes_written + interval_bytes_read + wal_bytes) / + (interval_bytes_new + 1)); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, " + "%.3f memtable_compaction, %.3f leveln_slowdown\n", + 
stall_micros_[LEVEL0_SLOWDOWN] / 1000000.0, + stall_micros_[LEVEL0_NUM_FILES] / 1000000.0, + stall_micros_[MEMTABLE_COMPACTION] / 1000000.0, + total_slowdown / 1000000.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Stalls(count): %lu level0_slowdown, %lu level0_numfiles, " + "%lu memtable_compaction, %lu leveln_slowdown\n", + (unsigned long)stall_counts_[LEVEL0_SLOWDOWN], + (unsigned long)stall_counts_[LEVEL0_NUM_FILES], + (unsigned long)stall_counts_[MEMTABLE_COMPACTION], + (unsigned long)total_slowdown_count); + value->append(buf); + + last_stats_.compaction_bytes_read_ = total_bytes_read; + last_stats_.compaction_bytes_written_ = total_bytes_written; + last_stats_.ingest_bytes_ = user_bytes_written; + last_stats_.seconds_up_ = seconds_up; + last_stats_.wal_bytes_ = wal_bytes; + last_stats_.wal_synced_ = wal_synced; + last_stats_.write_with_wal_ = write_with_wal; + last_stats_.write_other_ = write_other; + last_stats_.write_self_ = write_self; + + return true; + } + case kSsTables: + *value = current->DebugString(); + return true; + case kNumImmutableMemTable: + *value = std::to_string(imm.size()); + return true; + case kMemtableFlushPending: + // Return number of mem tables that are ready to flush (made immutable) + *value = std::to_string(imm.IsFlushPending() ? 1 : 0); + return true; + case kCompactionPending: + // 1 if the system already determines at least one compacdtion is needed. + // 0 otherwise, + *value = std::to_string(current->NeedsCompaction() ? 1 : 0); + return true; + case kBackgroundErrors: + // Accumulated number of errors in background flushes or compactions. 
+ *value = std::to_string(GetBackgroundErrorCount()); + return true; + case kCurSizeActiveMemTable: + // Current size of the active memtable + *value = std::to_string(db->mem_->ApproximateMemoryUsage()); + return true; + default: + return false; + } +} + +} // namespace rocksdb diff --git a/db/internal_stats.h b/db/internal_stats.h new file mode 100644 index 00000000..e140e728 --- /dev/null +++ b/db/internal_stats.h @@ -0,0 +1,182 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// + +#pragma once +#include "rocksdb/statistics.h" +#include "util/statistics.h" +#include "db/version_set.h" + +#include +#include + +namespace rocksdb { + +class MemTableList; +class DBImpl; + +enum DBPropertyType { + kNumFilesAtLevel, // Number of files at a specific level + kLevelStats, // Return number of files and total sizes of each level + kStats, // Return general statitistics of DB + kSsTables, // Return a human readable string of current SST files + kNumImmutableMemTable, // Return number of immutable mem tables + kMemtableFlushPending, // Return 1 if mem table flushing is pending, + // otherwise + // 0. + kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. + kBackgroundErrors, // Return accumulated background errors encountered. 
+ kCurSizeActiveMemTable, // Return current size of the active memtable + kUnknown, +}; + +extern DBPropertyType GetPropertyType(const Slice& property); + +class InternalStats { + public: + enum WriteStallType { + LEVEL0_SLOWDOWN, + MEMTABLE_COMPACTION, + LEVEL0_NUM_FILES, + WRITE_STALLS_ENUM_MAX, + }; + + InternalStats(int num_levels, Env* env, Statistics* statistics) + : compaction_stats_(num_levels), + stall_micros_(WRITE_STALLS_ENUM_MAX, 0), + stall_counts_(WRITE_STALLS_ENUM_MAX, 0), + stall_leveln_slowdown_(num_levels, 0), + stall_leveln_slowdown_count_(num_levels, 0), + bg_error_count_(0), + number_levels_(num_levels), + statistics_(statistics), + env_(env), + started_at_(env->NowMicros()) {} + + // Per level compaction stats. compaction_stats_[level] stores the stats for + // compactions that produced data for the specified "level". + struct CompactionStats { + uint64_t micros; + + // Bytes read from level N during compaction between levels N and N+1 + int64_t bytes_readn; + + // Bytes read from level N+1 during compaction between levels N and N+1 + int64_t bytes_readnp1; + + // Total bytes written during compaction between levels N and N+1 + int64_t bytes_written; + + // Files read from level N during compaction between levels N and N+1 + int files_in_leveln; + + // Files read from level N+1 during compaction between levels N and N+1 + int files_in_levelnp1; + + // Files written during compaction between levels N and N+1 + int files_out_levelnp1; + + // Number of compactions done + int count; + + CompactionStats() + : micros(0), + bytes_readn(0), + bytes_readnp1(0), + bytes_written(0), + files_in_leveln(0), + files_in_levelnp1(0), + files_out_levelnp1(0), + count(0) {} + + void Add(const CompactionStats& c) { + this->micros += c.micros; + this->bytes_readn += c.bytes_readn; + this->bytes_readnp1 += c.bytes_readnp1; + this->bytes_written += c.bytes_written; + this->files_in_leveln += c.files_in_leveln; + this->files_in_levelnp1 += c.files_in_levelnp1; + 
this->files_out_levelnp1 += c.files_out_levelnp1; + this->count += 1; + } + }; + + void AddCompactionStats(int level, const CompactionStats& stats) { + compaction_stats_[level].Add(stats); + } + + void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) { + stall_micros_[write_stall_type] += micros; + stall_counts_[write_stall_type]++; + } + + void RecordLevelNSlowdown(int level, uint64_t micros) { + stall_leveln_slowdown_[level] += micros; + stall_leveln_slowdown_count_[level] += micros; + } + + uint64_t GetBackgroundErrorCount() const { return bg_error_count_; } + + uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } + + bool GetProperty(DBPropertyType property_type, const Slice& property, + std::string* value, DBImpl* db); + + private: + std::vector compaction_stats_; + + // Used to compute per-interval statistics + struct StatsSnapshot { + uint64_t compaction_bytes_read_; // Bytes read by compaction + uint64_t compaction_bytes_written_; // Bytes written by compaction + uint64_t ingest_bytes_; // Bytes written by user + uint64_t wal_bytes_; // Bytes written to WAL + uint64_t wal_synced_; // Number of times WAL is synced + uint64_t write_with_wal_; // Number of writes that request WAL + // These count the number of writes processed by the calling thread or + // another thread. + uint64_t write_other_; + uint64_t write_self_; + double seconds_up_; + + StatsSnapshot() + : compaction_bytes_read_(0), + compaction_bytes_written_(0), + ingest_bytes_(0), + wal_bytes_(0), + wal_synced_(0), + write_with_wal_(0), + write_other_(0), + write_self_(0), + seconds_up_(0) {} + }; + + // Counters from the previous time per-interval stats were computed + StatsSnapshot last_stats_; + + // These count the number of microseconds for which MakeRoomForWrite stalls. 
+ std::vector stall_micros_; + std::vector stall_counts_; + std::vector stall_leveln_slowdown_; + std::vector stall_leveln_slowdown_count_; + + // Total number of background errors encountered. Every time a flush task + // or compaction task fails, this counter is incremented. The failure can + // be caused by any possible reason, including file system errors, out of + // resources, or input file corruption. Failing when retrying the same flush + // or compaction will cause the counter to increase too. + uint64_t bg_error_count_; + + int number_levels_; + Statistics* statistics_; + Env* env_; + uint64_t started_at_; +}; + +} // namespace rocksdb diff --git a/db/log_format.h b/db/log_format.h new file mode 100644 index 00000000..919c087e --- /dev/null +++ b/db/log_format.h @@ -0,0 +1,35 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Log format information shared by reader and writer. +// See ../doc/log_format.txt for more detail. + +#pragma once +namespace rocksdb { +namespace log { + +enum RecordType { + // Zero is reserved for preallocated files + kZeroType = 0, + kFullType = 1, + + // For fragments + kFirstType = 2, + kMiddleType = 3, + kLastType = 4 +}; +static const int kMaxRecordType = kLastType; + +static const unsigned int kBlockSize = 32768; + +// Header is checksum (4 bytes), type (1 byte), length (2 bytes). 
+static const int kHeaderSize = 4 + 1 + 2; + +} // namespace log +} // namespace rocksdb diff --git a/db/log_reader.cc b/db/log_reader.cc new file mode 100644 index 00000000..be1fb8ce --- /dev/null +++ b/db/log_reader.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_reader.h" + +#include +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace rocksdb { +namespace log { + +Reader::Reporter::~Reporter() { +} + +Reader::Reader(unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t initial_offset) + : file_(std::move(file)), + reporter_(reporter), + checksum_(checksum), + backing_store_(new char[kBlockSize]), + buffer_(), + eof_(false), + read_error_(false), + eof_offset_(0), + last_record_offset_(0), + end_of_buffer_offset_(0), + initial_offset_(initial_offset) { +} + +Reader::~Reader() { + delete[] backing_store_; +} + +bool Reader::SkipToInitialBlock() { + size_t offset_in_block = initial_offset_ % kBlockSize; + uint64_t block_start_location = initial_offset_ - offset_in_block; + + // Don't search a block if we'd be in the trailer + if (offset_in_block > kBlockSize - 6) { + offset_in_block = 0; + block_start_location += kBlockSize; + } + + end_of_buffer_offset_ = block_start_location; + + // Skip to start of first block that can contain the initial record + if (block_start_location > 0) { + Status skip_status = file_->Skip(block_start_location); + if (!skip_status.ok()) { + ReportDrop(block_start_location, skip_status); + 
return false; + } + } + + return true; +} + +bool Reader::ReadRecord(Slice* record, std::string* scratch) { + if (last_record_offset_ < initial_offset_) { + if (!SkipToInitialBlock()) { + return false; + } + } + + scratch->clear(); + record->clear(); + bool in_fragmented_record = false; + // Record offset of the logical record that we're reading + // 0 is a dummy value to make compilers happy + uint64_t prospective_record_offset = 0; + + Slice fragment; + while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + const unsigned int record_type = ReadPhysicalRecord(&fragment); + switch (record_type) { + case kFullType: + if (in_fragmented_record) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(1)"); + } + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + *record = fragment; + last_record_offset_ = prospective_record_offset; + return true; + + case kFirstType: + if (in_fragmented_record) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. 
+ if (scratch->empty()) { + in_fragmented_record = false; + } else { + ReportCorruption(scratch->size(), "partial record without end(2)"); + } + } + prospective_record_offset = physical_record_offset; + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + return true; + } + break; + + case kEof: + if (in_fragmented_record) { + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. + scratch->clear(); + } + return false; + + case kBadRecord: + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", record_type); + ReportCorruption( + (fragment.size() + (in_fragmented_record ? 
scratch->size() : 0)), + buf); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + } + return false; +} + +uint64_t Reader::LastRecordOffset() { + return last_record_offset_; +} + +void Reader::UnmarkEOF() { + if (read_error_) { + return; + } + + eof_ = false; + + if (eof_offset_ == 0) { + return; + } + + // If the EOF was in the middle of a block (a partial block was read) we have + // to read the rest of the block as ReadPhysicalRecord can only read full + // blocks and expects the file position indicator to be aligned to the start + // of a block. + // + // consumed_bytes + buffer_size() + remaining == kBlockSize + + size_t consumed_bytes = eof_offset_ - buffer_.size(); + size_t remaining = kBlockSize - eof_offset_; + + // backing_store_ is used to concatenate what is left in buffer_ and + // the remainder of the block. If buffer_ already uses backing_store_, + // we just append the new data. + if (buffer_.data() != backing_store_ + consumed_bytes) { + // Buffer_ does not use backing_store_ for storage. + // Copy what is left in buffer_ to backing_store. 
+ memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size()); + } + + Slice read_buffer; + Status status = file_->Read(remaining, &read_buffer, + backing_store_ + eof_offset_); + + size_t added = read_buffer.size(); + end_of_buffer_offset_ += added; + + if (!status.ok()) { + if (added > 0) { + ReportDrop(added, status); + } + + read_error_ = true; + return; + } + + if (read_buffer.data() != backing_store_ + eof_offset_) { + // Read did not write to backing_store_ + memmove(backing_store_ + eof_offset_, read_buffer.data(), + read_buffer.size()); + } + + buffer_ = Slice(backing_store_ + consumed_bytes, + eof_offset_ + added - consumed_bytes); + + if (added < remaining) { + eof_ = true; + eof_offset_ += added; + } else { + eof_offset_ = 0; + } +} + +void Reader::ReportCorruption(size_t bytes, const char* reason) { + ReportDrop(bytes, Status::Corruption(reason)); +} + +void Reader::ReportDrop(size_t bytes, const Status& reason) { + if (reporter_ != nullptr && + end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { + reporter_->Corruption(bytes, reason); + } +} + +unsigned int Reader::ReadPhysicalRecord(Slice* result) { + while (true) { + if (buffer_.size() < (size_t)kHeaderSize) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + read_error_ = true; + return kEof; + } else if (buffer_.size() < (size_t)kBlockSize) { + eof_ = true; + eof_offset_ = buffer_.size(); + } + continue; + } else { + // Note that if buffer_ is non-empty, we have a truncated header at the + // end of the file, which can be caused by the writer crashing in the + // middle of writing the header. Instead of considering this an error, + // just report EOF. 
+ buffer_.clear(); + return kEof; + } + } + + // Parse the header + const char* header = buffer_.data(); + const uint32_t a = static_cast(header[4]) & 0xff; + const uint32_t b = static_cast(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + if (kHeaderSize + length > buffer_.size()) { + size_t drop_size = buffer_.size(); + buffer_.clear(); + if (!eof_) { + ReportCorruption(drop_size, "bad record length"); + return kBadRecord; + } + // If the end of the file has been reached without reading |length| bytes + // of payload, assume the writer died in the middle of writing the record. + // Don't report a corruption. + return kEof; + } + + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. + // NOTE: this should never happen in DB written by new RocksDB versions, + // since we turn off mmap writes to manifest and log files + buffer_.clear(); + return kBadRecord; + } + + // Check crc + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. 
+ size_t drop_size = buffer_.size(); + buffer_.clear(); + ReportCorruption(drop_size, "checksum mismatch"); + return kBadRecord; + } + } + + buffer_.remove_prefix(kHeaderSize + length); + + // Skip physical record that started before initial_offset_ + if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < + initial_offset_) { + result->clear(); + return kBadRecord; + } + + *result = Slice(header + kHeaderSize, length); + return type; + } +} + +} // namespace log +} // namespace rocksdb diff --git a/db/log_reader.h b/db/log_reader.h new file mode 100644 index 00000000..81d334da --- /dev/null +++ b/db/log_reader.h @@ -0,0 +1,130 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "db/log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class SequentialFile; +using std::unique_ptr; + +namespace log { + +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. "size" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-nullptr, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. 
+ // + // If "checksum" is true, verify checksums if available. + // + // The Reader will start reading at the first record located at physical + // position >= initial_offset within the file. + Reader(unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t initial_offset); + + ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + bool ReadRecord(Slice* record, std::string* scratch); + + // Returns the physical offset of the last record returned by ReadRecord. + // + // Undefined before the first call to ReadRecord. + uint64_t LastRecordOffset(); + + // returns true if the reader has encountered an eof condition. + bool IsEOF() { + return eof_; + } + + // when we know more data has been written to the file. we can use this + // function to force the reader to look again in the file. + // Also aligns the file position indicator to the start of the next block + // by reading the rest of the data from the EOF position to the end of the + // block that was partially read. + void UnmarkEOF(); + + SequentialFile* file() { return file_.get(); } + + private: + const unique_ptr file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + bool read_error_; // Error occurred while reading from file + + // Offset of the file position indicator within the last block when an + // EOF was detected. + size_t eof_offset_; + + // Offset of the last record returned by ReadRecord. + uint64_t last_record_offset_; + // Offset of the first location past the end of buffer_. 
+ uint64_t end_of_buffer_offset_; + + // Offset at which to start looking for the first record to return + uint64_t const initial_offset_; + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + // Returned whenever we find an invalid physical record. + // Currently there are three situations in which this happens: + // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) + // * The record is a 0-length record (No drop is reported) + // * The record is below constructor's initial_offset (No drop is reported) + kBadRecord = kMaxRecordType + 2 + }; + + // Skips all blocks that are completely before "initial_offset_". + // + // Returns true on success. Handles reporting. + bool SkipToInitialBlock(); + + // Return type, or one of the preceding special values + unsigned int ReadPhysicalRecord(Slice* result); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); + + // No copying allowed + Reader(const Reader&); + void operator=(const Reader&); +}; + +} // namespace log +} // namespace rocksdb diff --git a/db/log_test.cc b/db/log_test.cc new file mode 100644 index 00000000..6577a6a9 --- /dev/null +++ b/db/log_test.cc @@ -0,0 +1,689 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { +namespace log { + +// Construct a string of the specified length made out of the supplied +// partial string. +static std::string BigString(const std::string& partial_string, size_t n) { + std::string result; + while (result.size() < n) { + result.append(partial_string); + } + result.resize(n); + return result; +} + +// Construct a string from a number +static std::string NumberString(int n) { + char buf[50]; + snprintf(buf, sizeof(buf), "%d.", n); + return std::string(buf); +} + +// Return a skewed potentially long string +static std::string RandomSkewedString(int i, Random* rnd) { + return BigString(NumberString(i), rnd->Skewed(17)); +} + +class LogTest { + private: + class StringDest : public WritableFile { + public: + std::string contents_; + + explicit StringDest(Slice& reader_contents) : + WritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { + reader_contents_ = Slice(contents_.data(), 0); + }; + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { + ASSERT_TRUE(reader_contents_.size() <= last_flush_); + size_t offset = last_flush_ - reader_contents_.size(); + reader_contents_ = Slice( + contents_.data() + offset, + contents_.size() - offset); + last_flush_ = contents_.size(); + + return Status::OK(); + } + virtual Status Sync() { return Status::OK(); } + virtual Status Append(const Slice& slice) { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + void Drop(size_t bytes) { + contents_.resize(contents_.size() - bytes); + reader_contents_ = Slice( + reader_contents_.data(), reader_contents_.size() - bytes); + last_flush_ = contents_.size(); + } + + private: + Slice& reader_contents_; + size_t last_flush_; + }; + + class StringSource : public SequentialFile { + 
public: + Slice& contents_; + bool force_error_; + size_t force_error_position_; + bool force_eof_; + size_t force_eof_position_; + bool returned_partial_; + explicit StringSource(Slice& contents) : + contents_(contents), + force_error_(false), + force_error_position_(0), + force_eof_(false), + force_eof_position_(0), + returned_partial_(false) { } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; + + if (force_error_) { + if (force_error_position_ >= n) { + force_error_position_ -= n; + } else { + *result = Slice(contents_.data(), force_error_position_); + contents_.remove_prefix(force_error_position_); + force_error_ = false; + returned_partial_ = true; + return Status::Corruption("read error"); + } + } + + if (contents_.size() < n) { + n = contents_.size(); + returned_partial_ = true; + } + + if (force_eof_) { + if (force_eof_position_ >= n) { + force_eof_position_ -= n; + } else { + force_eof_ = false; + n = force_eof_position_; + returned_partial_ = true; + } + } + + // By using scratch we ensure that caller has control over the + // lifetime of result.data() + memcpy(scratch, contents_.data(), n); + *result = Slice(scratch, n); + + contents_.remove_prefix(n); + return Status::OK(); + } + + virtual Status Skip(uint64_t n) { + if (n > contents_.size()) { + contents_.clear(); + return Status::NotFound("in-memory file skipepd past end"); + } + + contents_.remove_prefix(n); + + return Status::OK(); + } + }; + + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) { } + virtual void Corruption(size_t bytes, const Status& status) { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + std::string& dest_contents() { + auto dest = dynamic_cast(writer_.file()); + assert(dest); + return dest->contents_; + } + + const std::string& dest_contents() const { + auto 
dest = dynamic_cast(writer_.file()); + assert(dest); + return dest->contents_; + } + + void reset_source_contents() { + auto src = dynamic_cast(reader_.file()); + assert(src); + src->contents_ = dest_contents(); + } + + Slice reader_contents_; + unique_ptr dest_holder_; + unique_ptr source_holder_; + ReportCollector report_; + Writer writer_; + Reader reader_; + + // Record metadata for testing initial offset functionality + static size_t initial_offset_record_sizes_[]; + static uint64_t initial_offset_last_record_offsets_[]; + + public: + LogTest() : reader_contents_(), + dest_holder_(new StringDest(reader_contents_)), + source_holder_(new StringSource(reader_contents_)), + writer_(std::move(dest_holder_)), + reader_(std::move(source_holder_), &report_, true/*checksum*/, + 0/*initial_offset*/) { + } + + void Write(const std::string& msg) { + writer_.AddRecord(Slice(msg)); + } + + size_t WrittenBytes() const { + return dest_contents().size(); + } + + std::string Read() { + std::string scratch; + Slice record; + if (reader_.ReadRecord(&record, &scratch)) { + return record.ToString(); + } else { + return "EOF"; + } + } + + void IncrementByte(int offset, int delta) { + dest_contents()[offset] += delta; + } + + void SetByte(int offset, char new_byte) { + dest_contents()[offset] = new_byte; + } + + void ShrinkSize(int bytes) { + auto dest = dynamic_cast(writer_.file()); + assert(dest); + dest->Drop(bytes); + } + + void FixChecksum(int header_offset, int len) { + // Compute crc of type/len/data + uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_contents()[header_offset], crc); + } + + void ForceError(size_t position = 0) { + auto src = dynamic_cast(reader_.file()); + src->force_error_ = true; + src->force_error_position_ = position; + } + + size_t DroppedBytes() const { + return report_.dropped_bytes_; + } + + std::string ReportMessage() const { + return report_.message_; + } + + void 
ForceEOF(size_t position = 0) { + auto src = dynamic_cast(reader_.file()); + src->force_eof_ = true; + src->force_eof_position_ = position; + } + + void UnmarkEOF() { + auto src = dynamic_cast(reader_.file()); + src->returned_partial_ = false; + reader_.UnmarkEOF(); + } + + bool IsEOF() { + return reader_.IsEOF(); + } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } + + void WriteInitialOffsetLog() { + for (int i = 0; i < 4; i++) { + std::string record(initial_offset_record_sizes_[i], + static_cast('a' + i)); + Write(record); + } + } + + void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { + WriteInitialOffsetLog(); + unique_ptr source(new StringSource(reader_contents_)); + unique_ptr offset_reader( + new Reader(std::move(source), &report_, true/*checksum*/, + WrittenBytes() + offset_past_end)); + Slice record; + std::string scratch; + ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); + } + + void CheckInitialOffsetRecord(uint64_t initial_offset, + int expected_record_offset) { + WriteInitialOffsetLog(); + unique_ptr source(new StringSource(reader_contents_)); + unique_ptr offset_reader( + new Reader(std::move(source), &report_, true/*checksum*/, + initial_offset)); + Slice record; + std::string scratch; + ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); + ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], + record.size()); + ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], + offset_reader->LastRecordOffset()); + ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); + } + +}; + +size_t LogTest::initial_offset_record_sizes_[] = + {10000, // Two sizable records in first block + 10000, + 2 * log::kBlockSize - 1000, // Span three blocks + 1}; + +uint64_t LogTest::initial_offset_last_record_offsets_[] = + 
{0, + kHeaderSize + 10000, + 2 * (kHeaderSize + 10000), + 2 * (kHeaderSize + 10000) + + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; + + +TEST(LogTest, Empty) { + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, MarginalTrailer2) { + // Make a trailer that is exactly the same length as an empty record. 
+ const int n = kBlockSize - 2*kHeaderSize; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST(LogTest, ShortTrailer) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, AlignedEof) { + const int n = kBlockSize - 2*kHeaderSize + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST(LogTest, TruncatedTrailingRecordIsIgnored) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + // Truncated last record is ignored, not treated as an error + ASSERT_EQ(0U, DroppedBytes()); + 
ASSERT_EQ("", ReportMessage()); +} + +TEST(LogTest, BadLength) { + const int kPayloadSize = kBlockSize - kHeaderSize; + Write(BigString("bar", kPayloadSize)); + Write("foo"); + // Least significant size byte is stored in header[4]. + IncrementByte(4, 1); + ASSERT_EQ("foo", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); +} + +TEST(LogTest, BadLengthAtEndIsIgnored) { + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST(LogTest, ChecksumMismatch) { + Write("foo"); + IncrementByte(0, 10); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(10U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); +} + +TEST(LogTest, UnexpectedMiddleType) { + Write("foo"); + SetByte(6, kMiddleType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedLastType) { + Write("foo"); + SetByte(6, kLastType); + FixChecksum(0, 3); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + SetByte(6, kFirstType); + FixChecksum(0, 3); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST(LogTest, MissingLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Remove the LAST block, including header. 
+ ShrinkSize(14); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST(LogTest, PartialLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Cause a bad record length in the LAST block. + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. + + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + const unsigned int dropped = DroppedBytes(); + ASSERT_LE(dropped, 2*kBlockSize + 100); + ASSERT_GE(dropped, 2*kBlockSize); +} + +TEST(LogTest, ReadStart) { + CheckInitialOffsetRecord(0, 0); +} + +TEST(LogTest, ReadSecondOneOff) { + CheckInitialOffsetRecord(1, 1); +} + +TEST(LogTest, ReadSecondTenThousand) { + CheckInitialOffsetRecord(10000, 1); +} + +TEST(LogTest, ReadSecondStart) { + CheckInitialOffsetRecord(10007, 1); +} + +TEST(LogTest, ReadThirdOneOff) { + CheckInitialOffsetRecord(10008, 2); +} + +TEST(LogTest, ReadThirdStart) { + CheckInitialOffsetRecord(20014, 2); +} + +TEST(LogTest, ReadFourthOneOff) { + CheckInitialOffsetRecord(20015, 3); +} + +TEST(LogTest, ReadFourthFirstBlockTrailer) { + CheckInitialOffsetRecord(log::kBlockSize - 4, 3); +} + +TEST(LogTest, ReadFourthMiddleBlock) { + CheckInitialOffsetRecord(log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthLastBlock) { + CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); +} + +TEST(LogTest, ReadFourthStart) { + CheckInitialOffsetRecord( + 2 * (kHeaderSize + 1000) + 
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 3); +} + +TEST(LogTest, ReadEnd) { + CheckOffsetPastEndReturnsNoRecords(0); +} + +TEST(LogTest, ReadPastEnd) { + CheckOffsetPastEndReturnsNoRecords(5); +} + +TEST(LogTest, ClearEofSingleBlock) { + Write("foo"); + Write("bar"); + ForceEOF(3 + kHeaderSize + 2); + ASSERT_EQ("foo", Read()); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_TRUE(IsEOF()); + ASSERT_EQ("EOF", Read()); + Write("xxx"); + UnmarkEOF(); + ASSERT_EQ("xxx", Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST(LogTest, ClearEofMultiBlock) { + size_t num_full_blocks = 5; + size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25; + Write(BigString("foo", n)); + Write(BigString("bar", n)); + ForceEOF(n + num_full_blocks * kHeaderSize + 10); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_TRUE(IsEOF()); + UnmarkEOF(); + ASSERT_EQ(BigString("bar", n), Read()); + ASSERT_TRUE(IsEOF()); + Write(BigString("xxx", n)); + UnmarkEOF(); + ASSERT_EQ(BigString("xxx", n), Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST(LogTest, ClearEofError) { + // If an error occurs during Read() in UnmarkEOF(), the records contained + // in the buffer should be returned on subsequent calls of ReadRecord() + // until no more full records are left, whereafter ReadRecord() should return + // false to indicate that it cannot read any further. 
+ + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + ASSERT_TRUE(IsEOF()); + Write("xxx"); + ForceError(0); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST(LogTest, ClearEofError2) { + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + Write("xxx"); + ForceError(3); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +} // namespace log +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/log_writer.cc b/db/log_writer.cc new file mode 100644 index 00000000..df601a47 --- /dev/null +++ b/db/log_writer.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/log_writer.h" + +#include +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace rocksdb { +namespace log { + +Writer::Writer(unique_ptr&& dest) + : dest_(std::move(dest)), + block_offset_(0) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { +} + +Status Writer::AddRecord(const Slice& slice) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Fragment the record if necessary and emit it. 
Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + Status s; + bool begin = true; + do { + const int leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < kHeaderSize) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize being 7) + assert(kHeaderSize == 7); + dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + } + block_offset_ = 0; + } + + // Invariant: we never leave < kHeaderSize bytes in a block. + assert(kBlockSize - block_offset_ - kHeaderSize >= 0); + + const size_t avail = kBlockSize - block_offset_ - kHeaderSize; + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool end = (left == fragment_length); + if (begin && end) { + type = kFullType; + } else if (begin) { + type = kFirstType; + } else if (end) { + type = kLastType; + } else { + type = kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && left > 0); + return s; +} + +Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { + assert(n <= 0xffff); // Must fit in two bytes + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + + // Format the header + char buf[kHeaderSize]; + buf[4] = static_cast(n & 0xff); + buf[5] = static_cast(n >> 8); + buf[6] = static_cast(t); + + // Compute the crc of the record type and the payload. 
+ uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); + crc = crc32c::Mask(crc); // Adjust for storage + EncodeFixed32(buf, crc); + + // Write the header and the payload + Status s = dest_->Append(Slice(buf, kHeaderSize)); + if (s.ok()) { + s = dest_->Append(Slice(ptr, n)); + if (s.ok()) { + s = dest_->Flush(); + } + } + block_offset_ += kHeaderSize + n; + return s; +} + +} // namespace log +} // namespace rocksdb diff --git a/db/log_writer.h b/db/log_writer.h new file mode 100644 index 00000000..d7b7afff --- /dev/null +++ b/db/log_writer.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "db/log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class WritableFile; + +using std::unique_ptr; + +namespace log { + +class Writer { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this Writer is in use. + explicit Writer(unique_ptr&& dest); + ~Writer(); + + Status AddRecord(const Slice& slice); + + WritableFile* file() { return dest_.get(); } + const WritableFile* file() const { return dest_.get(); } + + private: + unique_ptr dest_; + int block_offset_; // Current offset in block + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. 
+ uint32_t type_crc_[kMaxRecordType + 1]; + + Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + + // No copying allowed + Writer(const Writer&); + void operator=(const Writer&); +}; + +} // namespace log +} // namespace rocksdb diff --git a/db/memtable.cc b/db/memtable.cc new file mode 100644 index 00000000..0d949cb5 --- /dev/null +++ b/db/memtable.cc @@ -0,0 +1,591 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/memtable.h" + +#include +#include + +#include "db/dbformat.h" +#include "db/merge_context.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice_transform.h" +#include "util/arena.h" +#include "util/coding.h" +#include "util/murmurhash.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/statistics.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) + : comparator_(cmp), + refs_(0), + kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), + kWriteBufferSize(options.write_buffer_size), + arena_(options.arena_block_size), + table_(options.memtable_factory->CreateMemTableRep( + comparator_, &arena_, options.prefix_extractor.get())), + flush_in_progress_(false), + flush_completed_(false), + file_number_(0), + first_seqno_(0), + mem_next_logfile_number_(0), + mem_logfile_number_(0), + locks_(options.inplace_update_support ? 
options.inplace_update_num_locks + : 0), + prefix_extractor_(options.prefix_extractor.get()), + should_flush_(ShouldFlushNow()) { + // if should_flush_ == true without an entry inserted, something must have + // gone wrong already. + assert(!should_flush_); + if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits, + options.bloom_locality, + options.memtable_prefix_bloom_probes)); + } +} + +MemTable::~MemTable() { + assert(refs_ == 0); +} + +size_t MemTable::ApproximateMemoryUsage() { + return arena_.ApproximateMemoryUsage() + table_->ApproximateMemoryUsage(); +} + +bool MemTable::ShouldFlushNow() const { + // In a lot of times, we cannot allocate arena blocks that exactly matches the + // buffer size. Thus we have to decide if we should over-allocate or + // under-allocate. + // This constant avariable can be interpreted as: if we still have more than + // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over + // allocate one more block. + const double kAllowOverAllocationRatio = 0.6; + + // If arena still have room for new block allocation, we can safely say it + // shouldn't flush. + auto allocated_memory = + table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes(); + + // if we can still allocate one more block without exceeding the + // over-allocation ratio, then we should not flush. + if (allocated_memory + kArenaBlockSize < + kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + return false; + } + + // if user keeps adding entries that exceeds kWriteBufferSize, we need to + // flush earlier even though we still have much available memory left. 
+ if (allocated_memory > + kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + return true; + } + + // In this code path, Arena has already allocated its "last block", which + // means the total allocatedmemory size is either: + // (1) "moderately" over allocated the memory (no more than `0.6 * arena + // block size`. Or, + // (2) the allocated memory is less than write buffer size, but we'll stop + // here since if we allocate a new arena block, we'll over allocate too much + // more (half of the arena block size) memory. + // + // In either case, to avoid over-allocate, the last block will stop allocation + // when its usage reaches a certain ratio, which we carefully choose "0.75 + // full" as the stop condition because it addresses the following issue with + // great simplicity: What if the next inserted entry's size is + // bigger than AllocatedAndUnused()? + // + // The answer is: if the entry size is also bigger than 0.25 * + // kArenaBlockSize, a dedicated block will be allocated for it; otherwise + // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty + // and regular block. In either case, we *overly* over-allocated. + // + // Therefore, setting the last block to be at most "0.75 full" avoids both + // cases. + // + // NOTE: the average percentage of waste space of this approach can be counted + // as: "arena block size * 0.25 / write buffer size". User who specify a small + // write buffer size and/or big arena block size may suffer. + return arena_.AllocatedAndUnused() < kArenaBlockSize / 4; +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const { + // Internal keys are encoded as length-prefixed strings. 
+ Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); + Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); + return comparator.Compare(k1, k2); +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key, + const Slice& key) + const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.Compare(a, key); +} + +Slice MemTableRep::UserKey(const char* key) const { + Slice slice = GetLengthPrefixedSlice(key); + return Slice(slice.data(), slice.size() - 8); +} + +KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { + *buf = arena_->Allocate(len); + return static_cast(*buf); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. +const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, target.size()); + scratch->append(target.data(), target.size()); + return scratch->data(); +} + +class MemTableIterator: public Iterator { + public: + MemTableIterator(const MemTable& mem, const ReadOptions& options) + : bloom_(nullptr), + prefix_extractor_(mem.prefix_extractor_), + iter_(), + valid_(false) { + if (options.prefix) { + iter_.reset(mem.table_->GetPrefixIterator(*options.prefix)); + } else if (options.prefix_seek) { + bloom_ = mem.prefix_bloom_.get(); + iter_.reset(mem.table_->GetDynamicPrefixIterator()); + } else { + iter_.reset(mem.table_->GetIterator()); + } + } + + virtual bool Valid() const { return valid_; } + virtual void Seek(const Slice& k) { + if (bloom_ != nullptr && + !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) { + valid_ = false; + return; + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + valid_ = iter_->Valid(); + } + virtual void SeekToLast() { + iter_->SeekToLast(); + valid_ = iter_->Valid(); + } 
+ virtual void Next() { + assert(Valid()); + iter_->Next(); + valid_ = iter_->Valid(); + } + virtual void Prev() { + assert(Valid()); + iter_->Prev(); + valid_ = iter_->Valid(); + } + virtual Slice key() const { + assert(Valid()); + return GetLengthPrefixedSlice(iter_->key()); + } + virtual Slice value() const { + assert(Valid()); + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + virtual Status status() const { return Status::OK(); } + + private: + DynamicBloom* bloom_; + const SliceTransform* const prefix_extractor_; + std::shared_ptr iter_; + bool valid_; + + // No copying allowed + MemTableIterator(const MemTableIterator&); + void operator=(const MemTableIterator&); +}; + +Iterator* MemTable::NewIterator(const ReadOptions& options) { + return new MemTableIterator(*this, options); +} + +port::RWMutex* MemTable::GetLock(const Slice& key) { + static murmur_hash hash; + return &locks_[hash(key) % locks_.size()]; +} + +void MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + size_t key_size = key.size(); + size_t val_size = value.size(); + size_t internal_key_size = key_size + 8; + const size_t encoded_len = + VarintLength(internal_key_size) + internal_key_size + + VarintLength(val_size) + val_size; + char* buf = nullptr; + KeyHandle handle = table_->Allocate(encoded_len, &buf); + assert(buf != nullptr); + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + p += key_size; + EncodeFixed64(p, (s << 8) | type); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + table_->Insert(handle); 
+ + if (prefix_bloom_) { + assert(prefix_extractor_); + prefix_bloom_->Add(prefix_extractor_->Transform(key)); + } + + // The first sequence number inserted into the memtable + assert(first_seqno_ == 0 || s > first_seqno_); + if (first_seqno_ == 0) { + first_seqno_ = s; + } + + should_flush_ = ShouldFlushNow(); +} + +// Callback from MemTable::Get() +namespace { + +struct Saver { + Status* status; + const LookupKey* key; + bool* found_final_value; // Is value set correctly? Used by KeyMayExist + bool* merge_in_progress; + std::string* value; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + MemTable* mem; + Logger* logger; + Statistics* statistics; + bool inplace_update_support; +}; +} // namespace + +static bool SaveValue(void* arg, const char* entry) { + Saver* s = reinterpret_cast(arg); + MergeContext* merge_context = s->merge_context; + const MergeOperator* merge_operator = s->merge_operator; + + assert(s != nullptr && merge_context != nullptr); + + // entry format is: + // klength varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
+ uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (s->mem->GetInternalKeyComparator().user_comparator()->Compare( + Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + *(s->status) = Status::OK(); + if (*(s->merge_in_progress)) { + assert(merge_operator); + if (!merge_operator->FullMerge(s->key->user_key(), &v, + merge_context->GetOperands(), s->value, + s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + *(s->status) = + Status::Corruption("Error: Could not perform merge."); + } + } else { + s->value->assign(v.data(), v.size()); + } + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->Unlock(); + } + *(s->found_final_value) = true; + return false; + } + case kTypeDeletion: { + if (*(s->merge_in_progress)) { + assert(merge_operator); + *(s->status) = Status::OK(); + if (!merge_operator->FullMerge(s->key->user_key(), nullptr, + merge_context->GetOperands(), s->value, + s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + *(s->status) = + Status::Corruption("Error: Could not perform merge."); + } + } else { + *(s->status) = Status::NotFound(); + } + *(s->found_final_value) = true; + return false; + } + case kTypeMerge: { + std::string merge_result; // temporary area for merge results later + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + *(s->merge_in_progress) = true; + merge_context->PushOperand(v); + return true; + } + default: + assert(false); + return true; + } + } + + // s->state could be Corrupt, merge or notfound + return false; +} + +bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& 
options) { + StopWatchNano memtable_get_timer(options.env, false); + StartPerfTimer(&memtable_get_timer); + + Slice user_key = key.user_key(); + bool found_final_value = false; + bool merge_in_progress = s->IsMergeInProgress(); + + if (prefix_bloom_ && + !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) { + // iter is null if prefix bloom says the key does not exist + } else { + Saver saver; + saver.status = s; + saver.found_final_value = &found_final_value; + saver.merge_in_progress = &merge_in_progress; + saver.key = &key; + saver.value = value; + saver.status = s; + saver.mem = this; + saver.merge_context = &merge_context; + saver.merge_operator = options.merge_operator.get(); + saver.logger = options.info_log.get(); + saver.inplace_update_support = options.inplace_update_support; + saver.statistics = options.statistics.get(); + table_->Get(key, &saver, SaveValue); + } + + // No change to value, since we have not yet found a Put/Delete + if (!found_final_value && merge_in_progress) { + *s = Status::MergeInProgress(""); + } + BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); + BumpPerfCount(&perf_context.get_from_memtable_count); + return found_final_value; +} + +void MemTable::Update(SequenceNumber seq, + const Slice& key, + const Slice& value) { + LookupKey lkey(key, seq); + Slice mem_key = lkey.memtable_key(); + + std::unique_ptr iter( + table_->GetIterator(lkey.user_key())); + iter->Seek(lkey.internal_key(), mem_key.data()); + + if (iter->Valid()) { + // entry format is: + // key_length varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
+ const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = prev_value.size(); + uint32_t new_size = value.size(); + + // Update value, if new value size <= previous value size + if (new_size <= prev_size ) { + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + new_size); + WriteLock wl(GetLock(lkey.user_key())); + memcpy(p, value.data(), value.size()); + assert((unsigned)((p + value.size()) - entry) == + (unsigned)(VarintLength(key_length) + key_length + + VarintLength(value.size()) + value.size())); + return; + } + } + default: + // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData + // we don't have enough space for update inplace + Add(seq, kTypeValue, key, value); + return; + } + } + } + + // key doesn't exist + Add(seq, kTypeValue, key, value); +} + +bool MemTable::UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, + const Options& options) { + LookupKey lkey(key, seq); + Slice memkey = lkey.memtable_key(); + + std::shared_ptr iter( + table_->GetIterator(lkey.user_key())); + iter->Seek(lkey.internal_key(), memkey.data()); + + if (iter->Valid()) { + // entry format is: + // key_length varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. 
+ const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + switch (static_cast(tag & 0xff)) { + case kTypeValue: { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = prev_value.size(); + + char* prev_buffer = const_cast(prev_value.data()); + uint32_t new_prev_size = prev_size; + + std::string str_value; + WriteLock wl(GetLock(lkey.user_key())); + auto status = options.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // Value already updated by callback. + assert(new_prev_size <= prev_size); + if (new_prev_size < prev_size) { + // overwrite the new prev_size + char* p = EncodeVarint32(const_cast(key_ptr) + key_length, + new_prev_size); + if (VarintLength(new_prev_size) < VarintLength(prev_size)) { + // shift the value buffer as well. + memcpy(p, prev_buffer, new_prev_size); + } + } + RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); + should_flush_ = ShouldFlushNow(); + return true; + } else if (status == UpdateStatus::UPDATED) { + Add(seq, kTypeValue, key, Slice(str_value)); + RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); + should_flush_ = ShouldFlushNow(); + return true; + } else if (status == UpdateStatus::UPDATE_FAILED) { + // No action required. Return. + should_flush_ = ShouldFlushNow(); + return true; + } + } + default: + break; + } + } + } + // If the latest value is not kTypeValue + // or key doesn't exist + return false; +} + +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { + Slice memkey = key.memtable_key(); + + // A total ordered iterator is costly for some memtablerep (prefix aware + // reps). 
By passing in the user key, we allow efficient iterator creation. + // The iterator only needs to be ordered within the same user key. + std::unique_ptr iter( + table_->GetIterator(key.user_key())); + iter->Seek(key.internal_key(), memkey.data()); + + size_t num_successive_merges = 0; + + for (; iter->Valid(); iter->Next()) { + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Compare( + Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) { + break; + } + + const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8); + if (static_cast(tag & 0xff) != kTypeMerge) { + break; + } + + ++num_successive_merges; + } + + return num_successive_merges; +} + +void MemTableRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + auto iter = GetIterator(k.user_key()); + for (iter->Seek(k.internal_key(), k.memtable_key().data()); + iter->Valid() && callback_func(callback_args, iter->key()); + iter->Next()) { + } +} + +} // namespace rocksdb diff --git a/db/memtable.h b/db/memtable.h new file mode 100644 index 00000000..451def38 --- /dev/null +++ b/db/memtable.h @@ -0,0 +1,220 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include +#include +#include "db/dbformat.h" +#include "db/skiplist.h" +#include "db/version_set.h" +#include "rocksdb/db.h" +#include "rocksdb/memtablerep.h" +#include "util/arena.h" +#include "util/dynamic_bloom.h" + +namespace rocksdb { + +class Mutex; +class MemTableIterator; +class MergeContext; + +class MemTable { + public: + struct KeyComparator : public MemTableRep::KeyComparator { + const InternalKeyComparator comparator; + explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const; + virtual int operator()(const char* prefix_len_key, + const Slice& key) const override; + }; + + // MemTables are reference counted. The initial reference count + // is zero and the caller must call Ref() at least once. + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); + + ~MemTable(); + + // Increase reference count. + void Ref() { ++refs_; } + + // Drop reference count. + // If the refcount goes to zero return this memtable, otherwise return null + MemTable* Unref() { + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + return this; + } + return nullptr; + } + + // Returns an estimate of the number of bytes of data in use by this + // data structure. + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + size_t ApproximateMemoryUsage(); + + // This method heuristically determines if the memtable should continue to + // host more data. + bool ShouldFlush() const { return should_flush_; } + + // Return an iterator that yields the contents of the memtable. + // + // The caller must ensure that the underlying MemTable remains live + // while the returned iterator is live. The keys returned by this + // iterator are internal keys encoded by AppendInternalKey in the + // db/dbformat.{h,cc} module. 
+ // + // If options.prefix is supplied, it is passed to the underlying MemTableRep + // as a hint that the iterator only need to support access to keys with that + // specific prefix. + // If options.prefix is not supplied and options.prefix_seek is set, the + // iterator is not bound to a specific prefix. However, the semantics of + // Seek is changed - the result might only include keys with the same prefix + // as the seek-key. + Iterator* NewIterator(const ReadOptions& options = ReadOptions()); + + // Add an entry into memtable that maps key to value at the + // specified sequence number and with the specified type. + // Typically value will be empty if type==kTypeDeletion. + void Add(SequenceNumber seq, ValueType type, + const Slice& key, + const Slice& value); + + // If memtable contains a value for key, store it in *value and return true. + // If memtable contains a deletion for key, store a NotFound() error + // in *status and return true. + // If memtable contains Merge operation as the most recent entry for a key, + // and the merge process does not stop (not reaching a value or delete), + // prepend the current merge operand to *operands. + // store MergeInProgress in s, and return false. + // Else, return false. + bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& options); + + // Attempts to update the new_value inplace, else does normal Add + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // if new sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else add(key, new_value) + void Update(SequenceNumber seq, + const Slice& key, + const Slice& value); + + // If prev_value for key exits, attempts to update it inplace. 
+ // else returns false + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // new_value = delta(prev_value) + // if sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else return false + bool UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta, + const Options& options); + + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. + size_t CountSuccessiveMergeEntries(const LookupKey& key); + + // Returns the edits area that is needed for flushing the memtable + VersionEdit* GetEdits() { return &edit_; } + + // Returns the sequence number of the first element that was inserted + // into the memtable + SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } + + // Returns the next active logfile number when this memtable is about to + // be flushed to storage + uint64_t GetNextLogNumber() { return mem_next_logfile_number_; } + + // Sets the next active logfile number when this memtable is about to + // be flushed to storage + void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } + + // Returns the logfile number that can be safely deleted when this + // memstore is flushed to storage + uint64_t GetLogNumber() { return mem_logfile_number_; } + + // Sets the logfile number that can be safely deleted when this + // memstore is flushed to storage + void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; } + + // Notify the underlying storage that no more items will be added + void MarkImmutable() { table_->MarkReadOnly(); } + + // Get the lock associated for the key + port::RWMutex* GetLock(const Slice& key); + + const InternalKeyComparator& GetInternalKeyComparator() const { + return comparator_.comparator; + } + + const Arena& TEST_GetArena() const { return arena_; } + + private: + // Dynamically check if we can 
add more incoming entries. + bool ShouldFlushNow() const; + + friend class MemTableIterator; + friend class MemTableBackwardIterator; + friend class MemTableList; + + KeyComparator comparator_; + int refs_; + const size_t kArenaBlockSize; + const size_t kWriteBufferSize; + Arena arena_; + unique_ptr table_; + + // These are used to manage memtable flushes to storage + bool flush_in_progress_; // started the flush + bool flush_completed_; // finished the flush + uint64_t file_number_; // filled up after flush is complete + + // The updates to be applied to the transaction log when this + // memtable is flushed to storage. + VersionEdit edit_; + + // The sequence number of the kv that was inserted first + SequenceNumber first_seqno_; + + // The log files earlier than this number can be deleted. + uint64_t mem_next_logfile_number_; + + // The log file that backs this memtable (to be deleted when + // memtable flush is done) + uint64_t mem_logfile_number_; + + // rw locks for inplace updates + std::vector locks_; + + // No copying allowed + MemTable(const MemTable&); + void operator=(const MemTable&); + + const SliceTransform* const prefix_extractor_; + std::unique_ptr prefix_bloom_; + + // a flag indicating if a memtable has met the criteria to flush + bool should_flush_; +}; + +extern const char* EncodeKey(std::string* scratch, const Slice& target); + +} // namespace rocksdb diff --git a/db/memtable_list.cc b/db/memtable_list.cc new file mode 100644 index 00000000..3c502c6d --- /dev/null +++ b/db/memtable_list.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "db/memtable_list.h" + +#include +#include "rocksdb/db.h" +#include "db/memtable.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "util/coding.h" +#include "util/log_buffer.h" + +namespace rocksdb { + +class InternalKeyComparator; +class Mutex; +class VersionSet; + +MemTableListVersion::MemTableListVersion(MemTableListVersion* old) { + if (old != nullptr) { + memlist_ = old->memlist_; + size_ = old->size_; + for (auto& m : memlist_) { + m->Ref(); + } + } +} + +void MemTableListVersion::Ref() { ++refs_; } + +void MemTableListVersion::Unref(autovector* to_delete) { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + // if to_delete is equal to nullptr it means we're confident + // that refs_ will not be zero + assert(to_delete != nullptr); + for (const auto& m : memlist_) { + MemTable* x = m->Unref(); + if (x != nullptr) { + to_delete->push_back(x); + } + } + delete this; + } +} + +int MemTableListVersion::size() const { return size_; } + +// Returns the total number of memtables in the list +int MemTableList::size() const { + assert(num_flush_not_started_ <= current_->size_); + return current_->size_; +} + +// Search all the memtables starting from the most recent one. +// Return the most recent value found, if any. +// Operands stores the list of merge operations to apply, so far. 
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value, + Status* s, MergeContext& merge_context, + const Options& options) { + for (auto& memtable : memlist_) { + if (memtable->Get(key, value, s, merge_context, options)) { + return true; + } + } + return false; +} + +void MemTableListVersion::AddIterators(const ReadOptions& options, + std::vector* iterator_list) { + for (auto& m : memlist_) { + iterator_list->push_back(m->NewIterator(options)); + } +} + +// caller is responsible for referencing m +void MemTableListVersion::Add(MemTable* m) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + memlist_.push_front(m); + ++size_; +} + +// caller is responsible for unreferencing m +void MemTableListVersion::Remove(MemTable* m) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + memlist_.remove(m); + --size_; +} + +// Returns true if there is at least one memtable on which flush has +// not yet started. +bool MemTableList::IsFlushPending() const { + if ((flush_requested_ && num_flush_not_started_ >= 1) || + (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) { + assert(imm_flush_needed.NoBarrier_Load() != nullptr); + return true; + } + return false; +} + +// Returns the memtables that need to be flushed. 
+void MemTableList::PickMemtablesToFlush(autovector* ret) { + const auto& memlist = current_->memlist_; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + if (!m->flush_in_progress_) { + assert(!m->flush_completed_); + num_flush_not_started_--; + if (num_flush_not_started_ == 0) { + imm_flush_needed.Release_Store(nullptr); + } + m->flush_in_progress_ = true; // flushing will start very soon + ret->push_back(m); + } + } + flush_requested_ = false; // start-flush request is complete +} + +void MemTableList::RollbackMemtableFlush(const autovector& mems, + uint64_t file_number, std::set* pending_outputs) { + assert(!mems.empty()); + + // If the flush was not successful, then just reset state. + // Maybe a suceeding attempt to flush will be successful. + for (MemTable* m : mems) { + assert(m->flush_in_progress_); + assert(m->file_number_ == 0); + + m->flush_in_progress_ = false; + m->flush_completed_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + } + pending_outputs->erase(file_number); + imm_flush_needed.Release_Store(reinterpret_cast(1)); +} + +// Record a successful flush in the manifest file +Status MemTableList::InstallMemtableFlushResults( + const autovector& mems, VersionSet* vset, port::Mutex* mu, + Logger* info_log, uint64_t file_number, std::set& pending_outputs, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer) { + mu->AssertHeld(); + + // flush was sucessful + for (size_t i = 0; i < mems.size(); ++i) { + // All the edits are associated with the first memtable of this batch. 
+ assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0); + + mems[i]->flush_completed_ = true; + mems[i]->file_number_ = file_number; + } + + // if some other thread is already commiting, then return + Status s; + if (commit_in_progress_) { + return s; + } + + // Only a single thread can be executing this piece of code + commit_in_progress_ = true; + + // scan all memtables from the earliest, and commit those + // (in that order) that have finished flushing. Memetables + // are always committed in the order that they were created. + while (!current_->memlist_.empty() && s.ok()) { + MemTable* m = current_->memlist_.back(); // get the last element + if (!m->flush_completed_) { + break; + } + + LogToBuffer(log_buffer, "Level-0 commit table #%lu started", + (unsigned long)m->file_number_); + + // this can release and reacquire the mutex. + s = vset->LogAndApply(&m->edit_, mu, db_directory); + + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables has been flushed. + do { + if (s.ok()) { // commit new state + LogToBuffer(log_buffer, "Level-0 commit table #%lu: memtable #%lu done", + (unsigned long)m->file_number_, (unsigned long)mem_id); + current_->Remove(m); + assert(m->file_number_ > 0); + + // pending_outputs can be cleared only after the newly created file + // has been written to a committed version so that other concurrently + // executing compaction threads do not mistakenly assume that this + // file is not live. + pending_outputs.erase(m->file_number_); + if (m->Unref() != nullptr) { + to_delete->push_back(m); + } + } else { + //commit failed. setup state so that we can flush again. 
+ Log(info_log, + "Level-0 commit table #%lu: memtable #%lu failed", + (unsigned long)m->file_number_, + (unsigned long)mem_id); + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + pending_outputs.erase(m->file_number_); + m->file_number_ = 0; + imm_flush_needed.Release_Store((void *)1); + } + ++mem_id; + } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) && + m->file_number_ == file_number); + } + commit_in_progress_ = false; + return s; +} + +// New memtables are inserted at the front of the list. +void MemTableList::Add(MemTable* m) { + assert(current_->size_ >= num_flush_not_started_); + InstallNewVersion(); + // this method is used to move mutable memtable into an immutable list. + // since mutable memtable is already refcounted by the DBImpl, + // and when moving to the imutable list we don't unref it, + // we don't have to ref the memtable here. we just take over the + // reference from the DBImpl. + current_->Add(m); + m->MarkImmutable(); + num_flush_not_started_++; + if (num_flush_not_started_ == 1) { + imm_flush_needed.Release_Store((void *)1); + } +} + +// Returns an estimate of the number of bytes of data in use. +size_t MemTableList::ApproximateMemoryUsage() { + size_t size = 0; + for (auto& memtable : current_->memlist_) { + size += memtable->ApproximateMemoryUsage(); + } + return size; +} + +void MemTableList::InstallNewVersion() { + if (current_->refs_ == 1) { + // we're the only one using the version, just keep using it + } else { + // somebody else holds the current version, we need to create new one + MemTableListVersion* version = current_; + current_ = new MemTableListVersion(current_); + current_->Ref(); + version->Unref(); + } +} + +} // namespace rocksdb diff --git a/db/memtable_list.h b/db/memtable_list.h new file mode 100644 index 00000000..0bf376e5 --- /dev/null +++ b/db/memtable_list.h @@ -0,0 +1,143 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/skiplist.h" +#include "rocksdb/db.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "util/autovector.h" + +namespace rocksdb { + +class InternalKeyComparator; +class Mutex; + +// keeps a list of immutable memtables in a vector. the list is immutable +// if refcount is bigger than one. It is used as a state for Get() and +// Iterator code paths +class MemTableListVersion { + public: + explicit MemTableListVersion(MemTableListVersion* old = nullptr); + + void Ref(); + void Unref(autovector* to_delete = nullptr); + + int size() const; + + // Search all the memtables starting from the most recent one. + // Return the most recent value found, if any. + bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& options); + + void AddIterators(const ReadOptions& options, + std::vector* iterator_list); + + private: + // REQUIRE: m is mutable memtable + void Add(MemTable* m); + // REQUIRE: m is mutable memtable + void Remove(MemTable* m); + + friend class MemTableList; + std::list memlist_; + int size_ = 0; + int refs_ = 0; +}; + +// This class stores references to all the immutable memtables. +// The memtables are flushed to L0 as soon as possible and in +// any order. If there are more than one immutable memtable, their +// flushes can occur concurrently. However, they are 'committed' +// to the manifest in FIFO order to maintain correctness and +// recoverability from a crash. +class MemTableList { + public: + // A list of memtables. 
+ explicit MemTableList(int min_write_buffer_number_to_merge) + : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), + current_(new MemTableListVersion()), + num_flush_not_started_(0), + commit_in_progress_(false), + flush_requested_(false) { + imm_flush_needed.Release_Store(nullptr); + current_->Ref(); + } + ~MemTableList() {} + + MemTableListVersion* current() { return current_; } + + // so that background threads can detect non-nullptr pointer to + // determine whether there is anything more to start flushing. + port::AtomicPointer imm_flush_needed; + + // Returns the total number of memtables in the list + int size() const; + + // Returns true if there is at least one memtable on which flush has + // not yet started. + bool IsFlushPending() const; + + // Returns the earliest memtables that needs to be flushed. The returned + // memtables are guaranteed to be in the ascending order of created time. + void PickMemtablesToFlush(autovector* mems); + + // Reset status of the given memtable list back to pending state so that + // they can get picked up again on the next round of flush. + void RollbackMemtableFlush(const autovector& mems, + uint64_t file_number, + std::set* pending_outputs); + + // Commit a successful flush in the manifest file + Status InstallMemtableFlushResults(const autovector& m, + VersionSet* vset, port::Mutex* mu, + Logger* info_log, uint64_t file_number, + std::set& pending_outputs, + autovector* to_delete, + Directory* db_directory, + LogBuffer* log_buffer); + + // New memtables are inserted at the front of the list. + // Takes ownership of the referenced held on *m by the caller of Add(). + void Add(MemTable* m); + + // Returns an estimate of the number of bytes of data in use. 
+ size_t ApproximateMemoryUsage(); + + // Request a flush of all existing memtables to storage + void FlushRequested() { flush_requested_ = true; } + + // Copying allowed + // MemTableList(const MemTableList&); + // void operator=(const MemTableList&); + + private: + // DB mutex held + void InstallNewVersion(); + + int min_write_buffer_number_to_merge_; + + MemTableListVersion* current_; + + // the number of elements that still need flushing + int num_flush_not_started_; + + // committing in progress + bool commit_in_progress_; + + // Requested a flush of all memtables to storage + bool flush_requested_; + +}; + +} // namespace rocksdb diff --git a/db/merge_context.h b/db/merge_context.h new file mode 100644 index 00000000..bf483a82 --- /dev/null +++ b/db/merge_context.h @@ -0,0 +1,69 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include +#include + +namespace rocksdb { + +const std::deque empty_operand_list; + +// The merge context for merging a user key. +// When doing a Get(), DB will create such a class and pass it when +// issuing Get() operation to memtables and version_set. The operands +// will be fetched from the context when issuing partial of full merge. +class MergeContext { +public: + // Clear all the operands + void Clear() { + if (operand_list) { + operand_list->clear(); + } + } + // Replace all operands with merge_result, which are expected to be the + // merge result of them. 
+ void PushPartialMergeResult(std::string& merge_result) { + assert (operand_list); + operand_list->clear(); + operand_list->push_front(std::move(merge_result)); + } + // Push a merge operand + void PushOperand(const Slice& operand_slice) { + Initialize(); + operand_list->push_front(operand_slice.ToString()); + } + // return total number of operands in the list + size_t GetNumOperands() const { + if (!operand_list) { + return 0; + } + return operand_list->size(); + } + // Get the operand at the index. + Slice GetOperand(int index) const { + assert (operand_list); + return (*operand_list)[index]; + } + // Return all the operands. + const std::deque& GetOperands() const { + if (!operand_list) { + return empty_operand_list; + } + return *operand_list; + } +private: + void Initialize() { + if (!operand_list) { + operand_list.reset(new std::deque()); + } + } + std::unique_ptr> operand_list; +}; + +} // namespace rocksdb + diff --git a/db/merge_helper.cc b/db/merge_helper.cc new file mode 100644 index 00000000..0e36f6ae --- /dev/null +++ b/db/merge_helper.cc @@ -0,0 +1,209 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "merge_helper.h" +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "util/statistics.h" +#include +#include + +namespace rocksdb { + +// PRE: iter points to the first merge type entry +// POST: iter points to the first entry beyond the merge process (or the end) +// keys_, operands_ are updated to reflect the merge result. +// keys_ stores the list of keys encountered while merging. +// operands_ stores the list of merge operands encountered while merging. +// keys_[i] corresponds to operands_[i] for each i. 
+void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, + bool at_bottom, Statistics* stats, int* steps) { + // Get a copy of the internal key, before it's invalidated by iter->Next() + // Also maintain the list of merge operands seen. + keys_.clear(); + operands_.clear(); + keys_.push_front(iter->key().ToString()); + operands_.push_front(iter->value().ToString()); + + success_ = false; // Will become true if we hit Put/Delete or bottom + + // We need to parse the internal key again as the parsed key is + // backed by the internal key! + // Assume no internal key corruption as it has been successfully parsed + // by the caller. + // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid. + ParsedInternalKey orig_ikey; + ParseInternalKey(keys_.back(), &orig_ikey); + + bool hit_the_next_user_key = false; + std::string merge_result; // Temporary value for merge results + if (steps) { + ++(*steps); + } + for (iter->Next(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + assert(operands_.size() >= 1); // Should be invariants! + assert(keys_.size() == operands_.size()); + + if (!ParseInternalKey(iter->key(), &ikey)) { + // stop at corrupted key + if (assert_valid_internal_key_) { + assert(!"corrupted internal key is not expected"); + } + break; + } + + if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) { + // hit a different user key, stop right here + hit_the_next_user_key = true; + break; + } + + if (stop_before && ikey.sequence <= stop_before) { + // hit an entry that's visible by the previous snapshot, can't touch that + break; + } + + // At this point we are guaranteed that we need to process this key. + + if (kTypeDeletion == ikey.type) { + // hit a delete + // => merge nullptr with operands_ + // => store result in operands_.back() (and update keys_.back()) + // => change the entry type to kTypeValue for keys_.back() + // We are done! Return a success if the merge passes. 
+ success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr, + operands_, &merge_result, + logger_); + + // We store the result in keys_.back() and operands_.back() + // if nothing went wrong (i.e.: no operand corruption on disk) + if (success_) { + std::string& key = keys_.back(); // The original key encountered + orig_ikey.type = kTypeValue; + UpdateInternalKey(&key[0], key.size(), + orig_ikey.sequence, orig_ikey.type); + swap(operands_.back(), merge_result); + } else { + RecordTick(stats, NUMBER_MERGE_FAILURES); + } + + // move iter to the next entry (before doing anything else) + iter->Next(); + if (steps) { + ++(*steps); + } + return; + } + + if (kTypeValue == ikey.type) { + // hit a put + // => merge the put value with operands_ + // => store result in operands_.back() (and update keys_.back()) + // => change the entry type to kTypeValue for keys_.back() + // We are done! Success! + const Slice value = iter->value(); + success_ = user_merge_operator_->FullMerge(ikey.user_key, &value, + operands_, &merge_result, + logger_); + + // We store the result in keys_.back() and operands_.back() + // if nothing went wrong (i.e.: no operand corruption on disk) + if (success_) { + std::string& key = keys_.back(); // The original key encountered + orig_ikey.type = kTypeValue; + UpdateInternalKey(&key[0], key.size(), + orig_ikey.sequence, orig_ikey.type); + swap(operands_.back(), merge_result); + } else { + RecordTick(stats, NUMBER_MERGE_FAILURES); + } + + // move iter to the next entry + iter->Next(); + if (steps) { + ++(*steps); + } + return; + } + + if (kTypeMerge == ikey.type) { + // hit a merge + // => merge the operand into the front of the operands_ list + // => use the user's associative merge function to determine how. + // => then continue because we haven't yet seen a Put/Delete. 
+ assert(!operands_.empty()); // Should have at least one element in it + + // keep queuing keys and operands until we either meet a put / delete + // request or later did a partial merge. + keys_.push_front(iter->key().ToString()); + operands_.push_front(iter->value().ToString()); + if (steps) { + ++(*steps); + } + } + } + + // We are sure we have seen this key's entire history if we are at the + // last level and exhausted all internal keys of this user key. + // NOTE: !iter->Valid() does not necessarily mean we hit the + // beginning of a user key, as versions of a user key might be + // split into multiple files (even files on the same level) + // and some files might not be included in the compaction/merge. + // + // There are also cases where we have seen the root of history of this + // key without being sure of it. Then, we simply miss the opportunity + // to combine the keys. Since VersionSet::SetupOtherInputs() always makes + // sure that all merge-operands on the same level get compacted together, + // this will simply lead to these merge operands moving to the next level. + // + // So, we only perform the following logic (to merge all operands together + // without a Put/Delete) if we are certain that we have seen the end of key. 
+ bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom; + if (surely_seen_the_beginning) { + // do a final merge with nullptr as the existing value and say + // bye to the merge type (it's now converted to a Put) + assert(kTypeMerge == orig_ikey.type); + assert(operands_.size() >= 1); + assert(operands_.size() == keys_.size()); + success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr, + operands_, &merge_result, + logger_); + + if (success_) { + std::string& key = keys_.back(); // The original key encountered + orig_ikey.type = kTypeValue; + UpdateInternalKey(&key[0], key.size(), + orig_ikey.sequence, orig_ikey.type); + + // The final value() is always stored in operands_.back() + swap(operands_.back(),merge_result); + } else { + RecordTick(stats, NUMBER_MERGE_FAILURES); + // Do nothing if not success_. Leave keys() and operands() as they are. + } + } else { + // We haven't seen the beginning of the key nor a Put/Delete. + // Attempt to use the user's associative merge function to + // merge the stacked merge operands into a single operand. + + if (operands_.size() >= 2 && + operands_.size() >= min_partial_merge_operands_ && + user_merge_operator_->PartialMergeMulti( + orig_ikey.user_key, + std::deque(operands_.begin(), operands_.end()), + &merge_result, logger_)) { + // Merging of operands (associative merge) was successful. + // Replace operands with the merge result + operands_.clear(); + operands_.push_front(std::move(merge_result)); + keys_.erase(keys_.begin(), keys_.end() - 1); + } + } +} + +} // namespace rocksdb diff --git a/db/merge_helper.h b/db/merge_helper.h new file mode 100644 index 00000000..fef153eb --- /dev/null +++ b/db/merge_helper.h @@ -0,0 +1,105 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifndef MERGE_HELPER_H +#define MERGE_HELPER_H + +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include +#include + +namespace rocksdb { + +class Comparator; +class Iterator; +class Logger; +class MergeOperator; +class Statistics; + +class MergeHelper { + public: + MergeHelper(const Comparator* user_comparator, + const MergeOperator* user_merge_operator, Logger* logger, + unsigned min_partial_merge_operands, + bool assert_valid_internal_key) + : user_comparator_(user_comparator), + user_merge_operator_(user_merge_operator), + logger_(logger), + min_partial_merge_operands_(min_partial_merge_operands), + assert_valid_internal_key_(assert_valid_internal_key), + keys_(), + operands_(), + success_(false) {} + + // Merge entries until we hit + // - a corrupted key + // - a Put/Delete, + // - a different user key, + // - a specific sequence number (snapshot boundary), + // or - the end of iteration + // iter: (IN) points to the first merge type entry + // (OUT) points to the first entry not included in the merge process + // stop_before: (IN) a sequence number that merge should not cross. + // 0 means no restriction + // at_bottom: (IN) true if the iterator covers the bottem level, which means + // we could reach the start of the history of this user key. + void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0, + bool at_bottom = false, Statistics* stats = nullptr, + int* steps = nullptr); + + // Query the merge result + // These are valid until the next MergeUntil call + // If the merging was successful: + // - IsSuccess() will be true + // - key() will have the latest sequence number of the merges. + // The type will be Put or Merge. See IMPORTANT 1 note, below. + // - value() will be the result of merging all the operands together + // - The user should ignore keys() and values(). 
+ // + // IMPORTANT 1: the key type could change after the MergeUntil call. + // Put/Delete + Merge + ... + Merge => Put + // Merge + ... + Merge => Merge + // + // If the merge operator is not associative, and if a Put/Delete is not found + // then the merging will be unsuccessful. In this case: + // - IsSuccess() will be false + // - keys() contains the list of internal keys seen in order of iteration. + // - values() contains the list of values (merges) seen in the same order. + // values() is parallel to keys() so that the first entry in + // keys() is the key associated with the first entry in values() + // and so on. These lists will be the same length. + // All of these pairs will be merges over the same user key. + // See IMPORTANT 2 note below. + // - The user should ignore key() and value(). + // + // IMPORTANT 2: The entries were traversed in order from BACK to FRONT. + // So keys().back() was the first key seen by iterator. + // TODO: Re-style this comment to be like the first one + bool IsSuccess() { return success_; } + Slice key() { assert(success_); return Slice(keys_.back()); } + Slice value() { assert(success_); return Slice(operands_.back()); } + const std::deque& keys() { assert(!success_); return keys_; } + const std::deque& values() { + assert(!success_); return operands_; + } + + private: + const Comparator* user_comparator_; + const MergeOperator* user_merge_operator_; + Logger* logger_; + unsigned min_partial_merge_operands_; + bool assert_valid_internal_key_; // enforce no internal key corruption? 
+ + // the scratch area that holds the result of MergeUntil + // valid up to the next MergeUntil call + std::deque keys_; // Keeps track of the sequence of keys seen + std::deque operands_; // Parallel with keys_; stores the values + bool success_; +}; + +} // namespace rocksdb + +#endif diff --git a/db/merge_operator.cc b/db/merge_operator.cc new file mode 100644 index 00000000..a14df8a8 --- /dev/null +++ b/db/merge_operator.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +/** + * Back-end implementation details specific to the Merge Operator. + */ + +#include "rocksdb/merge_operator.h" + +namespace rocksdb { + +// The default implementation of PartialMergeMulti, which invokes +// PartialMerge multiple times internally and merges two operands at +// a time. +bool MergeOperator::PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const { + assert(operand_list.size() >= 2); + // Simply loop through the operands + std::string temp_value; + Slice temp_slice(operand_list[0]); + + for (size_t i = 1; i < operand_list.size(); ++i) { + auto& operand = operand_list[i]; + if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) { + return false; + } + swap(temp_value, *new_value); + temp_slice = Slice(*new_value); + } + + // The result will be in *new_value. All merges succeeded. + return true; +} + +// Given a "real" merge from the library, call the user's +// associative merge function one-by-one on each of the operands. +// NOTE: It is assumed that the client's merge-operator will handle any errors. 
+bool AssociativeMergeOperator::FullMerge( + const Slice& key, + const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const { + + // Simply loop through the operands + Slice temp_existing; + std::string temp_value; + for (const auto& operand : operand_list) { + Slice value(operand); + if (!Merge(key, existing_value, value, &temp_value, logger)) { + return false; + } + swap(temp_value, *new_value); + temp_existing = Slice(*new_value); + existing_value = &temp_existing; + } + + // The result will be in *new_value. All merges succeeded. + return true; +} + +// Call the user defined simple merge on the operands; +// NOTE: It is assumed that the client's merge-operator will handle any errors. +bool AssociativeMergeOperator::PartialMerge( + const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const { + return Merge(key, &left_operand, right_operand, new_value, logger); +} + +} // namespace rocksdb diff --git a/db/merge_test.cc b/db/merge_test.cc new file mode 100644 index 00000000..4b98f058 --- /dev/null +++ b/db/merge_test.cc @@ -0,0 +1,464 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/write_batch_internal.h" +#include "utilities/merge_operators.h" +#include "util/testharness.h" +#include "utilities/utility_db.h" + +using namespace std; +using namespace rocksdb; + +namespace { + int numMergeOperatorCalls; + void resetNumMergeOperatorCalls() { + numMergeOperatorCalls = 0; + } + + int num_partial_merge_calls; + void resetNumPartialMergeCalls() { + num_partial_merge_calls = 0; + } +} + +class CountMergeOperator : public AssociativeMergeOperator { + public: + CountMergeOperator() { + mergeOperator_ = MergeOperators::CreateUInt64AddOperator(); + } + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override { + ++numMergeOperatorCalls; + if (existing_value == nullptr) { + new_value->assign(value.data(), value.size()); + return true; + } + + return mergeOperator_->PartialMerge( + key, + *existing_value, + value, + new_value, + logger); + } + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const { + ++num_partial_merge_calls; + return mergeOperator_->PartialMergeMulti(key, operand_list, new_value, + logger); + } + + virtual const char* Name() const override { + return "UInt64AddOperator"; + } + + private: + std::shared_ptr mergeOperator_; +}; + +std::shared_ptr OpenDb(const string& dbname, const bool ttl = false, + const size_t max_successive_merges = 0, + const uint32_t min_partial_merge_operands = 2) { + DB* db; + StackableDB* sdb; + Options options; + options.create_if_missing = true; + options.merge_operator = std::make_shared(); + options.max_successive_merges = max_successive_merges; + options.min_partial_merge_operands = 
min_partial_merge_operands; + Status s; + DestroyDB(dbname, Options()); + if (ttl) { + cout << "Opening database with TTL\n"; + s = UtilityDB::OpenTtlDB(options, dbname, &sdb); + db = sdb; + } else { + s = DB::Open(options, dbname, &db); + } + if (!s.ok()) { + cerr << s.ToString() << endl; + assert(false); + } + return std::shared_ptr(db); +} + +// Imagine we are maintaining a set of uint64 counters. +// Each counter has a distinct name. And we would like +// to support four high level operations: +// set, add, get and remove +// This is a quick implementation without a Merge operation. +class Counters { + + protected: + std::shared_ptr db_; + + WriteOptions put_option_; + ReadOptions get_option_; + WriteOptions delete_option_; + + uint64_t default_; + + public: + explicit Counters(std::shared_ptr db, uint64_t defaultCount = 0) + : db_(db), + put_option_(), + get_option_(), + delete_option_(), + default_(defaultCount) { + assert(db_); + } + + virtual ~Counters() {} + + // public interface of Counters. + // All four functions return false + // if the underlying level db operation failed. 
+ + // mapped to a levedb Put + bool set(const string& key, uint64_t value) { + // just treat the internal rep of int64 as the string + Slice slice((char *)&value, sizeof(value)); + auto s = db_->Put(put_option_, key, slice); + + if (s.ok()) { + return true; + } else { + cerr << s.ToString() << endl; + return false; + } + } + + // mapped to a rocksdb Delete + bool remove(const string& key) { + auto s = db_->Delete(delete_option_, key); + + if (s.ok()) { + return true; + } else { + cerr << s.ToString() << std::endl; + return false; + } + } + + // mapped to a rocksdb Get + bool get(const string& key, uint64_t *value) { + string str; + auto s = db_->Get(get_option_, key, &str); + + if (s.IsNotFound()) { + // return default value if not found; + *value = default_; + return true; + } else if (s.ok()) { + // deserialization + if (str.size() != sizeof(uint64_t)) { + cerr << "value corruption\n"; + return false; + } + *value = DecodeFixed64(&str[0]); + return true; + } else { + cerr << s.ToString() << std::endl; + return false; + } + } + + // 'add' is implemented as get -> modify -> set + // An alternative is a single merge operation, see MergeBasedCounters + virtual bool add(const string& key, uint64_t value) { + uint64_t base = default_; + return get(key, &base) && set(key, base + value); + } + + + // convenience functions for testing + void assert_set(const string& key, uint64_t value) { + assert(set(key, value)); + } + + void assert_remove(const string& key) { + assert(remove(key)); + } + + uint64_t assert_get(const string& key) { + uint64_t value = default_; + assert(get(key, &value)); + return value; + } + + void assert_add(const string& key, uint64_t value) { + assert(add(key, value)); + } +}; + +// Implement 'add' directly with the new Merge operation +class MergeBasedCounters : public Counters { + private: + WriteOptions merge_option_; // for merge + + public: + explicit MergeBasedCounters(std::shared_ptr db, uint64_t defaultCount = 0) + : Counters(db, 
defaultCount), + merge_option_() { + } + + // mapped to a rocksdb Merge operation + virtual bool add(const string& key, uint64_t value) override { + char encoded[sizeof(uint64_t)]; + EncodeFixed64(encoded, value); + Slice slice(encoded, sizeof(uint64_t)); + auto s = db_->Merge(merge_option_, key, slice); + + if (s.ok()) { + return true; + } else { + cerr << s.ToString() << endl; + return false; + } + } +}; + +void dumpDb(DB* db) { + auto it = unique_ptr(db->NewIterator(ReadOptions())); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + uint64_t value = DecodeFixed64(it->value().data()); + cout << it->key().ToString() << ": " << value << endl; + } + assert(it->status().ok()); // Check for any errors found during the scan +} + +void testCounters(Counters& counters, DB* db, bool test_compaction) { + + FlushOptions o; + o.wait = true; + + counters.assert_set("a", 1); + + if (test_compaction) db->Flush(o); + + assert(counters.assert_get("a") == 1); + + counters.assert_remove("b"); + + // defaut value is 0 if non-existent + assert(counters.assert_get("b") == 0); + + counters.assert_add("a", 2); + + if (test_compaction) db->Flush(o); + + // 1+2 = 3 + assert(counters.assert_get("a")== 3); + + dumpDb(db); + + std::cout << "1\n"; + + // 1+...+49 = ? 
+ uint64_t sum = 0; + for (int i = 1; i < 50; i++) { + counters.assert_add("b", i); + sum += i; + } + assert(counters.assert_get("b") == sum); + + std::cout << "2\n"; + dumpDb(db); + + std::cout << "3\n"; + + if (test_compaction) { + db->Flush(o); + + cout << "Compaction started ...\n"; + db->CompactRange(nullptr, nullptr); + cout << "Compaction ended\n"; + + dumpDb(db); + + assert(counters.assert_get("a")== 3); + assert(counters.assert_get("b") == sum); + } +} + +void testSuccessiveMerge( + Counters& counters, int max_num_merges, int num_merges) { + + counters.assert_remove("z"); + uint64_t sum = 0; + + for (int i = 1; i <= num_merges; ++i) { + resetNumMergeOperatorCalls(); + counters.assert_add("z", i); + sum += i; + + if (i % (max_num_merges + 1) == 0) { + assert(numMergeOperatorCalls == max_num_merges + 1); + } else { + assert(numMergeOperatorCalls == 0); + } + + resetNumMergeOperatorCalls(); + assert(counters.assert_get("z") == sum); + assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + } +} + +void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, + int count) { + FlushOptions o; + o.wait = true; + + // Test case 1: partial merge should be called when the number of merge + // operands exceeds the threshold. + uint64_t tmp_sum = 0; + resetNumPartialMergeCalls(); + for (int i = 1; i <= count; i++) { + counters->assert_add("b", i); + tmp_sum += i; + } + db->Flush(o); + db->CompactRange(nullptr, nullptr); + ASSERT_EQ(tmp_sum, counters->assert_get("b")); + if (count > max_merge) { + // in this case, FullMerge should be called instead. + ASSERT_EQ(num_partial_merge_calls, 0); + } else { + // if count >= min_merge, then partial merge should be called once. + ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1)); + } + + // Test case 2: partial merge should not be called when a put is found. 
+ resetNumPartialMergeCalls(); + tmp_sum = 0; + db->Put(rocksdb::WriteOptions(), "c", "10"); + for (int i = 1; i <= count; i++) { + counters->assert_add("c", i); + tmp_sum += i; + } + db->Flush(o); + db->CompactRange(nullptr, nullptr); + ASSERT_EQ(tmp_sum, counters->assert_get("c")); + ASSERT_EQ(num_partial_merge_calls, 0); +} + +void testSingleBatchSuccessiveMerge( + DB* db, + int max_num_merges, + int num_merges) { + assert(num_merges > max_num_merges); + + Slice key("BatchSuccessiveMerge"); + uint64_t merge_value = 1; + Slice merge_value_slice((char *)&merge_value, sizeof(merge_value)); + + // Create the batch + WriteBatch batch; + for (int i = 0; i < num_merges; ++i) { + batch.Merge(key, merge_value_slice); + } + + // Apply to memtable and count the number of merges + resetNumMergeOperatorCalls(); + { + Status s = db->Write(WriteOptions(), &batch); + assert(s.ok()); + } + assert(numMergeOperatorCalls == + num_merges - (num_merges % (max_num_merges + 1))); + + // Get the value + resetNumMergeOperatorCalls(); + string get_value_str; + { + Status s = db->Get(ReadOptions(), key, &get_value_str); + assert(s.ok()); + } + assert(get_value_str.size() == sizeof(uint64_t)); + uint64_t get_value = DecodeFixed64(&get_value_str[0]); + ASSERT_EQ(get_value, num_merges * merge_value); + ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1))); +} + +void runTest(int argc, const string& dbname, const bool use_ttl = false) { + auto db = OpenDb(dbname, use_ttl); + + { + cout << "Test read-modify-write counters... \n"; + Counters counters(db, 0); + testCounters(counters, db.get(), true); + } + + bool compact = false; + if (argc > 1) { + compact = true; + cout << "Turn on Compaction\n"; + } + + { + cout << "Test merge-based counters... \n"; + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + } + + DestroyDB(dbname, Options()); + db.reset(); + + { + cout << "Test merge in memtable... 
\n"; + size_t max_merge = 5; + auto db = OpenDb(dbname, use_ttl, max_merge); + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + testSuccessiveMerge(counters, max_merge, max_merge * 2); + testSingleBatchSuccessiveMerge(db.get(), 5, 7); + DestroyDB(dbname, Options()); + } + + { + cout << "Test Partial-Merge\n"; + size_t max_merge = 100; + for (uint32_t min_merge = 5; min_merge < 25; min_merge += 5) { + for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) { + auto db = OpenDb(dbname, use_ttl, max_merge, min_merge); + MergeBasedCounters counters(db, 0); + testPartialMerge(&counters, db.get(), max_merge, min_merge, count); + DestroyDB(dbname, Options()); + } + { + auto db = OpenDb(dbname, use_ttl, max_merge, min_merge); + MergeBasedCounters counters(db, 0); + testPartialMerge(&counters, db.get(), max_merge, min_merge, + min_merge * 10); + DestroyDB(dbname, Options()); + } + } + } +} + +int main(int argc, char *argv[]) { + //TODO: Make this test like a general rocksdb unit-test + runTest(argc, test::TmpDir() + "/merge_testdb"); + runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database + printf("Passed all tests!\n"); + return 0; +} diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc new file mode 100644 index 00000000..a182fb52 --- /dev/null +++ b/db/perf_context_test.cc @@ -0,0 +1,358 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include +#include "/usr/include/valgrind/callgrind.h" + +#include "rocksdb/db.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" +#include "util/histogram.h" +#include "util/stop_watch.h" +#include "util/testharness.h" + + +bool FLAGS_random_key = false; +bool FLAGS_use_set_based_memetable = false; +int FLAGS_total_keys = 100; +int FLAGS_write_buffer_size = 1000000000; +int FLAGS_max_write_buffer_number = 8; +int FLAGS_min_write_buffer_number_to_merge = 7; + +// Path to the database on file system +const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test"; + +namespace rocksdb { + +std::shared_ptr OpenDb() { + DB* db; + Options options; + options.create_if_missing = true; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + if (FLAGS_use_set_based_memetable) { + auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0); + options.memtable_factory.reset( + NewHashSkipListRepFactory(prefix_extractor)); + } + + Status s = DB::Open(options, kDbName, &db); + ASSERT_OK(s); + return std::shared_ptr(db); +} + +class PerfContextTest { }; + +TEST(PerfContextTest, SeekIntoDeletion) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + db->Put(write_options, key, value); + } + + for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { + std::string key = "k" + std::to_string(i); + db->Delete(write_options, key); + } + + HistogramImpl hist_get; + HistogramImpl hist_get_time; + for (int i = 0; i < FLAGS_total_keys - 1; ++i) { + std::string key = "k" + std::to_string(i); + std::string value; + + perf_context.Reset(); + 
StopWatchNano timer(Env::Default(), true); + auto status = db->Get(read_options, key, &value); + auto elapsed_nanos = timer.ElapsedNanos(); + ASSERT_TRUE(status.IsNotFound()); + hist_get.Add(perf_context.user_key_comparison_count); + hist_get_time.Add(elapsed_nanos); + } + + std::cout << "Get uesr key comparison: \n" << hist_get.ToString() + << "Get time: \n" << hist_get_time.ToString(); + + HistogramImpl hist_seek_to_first; + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->SeekToFirst(); + hist_seek_to_first.Add(perf_context.user_key_comparison_count); + auto elapsed_nanos = timer.ElapsedNanos(); + + std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString() + << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n" + << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n" + << "elapsed: " << elapsed_nanos << "\n"; + + HistogramImpl hist_seek; + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::unique_ptr iter(db->NewIterator(read_options)); + std::string key = "k" + std::to_string(i); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + auto elapsed_nanos = timer.ElapsedNanos(); + hist_seek.Add(perf_context.user_key_comparison_count); + std::cout << "seek cmp: " << perf_context.user_key_comparison_count + << " ikey skipped " << perf_context.internal_key_skipped_count + << " idelete skipped " << perf_context.internal_delete_skipped_count + << " elapsed: " << elapsed_nanos << "ns\n"; + + perf_context.Reset(); + ASSERT_TRUE(iter->Valid()); + StopWatchNano timer2(Env::Default(), true); + iter->Next(); + auto elapsed_nanos2 = timer2.ElapsedNanos(); + std::cout << "next cmp: " << perf_context.user_key_comparison_count + << "elapsed: " << elapsed_nanos2 << "ns\n"; + } + + std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString(); +} + +TEST(PerfContextTest, 
StopWatchNanoOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatchNano timer(Env::Default(), true); + for (auto& timing : timings) { + timing = timer.ElapsedNanos(true /* reset */); + } + + HistogramImpl histogram; + for (const auto timing : timings) { + histogram.Add(timing); + } + + std::cout << histogram.ToString(); +} + +TEST(PerfContextTest, StopWatchOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatch timer(Env::Default()); + for (auto& timing : timings) { + timing = timer.ElapsedMicros(); + } + + HistogramImpl histogram; + uint64_t prev_timing = 0; + for (const auto timing : timings) { + histogram.Add(timing - prev_timing); + prev_timing = timing; + } + + std::cout << histogram.ToString(); +} + +void ProfileKeyComparison() { + DestroyDB(kDbName, Options()); // Start this test with a fresh DB + + auto db = OpenDb(); + + WriteOptions write_options; + ReadOptions read_options; + + HistogramImpl hist_put; + HistogramImpl hist_get; + HistogramImpl hist_get_snapshot; + HistogramImpl hist_get_memtable; + HistogramImpl hist_get_post_process; + HistogramImpl hist_num_memtable_checked; + HistogramImpl hist_write_pre_post; + HistogramImpl hist_write_wal_time; + HistogramImpl hist_write_memtable_time; + + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; + + std::vector keys; + for (int i = 0; i < FLAGS_total_keys; ++i) { + keys.push_back(i); + } + + if (FLAGS_random_key) { + std::random_shuffle(keys.begin(), keys.end()); + } + + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + perf_context.Reset(); + db->Put(write_options, key, value); + hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); + hist_write_wal_time.Add(perf_context.write_wal_time); + 
hist_write_memtable_time.Add(perf_context.write_memtable_time); + hist_put.Add(perf_context.user_key_comparison_count); + + perf_context.Reset(); + db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); + hist_get.Add(perf_context.user_key_comparison_count); + } + + std::cout << "Put uesr key comparison: \n" << hist_put.ToString() + << "Get uesr key comparison: \n" << hist_get.ToString(); + std::cout << "Put(): Pre and Post Process Time: \n" + << hist_write_pre_post.ToString() + << " Writing WAL time: \n" + << hist_write_wal_time.ToString() << "\n" + << " Writing Mem Table time: \n" + << hist_write_memtable_time.ToString() << "\n"; + + std::cout << "Get(): Time to get snapshot: \n" + << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" + << hist_get_post_process.ToString() << "\n"; +} + +TEST(PerfContextTest, KeyComparisonCount) { + SetPerfLevel(kEnableCount); + ProfileKeyComparison(); + + SetPerfLevel(kDisable); + ProfileKeyComparison(); + + SetPerfLevel(kEnableTime); + ProfileKeyComparison(); +} + +// make perf_context_test +// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison +// For one memtable: +// ./perf_context_test --write_buffer_size=500000 --total_keys=10000 +// For two memtables: +// ./perf_context_test --write_buffer_size=250000 --total_keys=10000 +// Specify --random_key=1 to shuffle the key before insertion +// Results show that, for sequential insertion, worst-case Seek Key comparison +// is close to the total number of keys (linear), when there is only one +// memtable. 
When there are two memtables, even the avg Seek Key comparison +// starts to become linear to the input size. + +TEST(PerfContextTest, SeekKeyComparison) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; + + std::vector keys; + for (int i = 0; i < FLAGS_total_keys; ++i) { + keys.push_back(i); + } + + if (FLAGS_random_key) { + std::random_shuffle(keys.begin(), keys.end()); + } + + HistogramImpl hist_put_time; + HistogramImpl hist_wal_time; + HistogramImpl hist_time_diff; + + SetPerfLevel(kEnableTime); + StopWatchNano timer(Env::Default()); + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + perf_context.Reset(); + timer.Start(); + db->Put(write_options, key, value); + auto put_time = timer.ElapsedNanos(); + hist_put_time.Add(put_time); + hist_wal_time.Add(perf_context.write_wal_time); + hist_time_diff.Add(put_time - perf_context.write_wal_time); + } + + std::cout << "Put time:\n" << hist_put_time.ToString() + << "WAL time:\n" << hist_wal_time.ToString() + << "time diff:\n" << hist_time_diff.ToString(); + + HistogramImpl hist_seek; + HistogramImpl hist_next; + + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + std::unique_ptr iter(db->NewIterator(read_options)); + perf_context.Reset(); + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), value); + hist_seek.Add(perf_context.user_key_comparison_count); + } + + std::unique_ptr iter(db->NewIterator(read_options)); + for (iter->SeekToFirst(); iter->Valid();) { + perf_context.Reset(); + iter->Next(); + hist_next.Add(perf_context.user_key_comparison_count); + } + + std::cout << "Seek:\n" << hist_seek.ToString() + << "Next:\n" << hist_next.ToString(); +} + +} + +int main(int argc, 
char** argv) { + + for (int i = 1; i < argc; i++) { + int n; + char junk; + + if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { + FLAGS_write_buffer_size = n; + } + + if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) { + FLAGS_total_keys = n; + } + + if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_random_key = n; + } + + if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_set_based_memetable = n; + } + + } + + std::cout << kDbName << "\n"; + + rocksdb::test::RunAllTests(); + return 0; +} diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc new file mode 100644 index 00000000..6a95a258 --- /dev/null +++ b/db/plain_table_db_test.cc @@ -0,0 +1,822 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include + +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/plain_table_reader.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { + +class PlainTableDBTest { + protected: + private: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + public: + PlainTableDBTest() : env_(Env::Default()) { + ro_.prefix_seek = true; + dbname_ = test::TmpDir() + "/plain_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~PlainTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + ReadOptions ro_; + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.table_factory.reset(NewPlainTableFactory(16, 2, 0.8, 3)); + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.allow_mmap_reads = true; + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options = ro_; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = 
NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(PlainTableDBTest, Empty) { + ASSERT_TRUE(dbfull() != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +class TestPlainTableReader : public PlainTableReader { + public: + TestPlainTableReader(const EnvOptions& storage_options, + const InternalKeyComparator& icomparator, + uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + const TableProperties* table_properties, + unique_ptr&& file, + const Options& options, bool* expect_bloom_not_match) + : PlainTableReader(options, std::move(file), storage_options, icomparator, + file_size, bloom_bits_per_key, hash_table_ratio, + index_sparseness, table_properties), + expect_bloom_not_match_(expect_bloom_not_match) { + Status s = PopulateIndex(); + ASSERT_TRUE(s.ok()); + } + + virtual ~TestPlainTableReader() {} + + private: + virtual bool MatchBloom(uint32_t hash) const override { + bool ret = PlainTableReader::MatchBloom(hash); + ASSERT_TRUE(!*expect_bloom_not_match_ || !ret); + return ret; + } + bool* expect_bloom_not_match_; +}; + +extern const uint64_t kPlainTableMagicNumber; +class TestPlainTableFactory : public PlainTableFactory { + public: + explicit TestPlainTableFactory(bool* expect_bloom_not_match, + uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16) + : PlainTableFactory(user_key_len, user_key_len, hash_table_ratio, + hash_table_ratio), + bloom_bits_per_key_(bloom_bits_per_key), + 
hash_table_ratio_(hash_table_ratio), + index_sparseness_(index_sparseness), + expect_bloom_not_match_(expect_bloom_not_match) {} + + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override { + TableProperties* props = nullptr; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + options.env, options.info_log.get(), &props); + ASSERT_TRUE(s.ok()); + + std::unique_ptr new_reader(new TestPlainTableReader( + soptions, internal_comparator, file_size, bloom_bits_per_key_, + hash_table_ratio_, index_sparseness_, props, std::move(file), options, + expect_bloom_not_match_)); + + *table = std::move(new_reader); + return s; + } + + private: + int bloom_bits_per_key_; + double hash_table_ratio_; + size_t index_sparseness_; + bool* expect_bloom_not_match_; +}; + +TEST(PlainTableDBTest, Flush) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.table_factory.reset( + NewTotalOrderPlainTableFactory(16, bloom_bits, 2)); + } else { + options.table_factory.reset(NewPlainTableFactory(16, bloom_bits)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + } + } +} + +TEST(PlainTableDBTest, Flush2) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value 
should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + + // Key doesn't exist any more but prefix exists. + if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + } + expect_bloom_not_match = false; + } + } + } +} + +TEST(PlainTableDBTest, Iterator) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, 16, bloom_bits, 0, 2)); + } else { + options.table_factory.reset( + new TestPlainTableFactory(&expect_bloom_not_match, 16, bloom_bits)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } + + // Test Bloom Filter + if (bloom_bits > 0) { + if (!total_order) { + // Neither key nor value should exist. 
+ expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } else { + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } + } + + delete iter; + } + } +} + +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} + +TEST(PlainTableDBTest, IteratorLargeKeys) { + Options options = CurrentOptions(); + options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16)); + options.create_if_missing = true; + options.prefix_extractor.reset(); + DestroyAndReopen(&options); + + std::string key_list[] = { + MakeLongKey(30, '0'), + MakeLongKey(16, '1'), + MakeLongKey(32, '2'), + MakeLongKey(60, '3'), + MakeLongKey(90, '4'), + MakeLongKey(50, '5'), + MakeLongKey(26, '6') + }; + + for (size_t i = 0; i < 7; i++) { + ASSERT_OK(Put(key_list[i], std::to_string(i))); + } + + dbfull()->TEST_FlushMemTable(); + + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek(key_list[0]); + + for (size_t i = 0; i < 7; i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key_list[i], iter->key().ToString()); + ASSERT_EQ(std::to_string(i), iter->value().ToString()); + iter->Next(); + } + + ASSERT_TRUE(!iter->Valid()); + + delete iter; +} + +// A test comparator which compare two strings in this way: +// (1) first compare prefix of 8 bytes in alphabet order, +// (2) if two strings share the same prefix, sort the other part of the string +// in the reverse alphabet order. 
+class SimpleSuffixReverseComparator : public Comparator { + public: + SimpleSuffixReverseComparator() {} + + virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + Slice prefix_a = Slice(a.data(), 8); + Slice prefix_b = Slice(b.data(), 8); + int prefix_comp = prefix_a.compare(prefix_b); + if (prefix_comp != 0) { + return prefix_comp; + } else { + Slice suffix_a = Slice(a.data() + 8, a.size() - 8); + Slice suffix_b = Slice(b.data() + 8, b.size() - 8); + return -(suffix_a.compare(suffix_b)); + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + SimpleSuffixReverseComparator comp; + options.comparator = ∁ + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + 
iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + + delete iter; +} + +TEST(PlainTableDBTest, HashBucketConflict) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo8"); + ASSERT_TRUE(!iter->Valid() || + 
options.comparator->Compare(iter->key(), "20000001") > 0); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } +} + +TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { + for (unsigned char i = 1; i <= 3; i++) { + Options options = CurrentOptions(); + options.create_if_missing = true; + SimpleSuffixReverseComparator comp; + options.comparator = ∁ + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 2 ^ i)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + 
ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo3", iter->key().ToString()); + + iter->Seek("5000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo2", iter->key().ToString()); + + std::string seek_key = "2000000000000bar"; + iter->Seek(seek_key); + ASSERT_TRUE(!iter->Valid() || + options.prefix_extractor->Transform(iter->key()) != + options.prefix_extractor->Transform(seek_key)); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } +} + +TEST(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + options.table_factory.reset(NewTotalOrderPlainTableFactory(16, 0, 5)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v3")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("5000000000000fo2")); + + ASSERT_EQ("NOT_FOUND", Get("8000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + + Iterator* iter = dbfull()->NewIterator(ro_); + + iter->Seek("5000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo8"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +TEST(PlainTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 
12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/prefix_filter_iterator.h b/db/prefix_filter_iterator.h new file mode 100644 index 00000000..e868c7a5 --- /dev/null +++ b/db/prefix_filter_iterator.h @@ -0,0 +1,75 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Wrap an underlying iterator, but exclude any results not starting +// with a given prefix. Seeking to keys not beginning with the prefix +// is invalid, and SeekToLast is not implemented (that would be +// non-trivial), but otherwise this iterator will behave just like the +// underlying iterator would if there happened to be no non-matching +// keys in the dataset. 
+ +#pragma once +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +class PrefixFilterIterator : public Iterator { + private: + Iterator* iter_; + const Slice &prefix_; + const SliceTransform *prefix_extractor_; + Status status_; + + public: + PrefixFilterIterator(Iterator* iter, + const Slice &prefix, + const SliceTransform* prefix_extractor) + : iter_(iter), prefix_(prefix), + prefix_extractor_(prefix_extractor), + status_(Status::OK()) { + if (prefix_extractor == nullptr) { + status_ = Status::InvalidArgument("A prefix filter may not be used " + "unless a function is also defined " + "for extracting prefixes"); + } else if (!prefix_extractor_->InRange(prefix)) { + status_ = Status::InvalidArgument("Must provide a slice for prefix which" + "is a prefix for some key"); + } + } + ~PrefixFilterIterator() { + delete iter_; + } + Slice key() const { return iter_->key(); } + Slice value() const { return iter_->value(); } + Status status() const { + if (!status_.ok()) { + return status_; + } + return iter_->status(); + } + void Next() { iter_->Next(); } + void Prev() { iter_->Prev(); } + void Seek(const Slice& k) { + if (prefix_extractor_->Transform(k) == prefix_) { + iter_->Seek(k); + } else { + status_ = Status::InvalidArgument("Seek must begin with target prefix"); + } + } + void SeekToFirst() { + Seek(prefix_); + } + void SeekToLast() { + status_ = Status::NotSupported("SeekToLast is incompatible with prefixes"); + } + bool Valid() const { + return (status_.ok() && iter_->Valid() && + prefix_extractor_->Transform(iter_->key()) == prefix_); + } +}; + +} // namespace rocksdb diff --git a/db/prefix_test.cc b/db/prefix_test.cc new file mode 100644 index 00000000..89e31b60 --- /dev/null +++ b/db/prefix_test.cc @@ -0,0 +1,628 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" +#include "util/histogram.h" +#include "util/stop_watch.h" +#include "util/testharness.h" + +DEFINE_bool(use_prefix_hash_memtable, true, ""); +DEFINE_bool(trigger_deadlock, false, + "issue delete in range scan to trigger PrefixHashMap deadlock"); +DEFINE_uint64(bucket_count, 100000, "number of buckets"); +DEFINE_uint64(num_locks, 10001, "number of locks"); +DEFINE_bool(random_prefix, false, "randomize prefix"); +DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); +DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); +DEFINE_int64(write_buffer_size, 33554432, ""); +DEFINE_int64(max_write_buffer_number, 2, ""); +DEFINE_int64(min_write_buffer_number_to_merge, 1, ""); +DEFINE_int32(skiplist_height, 4, ""); +DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); +DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); +DEFINE_int32(value_size, 40, ""); + +// Path to the database on file system +const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test"; + +namespace rocksdb { + +struct TestKey { + uint64_t prefix; + uint64_t sorted; + + TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {} +}; + +// return a slice backed by test_key +inline Slice TestKeyToSlice(const TestKey& test_key) { + return Slice((const char*)&test_key, sizeof(test_key)); +} + +inline const TestKey* SliceToTestKey(const Slice& slice) { + return (const TestKey*)slice.data(); +} + +class TestKeyComparator : public Comparator { + public: + + // Compare needs to be aware of the possibility of a and/or b 
is + // prefix only + virtual int Compare(const Slice& a, const Slice& b) const { + const TestKey* key_a = SliceToTestKey(a); + const TestKey* key_b = SliceToTestKey(b); + if (key_a->prefix != key_b->prefix) { + if (key_a->prefix < key_b->prefix) return -1; + if (key_a->prefix > key_b->prefix) return 1; + } else { + ASSERT_TRUE(key_a->prefix == key_b->prefix); + // note, both a and b could be prefix only + if (a.size() != b.size()) { + // one of them is prefix + ASSERT_TRUE( + (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) || + (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey))); + if (a.size() < b.size()) return -1; + if (a.size() > b.size()) return 1; + } else { + // both a and b are prefix + if (a.size() == sizeof(uint64_t)) { + return 0; + } + + // both a and b are whole key + ASSERT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey)); + if (key_a->sorted < key_b->sorted) return -1; + if (key_a->sorted > key_b->sorted) return 1; + if (key_a->sorted == key_b->sorted) return 0; + } + } + return 0; + } + + virtual const char* Name() const override { + return "TestKeyComparator"; + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + } + + virtual void FindShortSuccessor(std::string* key) const {} + +}; + +void PutKey(DB* db, WriteOptions write_options, uint64_t prefix, + uint64_t suffix, const Slice& value) { + TestKey test_key(prefix, suffix); + Slice key = TestKeyToSlice(test_key); + ASSERT_OK(db->Put(write_options, key, value)); +} + +void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) { + TestKey test_key(prefix, suffix); + Slice key = TestKeyToSlice(test_key); + iter->Seek(key); +} + +const std::string kNotFoundResult = "NOT_FOUND"; + +std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix, + uint64_t suffix) { + TestKey test_key(prefix, suffix); + Slice key = TestKeyToSlice(test_key); + + std::string result; + Status s = db->Get(read_options, 
key, &result); + if (s.IsNotFound()) { + result = kNotFoundResult; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; +} + +class PrefixTest { + public: + std::shared_ptr OpenDb() { + DB* db; + + options.create_if_missing = true; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits; + options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes; + + Status s = DB::Open(options, kDbName, &db); + ASSERT_OK(s); + return std::shared_ptr(db); + } + + void FirstOption() { + option_config_ = kBegin; + } + + bool NextOptions(int bucket_count) { + // skip some options + option_config_++; + if (option_config_ < kEnd) { + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + switch(option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height)); + return true; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count)); + return true; + default: + return false; + } + } + return false; + } + + PrefixTest() : option_config_(kBegin) { + options.comparator = new TestKeyComparator(); + } + ~PrefixTest() { + delete options.comparator; + } + protected: + enum OptionConfig { + kBegin, + kHashSkipList, + kHashLinkList, + kEnd + }; + int option_config_; + Options options; +}; + +TEST(PrefixTest, TestResult) { + for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { + FirstOption(); + while (NextOptions(num_buckets)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << " number of buckets: " << num_buckets + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + read_options.prefix_seek = true; + + // 1. 
Insert one row. + Slice v16("v16"); + PutKey(db.get(), write_options, 1, 6, v16); + std::unique_ptr iter(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6)); + + // 2. Insert an entry for the same prefix as the last entry in the bucket. + Slice v17("v17"); + PutKey(db.get(), write_options, 1, 7, v17); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + + // 3. Insert an entry for the same prefix as the head of the bucket. 
+ Slice v15("v15"); + PutKey(db.get(), write_options, 1, 5, v15); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7)); + + // 4. Insert an entry with a larger prefix + Slice v22("v22"); + PutKey(db.get(), write_options, 2, 2, v22); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 2, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 5. 
Insert an entry with a smaller prefix + Slice v02("v02"); + PutKey(db.get(), write_options, 0, 2, v02); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 0, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 6. Insert to the beginning and the end of the first prefix + Slice v13("v13"); + Slice v18("v18"); + PutKey(db.get(), write_options, 1, 3, v13); + PutKey(db.get(), write_options, 1, 8, v18); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 3); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v13 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v18 == iter->value()); + + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2)); + ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2)); + ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3)); + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), 
Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8)); + } + } +} + +TEST(PrefixTest, FullIterator) { + while (NextOptions(1000000)) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + + std::vector prefixes; + for (uint64_t i = 0; i < 100; ++i) { + prefixes.push_back(i); + } + std::random_shuffle(prefixes.begin(), prefixes.end()); + + for (auto prefix : prefixes) { + for (uint64_t i = 0; i < 200; ++i) { + TestKey test_key(prefix, i); + Slice key = TestKeyToSlice(test_key); + ASSERT_OK(db->Put(write_options, key, Slice("0"))); + } + } + + auto func = [](void* db_void) { + auto db = reinterpret_cast(db_void); + std::unique_ptr iter(db->NewIterator(ReadOptions())); + iter->SeekToFirst(); + for (int i = 0; i < 3; ++i) { + iter->Next(); + } + }; + + auto env = Env::Default(); + for (int i = 0; i < 16; ++i) { + env->StartThread(func, reinterpret_cast(db.get())); + } + env->WaitForJoin(); + } +} + +TEST(PrefixTest, DynamicPrefixIterator) { + while (NextOptions(FLAGS_bucket_count)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } + + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } + + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; + + // insert x random prefix, each with y continuous element. 
+ for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + Slice key = TestKeyToSlice(test_key); + std::string value(FLAGS_value_size, 0); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); + + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + if (FLAGS_use_prefix_hash_memtable) { + read_options.prefix_seek = true; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + for (auto prefix : prefixes) { + TestKey test_key(prefix, FLAGS_items_per_prefix / 2); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); + } + + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); + + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; + + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 10000; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + + 
perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); + } +} + +TEST(PrefixTest, PrefixHash) { + while (NextOptions(FLAGS_bucket_count)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + std::vector prefixes; + for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) { + prefixes.push_back(i); + } + + if (FLAGS_random_prefix) { + std::random_shuffle(prefixes.begin(), prefixes.end()); + } + + // insert x random prefix, each with y continuous element. + HistogramImpl hist_put_time; + HistogramImpl hist_put_comparison; + + for (auto prefix : prefixes) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + TestKey test_key(prefix, sorted); + + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(sorted); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + ASSERT_OK(db->Put(write_options, key, value)); + hist_put_time.Add(timer.ElapsedNanos()); + hist_put_comparison.Add(perf_context.user_key_comparison_count); + } + } + + std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() + << "Put time: \n" << hist_put_time.ToString(); + + + // test seek existing keys + HistogramImpl hist_seek_time; + HistogramImpl hist_seek_comparison; + + for (auto prefix : prefixes) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + std::string value = "v" + std::to_string(0); + + Slice key_prefix; + if (FLAGS_use_prefix_hash_memtable) { + key_prefix = options.prefix_extractor->Transform(key); + 
read_options.prefix = &key_prefix; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + uint64_t total_keys = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + if (FLAGS_trigger_deadlock) { + std::cout << "Behold the deadlock!\n"; + db->Delete(write_options, iter->key()); + } + auto test_key = SliceToTestKey(iter->key()); + if (test_key->prefix != prefix) break; + total_keys++; + } + hist_seek_time.Add(timer.ElapsedNanos()); + hist_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_EQ(total_keys, FLAGS_items_per_prefix); + } + + std::cout << "Seek key comparison: \n" + << hist_seek_comparison.ToString() + << "Seek time: \n" + << hist_seek_time.ToString(); + + // test non-existing keys + HistogramImpl hist_no_seek_time; + HistogramImpl hist_no_seek_comparison; + + for (auto prefix = FLAGS_total_prefixes; + prefix < FLAGS_total_prefixes + 100; + prefix++) { + TestKey test_key(prefix, 0); + Slice key = TestKeyToSlice(test_key); + + if (FLAGS_use_prefix_hash_memtable) { + Slice key_prefix = options.prefix_extractor->Transform(key); + read_options.prefix = &key_prefix; + } + std::unique_ptr iter(db->NewIterator(read_options)); + + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->Seek(key); + hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(perf_context.user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); + } +} + +} + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + std::cout << kDbName << "\n"; + + rocksdb::test::RunAllTests(); + return 0; +} diff --git a/db/repair.cc b/db/repair.cc new file mode 100644 index 00000000..f3b95f5e --- /dev/null +++ b/db/repair.cc @@ -0,0 +1,389 @@ +// Copyright (c) 
2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// We recover the contents of the descriptor from the other files we find. +// (1) Any log files are first converted to tables +// (2) We scan every table to compute +// (a) smallest/largest for the table +// (b) largest sequence number in the table +// (3) We generate descriptor contents: +// - log number is set to zero +// - next-file-number is set to 1 + largest file number we found +// - last-sequence-number is set to largest sequence# found across +// all tables (see 2c) +// - compaction pointers are cleared +// - every table file is added at level 0 +// +// Possible optimization 1: +// (a) Compute total size and use to pick appropriate max-level M +// (b) Sort tables by largest sequence# in the table +// (c) For each table: if it overlaps earlier table, place in level-0, +// else place in level-M. +// Possible optimization 2: +// Store per-table metadata (smallest, largest, largest-seq#, ...) +// in the table's meta section to speed up ScanTable. 
+ +#include "db/builder.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "db/write_batch_internal.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +namespace { + +class Repairer { + public: + Repairer(const std::string& dbname, const Options& options) + : dbname_(dbname), + env_(options.env), + icmp_(options.comparator), + ipolicy_(options.filter_policy), + options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), + next_file_number_(1) { + // TableCache can be small since we expect each table to be opened once. + table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); + edit_ = new VersionEdit(); + } + + ~Repairer() { + delete table_cache_; + delete edit_; + } + + Status Run() { + Status status = FindFiles(); + if (status.ok()) { + ConvertLogFilesToTables(); + ExtractMetaData(); + status = WriteDescriptor(); + } + if (status.ok()) { + unsigned long long bytes = 0; + for (size_t i = 0; i < tables_.size(); i++) { + bytes += tables_[i].meta.file_size; + } + Log(options_.info_log, + "**** Repaired rocksdb %s; " + "recovered %d files; %llu bytes. " + "Some data may have been lost. 
" + "****", + dbname_.c_str(), + static_cast(tables_.size()), + bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + SequenceNumber min_sequence; + SequenceNumber max_sequence; + }; + + std::string const dbname_; + Env* const env_; + InternalKeyComparator const icmp_; + InternalFilterPolicy const ipolicy_; + Options const options_; + TableCache* table_cache_; + VersionEdit* edit_; + + std::vector manifests_; + std::vector table_numbers_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + const EnvOptions storage_options_; + + Status FindFiles() { + std::vector filenames; + Status status = env_->GetChildren(dbname_, &filenames); + if (!status.ok()) { + return status; + } + if (filenames.empty()) { + return Status::Corruption(dbname_, "repair found no files"); + } + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kLogFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_numbers_.push_back(number); + } else { + // Ignore other files + } + } + } + } + return status; + } + + void ConvertLogFilesToTables() { + for (size_t i = 0; i < logs_.size(); i++) { + std::string logname = LogFileName(dbname_, logs_[i]); + Status status = ConvertLogToTable(logs_[i]); + if (!status.ok()) { + Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", + (unsigned long long) logs_[i], + status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + std::shared_ptr info_log; + uint64_t lognum; + virtual void Corruption(size_t bytes, const Status& s) { + // We print error messages for corruption, but continue repairing. 
+ Log(info_log, "Log #%llu: dropping %d bytes; %s", + (unsigned long long) lognum, + static_cast(bytes), + s.ToString().c_str()); + } + }; + + // Open the log file + std::string logname = LogFileName(dbname_, log); + unique_ptr lfile; + Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); + if (!status.ok()) { + return status; + } + + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = options_.info_log; + reporter.lognum = log; + // We intentially make log::Reader do checksumming so that + // corruptions cause entire commits to be skipped instead of + // propagating bad information (like overly large sequence + // numbers). + log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/, + 0/*initial_offset*/); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + MemTable* mem = new MemTable(icmp_, options_); + mem->Ref(); + int counter = 0; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + status = WriteBatchInternal::InsertInto(&batch, mem, &options_); + if (status.ok()) { + counter += WriteBatchInternal::Count(&batch); + } else { + Log(options_.info_log, "Log #%llu: ignoring %s", + (unsigned long long) log, + status.ToString().c_str()); + status = Status::OK(); // Keep going with rest of file + } + } + + // Do not record a version edit for this conversion to a Table + // since ExtractMetaData() will also generate edits. 
+ FileMetaData meta; + meta.number = next_file_number_++; + Iterator* iter = mem->NewIterator(); + status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, + iter, &meta, icmp_, 0, 0, kNoCompression); + delete iter; + delete mem->Unref(); + mem = nullptr; + if (status.ok()) { + if (meta.file_size > 0) { + table_numbers_.push_back(meta.number); + } + } + Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", + (unsigned long long) log, + counter, + (unsigned long long) meta.number, + status.ToString().c_str()); + return status; + } + + void ExtractMetaData() { + for (size_t i = 0; i < table_numbers_.size(); i++) { + TableInfo t; + t.meta.number = table_numbers_[i]; + Status status = ScanTable(&t); + if (!status.ok()) { + std::string fname = TableFileName(dbname_, table_numbers_[i]); + Log(options_.info_log, "Table #%llu: ignoring %s", + (unsigned long long) table_numbers_[i], + status.ToString().c_str()); + ArchiveFile(fname); + } else { + tables_.push_back(t); + } + } + } + + Status ScanTable(TableInfo* t) { + std::string fname = TableFileName(dbname_, t->meta.number); + int counter = 0; + Status status = env_->GetFileSize(fname, &t->meta.file_size); + if (status.ok()) { + FileMetaData dummy_meta(t->meta.number, t->meta.file_size); + Iterator* iter = table_cache_->NewIterator( + ReadOptions(), storage_options_, icmp_, dummy_meta); + bool empty = true; + ParsedInternalKey parsed; + t->min_sequence = 0; + t->max_sequence = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (!ParseInternalKey(key, &parsed)) { + Log(options_.info_log, "Table #%llu: unparsable key %s", + (unsigned long long) t->meta.number, + EscapeString(key).c_str()); + continue; + } + + counter++; + if (empty) { + empty = false; + t->meta.smallest.DecodeFrom(key); + } + t->meta.largest.DecodeFrom(key); + if (parsed.sequence < t->min_sequence) { + t->min_sequence = parsed.sequence; + } + if (parsed.sequence > t->max_sequence) 
{ + t->max_sequence = parsed.sequence; + } + } + if (!iter->status().ok()) { + status = iter->status(); + } + delete iter; + } + Log(options_.info_log, "Table #%llu: %d entries %s", + (unsigned long long) t->meta.number, + counter, + status.ToString().c_str()); + return status; + } + + Status WriteDescriptor() { + std::string tmp = TempFileName(dbname_, 1); + unique_ptr file; + Status status = env_->NewWritableFile( + tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); + if (!status.ok()) { + return status; + } + + SequenceNumber max_sequence = 0; + for (size_t i = 0; i < tables_.size(); i++) { + if (max_sequence < tables_[i].max_sequence) { + max_sequence = tables_[i].max_sequence; + } + } + + edit_->SetComparatorName(icmp_.user_comparator()->Name()); + edit_->SetLogNumber(0); + edit_->SetNextFile(next_file_number_); + edit_->SetLastSequence(max_sequence); + + for (size_t i = 0; i < tables_.size(); i++) { + // TODO(opt): separate out into multiple levels + const TableInfo& t = tables_[i]; + edit_->AddFile(0, t.meta.number, t.meta.file_size, + t.meta.smallest, t.meta.largest, + t.min_sequence, t.max_sequence); + } + + //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); + { + log::Writer log(std::move(file)); + std::string record; + edit_->EncodeTo(&record); + status = log.AddRecord(record); + } + + if (!status.ok()) { + env_->DeleteFile(tmp); + } else { + // Discard older manifests + for (size_t i = 0; i < manifests_.size(); i++) { + ArchiveFile(dbname_ + "/" + manifests_[i]); + } + + // Install new manifest + status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); + if (status.ok()) { + status = SetCurrentFile(env_, dbname_, 1); + } else { + env_->DeleteFile(tmp); + } + } + return status; + } + + void ArchiveFile(const std::string& fname) { + // Move into another directory. 
E.g., for + // dir/foo + // rename to + // dir/lost/foo + const char* slash = strrchr(fname.c_str(), '/'); + std::string new_dir; + if (slash != nullptr) { + new_dir.assign(fname.data(), slash - fname.data()); + } + new_dir.append("/lost"); + env_->CreateDir(new_dir); // Ignore error + std::string new_file = new_dir; + new_file.append("/"); + new_file.append((slash == nullptr) ? fname.c_str() : slash + 1); + Status s = env_->RenameFile(fname, new_file); + Log(options_.info_log, "Archiving %s: %s\n", + fname.c_str(), s.ToString().c_str()); + } +}; +} // namespace + +Status RepairDB(const std::string& dbname, const Options& options) { + Repairer repairer(dbname, options); + return repairer.Run(); +} + +} // namespace rocksdb diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc new file mode 100644 index 00000000..a6711466 --- /dev/null +++ b/db/simple_table_db_test.cc @@ -0,0 +1,800 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_builder.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +// IS THIS FILE STILL NEEDED? +namespace rocksdb { + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. +// SimpleTable requires the input key size to be fixed 16 bytes, value cannot +// be longer than 150000 bytes and stored data on disk in this format: +// +--------------------------------------------+ <= key1 offset +// | key1 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | key2 | value_size (4 bytes) | | +// +----------------------------------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ <= index_block_offset +// | key1 | key1 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key2 | key2 offset (8 bytes) | +// +-----------------+--------------------------+ +// | key3 | key3 offset (8 bytes) | +// +-----------------+--------------------------+ +// | ...... | +// +-----------------+------------+-------------+ +// | index_block_offset (8 bytes) | +// +------------------------------+ + +// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. 
+class SimpleTableReader: public TableReader { +public: + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table" to the newly opened + // table. The client should delete "*table" when no longer needed. + // If there was an error while initializing the table, sets "*table" + // to nullptr and returns a non-ok status. Does not take ownership of + // "*source", but the client must ensure that "source" remains live + // for the duration of the returned table's lifetime. + // + // *file must remain live while this Table is in use. + static Status Open(const Options& options, const EnvOptions& soptions, + unique_ptr && file, uint64_t file_size, + unique_ptr* table_reader); + + bool PrefixMayMatch(const Slice& internal_prefix) override; + + Iterator* NewIterator(const ReadOptions&) override; + + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*handle_result)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override; + + void SetupForCompaction() override; + + std::shared_ptr GetTableProperties() const override; + + ~SimpleTableReader(); + +private: + struct Rep; + Rep* rep_; + + explicit SimpleTableReader(Rep* rep) { + rep_ = rep; + } + friend class TableCache; + friend class SimpleTableIterator; + + Status GetOffset(const Slice& target, uint64_t* offset); + + // No copying allowed + explicit SimpleTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; + +// Iterator to iterate SimpleTable +class SimpleTableIterator: public Iterator { +public: + explicit SimpleTableIterator(SimpleTableReader* table); + ~SimpleTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& 
target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + +private: + SimpleTableReader* table_; + uint64_t offset_; + uint64_t next_offset_; + Slice key_; + Slice value_; + char tmp_str_[4]; + char* key_str_; + char* value_str_; + int value_str_len_; + Status status_; + // No copying allowed + SimpleTableIterator(const SimpleTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +struct SimpleTableReader::Rep { + ~Rep() { + } + Rep(const EnvOptions& storage_options, uint64_t index_start_offset, + int num_entries) : + soptions(storage_options), index_start_offset(index_start_offset), + num_entries(num_entries) { + } + + Options options; + const EnvOptions& soptions; + Status status; + unique_ptr file; + uint64_t index_start_offset; + int num_entries; + std::shared_ptr table_properties; + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } +}; + +SimpleTableReader::~SimpleTableReader() { + delete rep_; +} + +Status SimpleTableReader::Open(const Options& options, + const EnvOptions& soptions, + unique_ptr && file, + uint64_t size, + unique_ptr* table_reader) { + char footer_space[Rep::offset_length]; + Slice footer_input; + Status s = file->Read(size - Rep::offset_length, Rep::offset_length, + &footer_input, footer_space); + if (s.ok()) { + uint64_t index_start_offset = DecodeFixed64(footer_space); + + int num_entries = (size - Rep::offset_length - index_start_offset) + / (Rep::GetInternalKeyLength() + Rep::offset_length); + SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, + index_start_offset, + num_entries); + + rep->file = std::move(file); + rep->options = options; + table_reader->reset(new SimpleTableReader(rep)); + } + return s; +} + +void SimpleTableReader::SetupForCompaction() { +} + +std::shared_ptr 
SimpleTableReader::GetTableProperties() + const { + return rep_->table_properties; +} + +bool SimpleTableReader::PrefixMayMatch(const Slice& internal_prefix) { + return true; +} + +Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) { + return new SimpleTableIterator(this); +} + +Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { + uint32_t left = 0; + uint32_t right = rep_->num_entries - 1; + char key_chars[Rep::GetInternalKeyLength()]; + Slice tmp_slice; + + uint32_t target_offset = 0; + while (left <= right) { + uint32_t mid = (left + right + 1) / 2; + + uint64_t offset_to_read = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; + Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), + &tmp_slice, key_chars); + if (!s.ok()) { + return s; + } + + InternalKeyComparator ikc(rep_->options.comparator); + int compare_result = ikc.Compare(tmp_slice, target); + + if (compare_result < 0) { + if (left == right) { + target_offset = right + 1; + break; + } + left = mid; + } else { + if (left == right) { + target_offset = left; + break; + } + right = mid - 1; + } + } + + if (target_offset >= (uint32_t) rep_->num_entries) { + *offset = rep_->index_start_offset; + return Status::OK(); + } + + char value_offset_chars[Rep::offset_length]; + + int64_t offset_for_value_offset = rep_->index_start_offset + + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset + + Rep::GetInternalKeyLength(); + Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, + &tmp_slice, value_offset_chars); + if (s.ok()) { + *offset = DecodeFixed64(value_offset_chars); + } + return s; +} + +Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + Status s; + SimpleTableIterator* iter = new SimpleTableIterator(this); + for (iter->Seek(k); 
iter->Valid(); iter->Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!(*saver)(arg, parsed_key, iter->value(), true)) { + break; + } + } + s = iter->status(); + delete iter; + return s; +} + +uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) : + table_(table) { + key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()]; + value_str_len_ = -1; + SeekToFirst(); +} + +SimpleTableIterator::~SimpleTableIterator() { + delete[] key_str_; + if (value_str_len_ >= 0) { + delete[] value_str_; + } +} + +bool SimpleTableIterator::Valid() const { + return offset_ < table_->rep_->index_start_offset; +} + +void SimpleTableIterator::SeekToFirst() { + next_offset_ = 0; + Next(); +} + +void SimpleTableIterator::SeekToLast() { + assert(false); +} + +void SimpleTableIterator::Seek(const Slice& target) { + Status s = table_->GetOffset(target, &next_offset_); + if (!s.ok()) { + status_ = s; + } + Next(); +} + +void SimpleTableIterator::Next() { + offset_ = next_offset_; + if (offset_ >= table_->rep_->index_start_offset) { + return; + } + Slice result; + int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength(); + + Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, + key_str_); + next_offset_ += internal_key_size; + key_ = result; + + Slice value_size_slice; + s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); + next_offset_ += 4; + uint32_t value_size = DecodeFixed32(tmp_str_); + + Slice value_slice; + if ((int) value_size > value_str_len_) { + if (value_str_len_ >= 0) { + delete[] value_str_; + } + value_str_ = new char[value_size]; + value_str_len_ = value_size; + } + s = table_->rep_->file->Read(next_offset_, value_size, &value_slice, + value_str_); + next_offset_ += value_size; + value_ = value_slice; +} + +void 
SimpleTableIterator::Prev() { + assert(false); +} + +Slice SimpleTableIterator::key() const { + Log(table_->rep_->options.info_log, "key!!!!"); + return key_; +} + +Slice SimpleTableIterator::value() const { + return value_; +} + +Status SimpleTableIterator::status() const { + return status_; +} + +class SimpleTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + SimpleTableBuilder(const Options& options, WritableFile* file, + CompressionType compression_type); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~SimpleTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. 
+ uint64_t FileSize() const override; + +private: + struct Rep; + Rep* rep_; + + // No copying allowed + SimpleTableBuilder(const SimpleTableBuilder&) = delete; + void operator=(const SimpleTableBuilder&) = delete; +}; + +struct SimpleTableBuilder::Rep { + Options options; + WritableFile* file; + uint64_t offset = 0; + Status status; + + uint64_t num_entries = 0; + + bool closed = false; // Either Finish() or Abandon() has been called. + + const static int user_key_size = 16; + const static int offset_length = 8; + const static int key_footer_len = 8; + + static int GetInternalKeyLength() { + return user_key_size + key_footer_len; + } + + std::string index; + + Rep(const Options& opt, WritableFile* f) : + options(opt), file(f) { + } + ~Rep() { + } +}; + +SimpleTableBuilder::SimpleTableBuilder(const Options& options, + WritableFile* file, + CompressionType compression_type) : + rep_(new SimpleTableBuilder::Rep(options, file)) { +} + +SimpleTableBuilder::~SimpleTableBuilder() { + delete (rep_); +} + +void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { + assert((int ) key.size() == Rep::GetInternalKeyLength()); + + // Update index + rep_->index.append(key.data(), key.size()); + PutFixed64(&(rep_->index), rep_->offset); + + // Write key-value pair + rep_->file->Append(key); + rep_->offset += Rep::GetInternalKeyLength(); + + std::string size; + int value_size = value.size(); + PutFixed32(&size, value_size); + Slice sizeSlice(size); + rep_->file->Append(sizeSlice); + rep_->file->Append(value); + rep_->offset += value_size + 4; + + rep_->num_entries++; +} + +Status SimpleTableBuilder::status() const { + return Status::OK(); +} + +Status SimpleTableBuilder::Finish() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; + + uint64_t index_offset = rep_->offset; + Slice index_slice(rep_->index); + rep_->file->Append(index_slice); + rep_->offset += index_slice.size(); + + std::string index_offset_str; + PutFixed64(&index_offset_str, index_offset); + 
Slice foot_slice(index_offset_str); + rep_->file->Append(foot_slice); + rep_->offset += foot_slice.size(); + + return Status::OK(); +} + +void SimpleTableBuilder::Abandon() { + rep_->closed = true; +} + +uint64_t SimpleTableBuilder::NumEntries() const { + return rep_->num_entries; +} + +uint64_t SimpleTableBuilder::FileSize() const { + return rep_->offset; +} + +class SimpleTableFactory: public TableFactory { +public: + ~SimpleTableFactory() { + } + SimpleTableFactory() { + } + const char* Name() const override { + return "SimpleTable"; + } + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const; + + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& internal_key, + WritableFile* file, + CompressionType compression_type) const; +}; + +Status SimpleTableFactory::NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const { + + return SimpleTableReader::Open(options, soptions, std::move(file), file_size, + table_reader); +} + +TableBuilder* SimpleTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_key, + WritableFile* file, CompressionType compression_type) const { + return new SimpleTableBuilder(options, file, compression_type); +} + +class SimpleTableDBTest { +protected: +public: + std::string dbname_; + Env* env_; + DB* db_; + + Options last_options_; + + SimpleTableDBTest() : + env_(Env::Default()) { + dbname_ = test::TmpDir() + "/simple_table_db_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~SimpleTableDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, Options())); + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + Options options; + options.table_factory.reset(new SimpleTableFactory()); + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(Options* options = nullptr) { + //Destroy using last options + Destroy(&last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(Options* options) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + } + + Status PureReopen(Options* options, DB** db) { + return DB::Open(*options, dbname_, db); + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + last_options_ = opts; + + return DB::Open(opts, dbname_, &db_); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + + int NumTableFilesAtLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST(SimpleTableDBTest, Empty) { + ASSERT_TRUE(db_ != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +TEST(SimpleTableDBTest, ReadWrite) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_EQ("v1", Get("0000000000000foo")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush) { + ASSERT_OK(Put("0000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("0000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); +} + +TEST(SimpleTableDBTest, Flush2) { + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("0000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("0000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("0000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); +} + +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, 
&r); + return r; +} + +TEST(SimpleTableDBTest, CompactionTrigger) { + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; //100KB + options.num_levels = 3; + options.max_mem_compaction_level = 0; + options.level0_file_num_compaction_trigger = 3; + Reopen(&options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + } + + //generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(RandomString(&rnd, 10000)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/skiplist.h b/db/skiplist.h new file mode 100644 index 00000000..751f7c3e --- /dev/null +++ b/db/skiplist.h @@ -0,0 +1,429 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread safety +// ------------- +// +// Writes require external synchronization, most likely a mutex. +// Reads require a guarantee that the SkipList will not be destroyed +// while the read is in progress. 
Apart from that, reads progress +// without any internal locking or synchronization. +// +// Invariants: +// +// (1) Allocated nodes are never deleted until the SkipList is +// destroyed. This is trivially guaranteed by the code since we +// never delete any skip list nodes. +// +// (2) The contents of a Node except for the next/prev pointers are +// immutable after the Node has been linked into the SkipList. +// Only Insert() modifies the list, and it is careful to initialize +// a node and use release-stores to publish the nodes in one or +// more lists. +// +// ... prev vs. next pointer ordering ... +// + +#pragma once +#include +#include +#include "util/arena.h" +#include "port/port.h" +#include "util/arena.h" +#include "util/random.h" + +namespace rocksdb { + +template +class SkipList { + private: + struct Node; + + public: + // Create a new SkipList object that will use "cmp" for comparing keys, + // and will allocate memory using "*arena". Objects allocated in the arena + // must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Arena* arena, + int32_t max_height = 12, int32_t branching_factor = 4); + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + void Insert(const Key& key); + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const Key& key) const; + + // Iteration over the contents of a skip list + class Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator(const SkipList* list); + + // Change the underlying skiplist used for this iterator + // This enables us not changing the iterator without deallocating + // an old one and then allocating a new one + void SetList(const SkipList* list); + + // Returns true iff the iterator is positioned at a valid node. + bool Valid() const; + + // Returns the key at the current position. 
+ // REQUIRES: Valid() + const Key& key() const; + + // Advances to the next position. + // REQUIRES: Valid() + void Next(); + + // Advances to the previous position. + // REQUIRES: Valid() + void Prev(); + + // Advance to the first entry with a key >= target + void Seek(const Key& target); + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToFirst(); + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + void SeekToLast(); + + private: + const SkipList* list_; + Node* node_; + // Intentionally copyable + }; + + private: + const int32_t kMaxHeight_; + const int32_t kBranching_; + + // Immutable after construction + Comparator const compare_; + Arena* const arena_; // Arena used for allocations of nodes + + Node* const head_; + + // Modified only by Insert(). Read racily by readers, but stale + // values are ok. + port::AtomicPointer max_height_; // Height of the entire list + + // Used for optimizing sequential insert patterns + Node** prev_; + int32_t prev_height_; + + inline int GetMaxHeight() const { + return static_cast( + reinterpret_cast(max_height_.NoBarrier_Load())); + } + + // Read/written only by Insert(). + Random rnd_; + + Node* NewNode(const Key& key, int height); + int RandomHeight(); + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + // Return true if key is greater than the data stored in "n" + bool KeyIsAfterNode(const Key& key, Node* n) const; + + // Return the earliest node that comes at or after key. + // Return nullptr if there is no such node. + // + // If prev is non-nullptr, fills prev[level] with pointer to previous + // node at "level" for every level in [0..max_height_-1]. + Node* FindGreaterOrEqual(const Key& key, Node** prev) const; + + // Return the latest node with a key < key. + // Return head_ if there is no such node. 
+ Node* FindLessThan(const Key& key) const; + + // Return the last node in the list. + // Return head_ if list is empty. + Node* FindLast() const; + + // No copying allowed + SkipList(const SkipList&); + void operator=(const SkipList&); +}; + +// Implementation details follow +template +struct SkipList::Node { + explicit Node(const Key& k) : key(k) { } + + Key const key; + + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next(int n) { + assert(n >= 0); + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast(next_[n].Acquire_Load()); + } + void SetNext(int n, Node* x) { + assert(n >= 0); + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_[n].Release_Store(x); + } + + // No-barrier variants that can be safely used in a few locations. + Node* NoBarrier_Next(int n) { + assert(n >= 0); + return reinterpret_cast(next_[n].NoBarrier_Load()); + } + void NoBarrier_SetNext(int n, Node* x) { + assert(n >= 0); + next_[n].NoBarrier_Store(x); + } + + private: + // Array of length equal to the node height. next_[0] is lowest level link. 
+ port::AtomicPointer next_[1]; +}; + +template +typename SkipList::Node* +SkipList::NewNode(const Key& key, int height) { + char* mem = arena_->AllocateAligned( + sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); + return new (mem) Node(key); +} + +template +inline SkipList::Iterator::Iterator(const SkipList* list) { + SetList(list); +} + +template +inline void SkipList::Iterator::SetList(const SkipList* list) { + list_ = list; + node_ = nullptr; +} + +template +inline bool SkipList::Iterator::Valid() const { + return node_ != nullptr; +} + +template +inline const Key& SkipList::Iterator::key() const { + assert(Valid()); + return node_->key; +} + +template +inline void SkipList::Iterator::Next() { + assert(Valid()); + node_ = node_->Next(0); +} + +template +inline void SkipList::Iterator::Prev() { + // Instead of using explicit "prev" links, we just search for the + // last node that falls before key. + assert(Valid()); + node_ = list_->FindLessThan(node_->key); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +inline void SkipList::Iterator::Seek(const Key& target) { + node_ = list_->FindGreaterOrEqual(target, nullptr); +} + +template +inline void SkipList::Iterator::SeekToFirst() { + node_ = list_->head_->Next(0); +} + +template +inline void SkipList::Iterator::SeekToLast() { + node_ = list_->FindLast(); + if (node_ == list_->head_) { + node_ = nullptr; + } +} + +template +int SkipList::RandomHeight() { + // Increase height with probability 1 in kBranching + int height = 1; + while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) { + height++; + } + assert(height > 0); + assert(height <= kMaxHeight_); + return height; +} + +template +bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); +} + +template +typename SkipList::Node* SkipList:: + FindGreaterOrEqual(const Key& key, Node** prev) const { + // Use prev as an 
optimization hint and fallback to slow path + if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) { + Node* x = prev[0]; + Node* next = x->Next(0); + if ((x == head_) || KeyIsAfterNode(key, x)) { + // Adjust all relevant insertion points to the previous entry + for (int i = 1; i < prev_height_; i++) { + prev[i] = x; + } + return next; + } + } + // Normal lookup + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. + assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, next)) { + // Keep searching in this list + x = next; + } else { + if (prev != nullptr) prev[level] = x; + if (level == 0) { + return next; + } else { + // Switch to next list + level--; + } + } + } +} + +template +typename SkipList::Node* +SkipList::FindLessThan(const Key& key) const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + assert(x == head_ || compare_(x->key, key) < 0); + Node* next = x->Next(level); + if (next == nullptr || compare_(next->key, key) >= 0) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +typename SkipList::Node* SkipList::FindLast() + const { + Node* x = head_; + int level = GetMaxHeight() - 1; + while (true) { + Node* next = x->Next(level); + if (next == nullptr) { + if (level == 0) { + return x; + } else { + // Switch to next list + level--; + } + } else { + x = next; + } + } +} + +template +SkipList::SkipList(const Comparator cmp, Arena* arena, + int32_t max_height, + int32_t branching_factor) + : kMaxHeight_(max_height), + kBranching_(branching_factor), + compare_(cmp), + arena_(arena), + head_(NewNode(0 /* any key will do */, max_height)), + max_height_(reinterpret_cast(1)), + prev_height_(1), + rnd_(0xdeadbeef) { + assert(kMaxHeight_ > 0); + 
assert(kBranching_ > 0); + // Allocate the prev_ Node* array, directly from the passed-in arena. + // prev_ does not need to be freed, as its life cycle is tied up with + // the arena as a whole. + prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_); + for (int i = 0; i < kMaxHeight_; i++) { + head_->SetNext(i, nullptr); + prev_[i] = head_; + } +} + +template +void SkipList::Insert(const Key& key) { + // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() + // here since Insert() is externally synchronized. + Node* x = FindGreaterOrEqual(key, prev_); + + // Our data structure does not allow duplicate insertion + assert(x == nullptr || !Equal(key, x->key)); + + int height = RandomHeight(); + if (height > GetMaxHeight()) { + for (int i = GetMaxHeight(); i < height; i++) { + prev_[i] = head_; + } + //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); + + // It is ok to mutate max_height_ without any synchronization + // with concurrent readers. A concurrent reader that observes + // the new value of max_height_ will see either the old value of + // new level pointers from head_ (nullptr), or a new value set in + // the loop below. In the former case the reader will + // immediately drop to the next level since nullptr sorts after all + // keys. In the latter case the reader will use the new node. + max_height_.NoBarrier_Store(reinterpret_cast(height)); + } + + x = NewNode(key, height); + for (int i = 0; i < height; i++) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. 
+ x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i)); + prev_[i]->SetNext(i, x); + } + prev_[0] = x; + prev_height_ = height; +} + +template +bool SkipList::Contains(const Key& key) const { + Node* x = FindGreaterOrEqual(key, nullptr); + if (x != nullptr && Equal(key, x->key)) { + return true; + } else { + return false; + } +} + +} // namespace rocksdb diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc new file mode 100644 index 00000000..b87ddcbb --- /dev/null +++ b/db/skiplist_test.cc @@ -0,0 +1,383 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/skiplist.h" +#include +#include "rocksdb/env.h" +#include "util/arena.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +typedef uint64_t Key; + +struct TestComparator { + int operator()(const Key& a, const Key& b) const { + if (a < b) { + return -1; + } else if (a > b) { + return +1; + } else { + return 0; + } + } +}; + +class SkipTest { }; + +TEST(SkipTest, Empty) { + Arena arena; + TestComparator cmp; + SkipList list(cmp, &arena); + ASSERT_TRUE(!list.Contains(10)); + + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToFirst(); + ASSERT_TRUE(!iter.Valid()); + iter.Seek(100); + ASSERT_TRUE(!iter.Valid()); + iter.SeekToLast(); + ASSERT_TRUE(!iter.Valid()); +} + +TEST(SkipTest, InsertAndLookup) { + const int N = 2000; + const int R = 5000; + Random rnd(1000); + std::set keys; + Arena arena; + TestComparator cmp; + SkipList list(cmp, &arena); + for (int i = 0; i < N; i++) { + Key key = rnd.Next() % R; + if (keys.insert(key).second) { + list.Insert(key); + } + } + + for (int i = 0; i < R; i++) { + if (list.Contains(i)) { + ASSERT_EQ(keys.count(i), 1U); + } else { + ASSERT_EQ(keys.count(i), 0U); + } + } + + // Simple iterator tests + { + SkipList::Iterator iter(&list); + ASSERT_TRUE(!iter.Valid()); + + iter.Seek(0); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToFirst(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.begin()), iter.key()); + + iter.SeekToLast(); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.rbegin()), iter.key()); + } + + // Forward iteration test + for (int i = 0; i < R; i++) { + SkipList::Iterator iter(&list); + iter.Seek(i); + + // Compare against model iterator + std::set::iterator model_iter = keys.lower_bound(i); + for (int j = 0; j < 3; j++) { + if (model_iter == keys.end()) { + ASSERT_TRUE(!iter.Valid()); + break; + } else { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + 
++model_iter; + iter.Next(); + } + } + } + + // Backward iteration test + { + SkipList::Iterator iter(&list); + iter.SeekToLast(); + + // Compare against model iterator + for (std::set::reverse_iterator model_iter = keys.rbegin(); + model_iter != keys.rend(); + ++model_iter) { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*model_iter, iter.key()); + iter.Prev(); + } + ASSERT_TRUE(!iter.Valid()); + } +} + +// We want to make sure that with a single writer and multiple +// concurrent readers (with no synchronization other than when a +// reader's iterator is created), the reader always observes all the +// data that was present in the skip list when the iterator was +// constructor. Because insertions are happening concurrently, we may +// also observe new values that were inserted since the iterator was +// constructed, but we should never miss any values that were present +// at iterator construction time. +// +// We generate multi-part keys: +// +// where: +// key is in range [0..K-1] +// gen is a generation number for key +// hash is hash(key,gen) +// +// The insertion code picks a random key, sets gen to be 1 + the last +// generation number inserted for that key, and sets hash to Hash(key,gen). +// +// At the beginning of a read, we snapshot the last inserted +// generation number for each key. We then iterate, including random +// calls to Next() and Seek(). For every key we encounter, we +// check that it is either expected given the initial snapshot or has +// been concurrently added since the iterator started. 
+class ConcurrentTest { + private: + static const uint32_t K = 4; + + static uint64_t key(Key key) { return (key >> 40); } + static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } + static uint64_t hash(Key key) { return key & 0xff; } + + static uint64_t HashNumbers(uint64_t k, uint64_t g) { + uint64_t data[2] = { k, g }; + return Hash(reinterpret_cast(data), sizeof(data), 0); + } + + static Key MakeKey(uint64_t k, uint64_t g) { + assert(sizeof(Key) == sizeof(uint64_t)); + assert(k <= K); // We sometimes pass K to seek to the end of the skiplist + assert(g <= 0xffffffffu); + return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); + } + + static bool IsValidKey(Key k) { + return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); + } + + static Key RandomTarget(Random* rnd) { + switch (rnd->Next() % 10) { + case 0: + // Seek to beginning + return MakeKey(0, 0); + case 1: + // Seek to end + return MakeKey(K, 0); + default: + // Seek to middle + return MakeKey(rnd->Next() % K, 0); + } + } + + // Per-key generation + struct State { + port::AtomicPointer generation[K]; + void Set(int k, intptr_t v) { + generation[k].Release_Store(reinterpret_cast(v)); + } + intptr_t Get(int k) { + return reinterpret_cast(generation[k].Acquire_Load()); + } + + State() { + for (unsigned int k = 0; k < K; k++) { + Set(k, 0); + } + } + }; + + // Current state of the test + State current_; + + Arena arena_; + + // SkipList is not protected by mu_. We just use a single writer + // thread to modify it. + SkipList list_; + + public: + ConcurrentTest() : list_(TestComparator(), &arena_) {} + + // REQUIRES: External synchronization + void WriteStep(Random* rnd) { + const uint32_t k = rnd->Next() % K; + const intptr_t g = current_.Get(k) + 1; + const Key key = MakeKey(k, g); + list_.Insert(key); + current_.Set(k, g); + } + + void ReadStep(Random* rnd) { + // Remember the initial committed state of the skiplist. 
+ State initial_state; + for (unsigned int k = 0; k < K; k++) { + initial_state.Set(k, current_.Get(k)); + } + + Key pos = RandomTarget(rnd); + SkipList::Iterator iter(&list_); + iter.Seek(pos); + while (true) { + Key current; + if (!iter.Valid()) { + current = MakeKey(K, 0); + } else { + current = iter.key(); + ASSERT_TRUE(IsValidKey(current)) << current; + } + ASSERT_LE(pos, current) << "should not go backwards"; + + // Verify that everything in [pos,current) was not present in + // initial_state. + while (pos < current) { + ASSERT_LT(key(pos), K) << pos; + + // Note that generation 0 is never inserted, so it is ok if + // <*,0,*> is missing. + ASSERT_TRUE((gen(pos) == 0U) || + (gen(pos) > (uint64_t)initial_state.Get(key(pos))) + ) << "key: " << key(pos) + << "; gen: " << gen(pos) + << "; initgen: " + << initial_state.Get(key(pos)); + + // Advance to next key in the valid key space + if (key(pos) < key(current)) { + pos = MakeKey(key(pos) + 1, 0); + } else { + pos = MakeKey(key(pos), gen(pos) + 1); + } + } + + if (!iter.Valid()) { + break; + } + + if (rnd->Next() % 2) { + iter.Next(); + pos = MakeKey(key(pos), gen(pos) + 1); + } else { + Key new_target = RandomTarget(rnd); + if (new_target > pos) { + pos = new_target; + iter.Seek(new_target); + } + } + } + } +}; +const uint32_t ConcurrentTest::K; + +// Simple test that does single-threaded testing of the ConcurrentTest +// scaffolding. 
+TEST(SkipTest, ConcurrentWithoutThreads) { + ConcurrentTest test; + Random rnd(test::RandomSeed()); + for (int i = 0; i < 10000; i++) { + test.ReadStep(&rnd); + test.WriteStep(&rnd); + } +} + +class TestState { + public: + ConcurrentTest t_; + int seed_; + port::AtomicPointer quit_flag_; + + enum ReaderState { + STARTING, + RUNNING, + DONE + }; + + explicit TestState(int s) + : seed_(s), + quit_flag_(nullptr), + state_(STARTING), + state_cv_(&mu_) {} + + void Wait(ReaderState s) { + mu_.Lock(); + while (state_ != s) { + state_cv_.Wait(); + } + mu_.Unlock(); + } + + void Change(ReaderState s) { + mu_.Lock(); + state_ = s; + state_cv_.Signal(); + mu_.Unlock(); + } + + private: + port::Mutex mu_; + ReaderState state_; + port::CondVar state_cv_; +}; + +static void ConcurrentReader(void* arg) { + TestState* state = reinterpret_cast(arg); + Random rnd(state->seed_); + int64_t reads = 0; + state->Change(TestState::RUNNING); + while (!state->quit_flag_.Acquire_Load()) { + state->t_.ReadStep(&rnd); + ++reads; + } + state->Change(TestState::DONE); +} + +static void RunConcurrent(int run) { + const int seed = test::RandomSeed() + (run * 100); + Random rnd(seed); + const int N = 1000; + const int kSize = 1000; + for (int i = 0; i < N; i++) { + if ((i % 100) == 0) { + fprintf(stderr, "Run %d of %d\n", i, N); + } + TestState state(seed + 1); + Env::Default()->Schedule(ConcurrentReader, &state); + state.Wait(TestState::RUNNING); + for (int i = 0; i < kSize; i++) { + state.t_.WriteStep(&rnd); + } + state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do + state.Wait(TestState::DONE); + } +} + +TEST(SkipTest, Concurrent1) { RunConcurrent(1); } +TEST(SkipTest, Concurrent2) { RunConcurrent(2); } +TEST(SkipTest, Concurrent3) { RunConcurrent(3); } +TEST(SkipTest, Concurrent4) { RunConcurrent(4); } +TEST(SkipTest, Concurrent5) { RunConcurrent(5); } + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git 
a/db/snapshot.h b/db/snapshot.h new file mode 100644 index 00000000..2c2e3eac --- /dev/null +++ b/db/snapshot.h @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/db.h" + +namespace rocksdb { + +class SnapshotList; + +// Snapshots are kept in a doubly-linked list in the DB. +// Each SnapshotImpl corresponds to a particular sequence number. +class SnapshotImpl : public Snapshot { + public: + SequenceNumber number_; // const after creation + + private: + friend class SnapshotList; + + // SnapshotImpl is kept in a doubly-linked circular list + SnapshotImpl* prev_; + SnapshotImpl* next_; + + SnapshotList* list_; // just for sanity checks +}; + +class SnapshotList { + public: + SnapshotList() { + list_.prev_ = &list_; + list_.next_ = &list_; + list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging + } + + bool empty() const { return list_.next_ == &list_; } + SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } + SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } + + const SnapshotImpl* New(SequenceNumber seq) { + SnapshotImpl* s = new SnapshotImpl; + s->number_ = seq; + s->list_ = this; + s->next_ = &list_; + s->prev_ = list_.prev_; + s->prev_->next_ = s; + s->next_->prev_ = s; + return s; + } + + void Delete(const SnapshotImpl* s) { + assert(s->list_ == this); + s->prev_->next_ = s->next_; + s->next_->prev_ = s->prev_; + delete s; + } + + // retrieve all snapshot numbers. They are sorted in ascending order. 
+ void getAll(std::vector& ret) { + if (empty()) return; + SnapshotImpl* s = &list_; + while (s->next_ != &list_) { + ret.push_back(s->next_->number_); + s = s ->next_; + } + } + + // get the sequence number of the most recent snapshot + const SequenceNumber GetNewest() { + if (empty()) { + return 0; + } + return newest()->number_; + } + + private: + // Dummy head of doubly-linked list of snapshots + SnapshotImpl list_; +}; + +} // namespace rocksdb diff --git a/db/table_cache.cc b/db/table_cache.cc new file mode 100644 index 00000000..dc7c60e3 --- /dev/null +++ b/db/table_cache.cc @@ -0,0 +1,229 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/table_cache.h" + +#include "db/filename.h" +#include "db/version_edit.h" + +#include "rocksdb/statistics.h" +#include "table/table_reader.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +static void DeleteEntry(const Slice& key, void* value) { + TableReader* table_reader = reinterpret_cast(value); + delete table_reader; +} + +static void UnrefEntry(void* arg1, void* arg2) { + Cache* cache = reinterpret_cast(arg1); + Cache::Handle* h = reinterpret_cast(arg2); + cache->Release(h); +} + +static Slice GetSliceForFileNumber(uint64_t* file_number) { + return Slice(reinterpret_cast(file_number), + sizeof(*file_number)); +} + +TableCache::TableCache(const std::string& dbname, + const Options* options, + const EnvOptions& storage_options, + int entries) + : env_(options->env), + dbname_(dbname), + options_(options), + storage_options_(storage_options), + cache_( + NewLRUCache(entries, options->table_cache_numshardbits, + options->table_cache_remove_scan_count_limit)) { +} + +TableCache::~TableCache() { +} + +TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) { + return reinterpret_cast(cache_->Value(handle)); +} + +void TableCache::ReleaseHandle(Cache::Handle* handle) { + cache_->Release(handle); +} + +Status TableCache::FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, + Cache::Handle** handle, bool* table_io, + const bool no_io) { + Status s; + Slice key = GetSliceForFileNumber(&file_number); + *handle = cache_->Lookup(key); + if (*handle == nullptr) { + if (no_io) { // Dont do IO and return a not-found status + return Status::Incomplete("Table not found in table_cache, no_io is set"); + } + if (table_io != nullptr) { + *table_io = true; // we had to do IO from storage + } + std::string fname = TableFileName(dbname_, file_number); + unique_ptr file; + unique_ptr table_reader; + s = env_->NewRandomAccessFile(fname, 
&file, toptions); + RecordTick(options_->statistics.get(), NO_FILE_OPENS); + if (s.ok()) { + if (options_->advise_random_on_open) { + file->Hint(RandomAccessFile::RANDOM); + } + StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); + s = options_->table_factory->NewTableReader( + *options_, toptions, internal_comparator, std::move(file), file_size, + &table_reader); + } + + if (!s.ok()) { + assert(table_reader == nullptr); + RecordTick(options_->statistics.get(), NO_FILE_ERRORS); + // We do not cache error results so that if the error is transient, + // or somebody repairs the file, we recover automatically. + } else { + assert(file.get() == nullptr); + *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry); + } + } + return s; +} + +Iterator* TableCache::NewIterator(const ReadOptions& options, + const EnvOptions& toptions, + const InternalKeyComparator& icomparator, + const FileMetaData& file_meta, + TableReader** table_reader_ptr, + bool for_compaction) { + if (table_reader_ptr != nullptr) { + *table_reader_ptr = nullptr; + } + TableReader* table_reader = file_meta.table_reader; + Cache::Handle* handle = nullptr; + Status s; + if (table_reader == nullptr) { + s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size, + &handle, nullptr, options.read_tier == kBlockCacheTier); + if (!s.ok()) { + return NewErrorIterator(s); + } + table_reader = GetTableReaderFromHandle(handle); + } + + Iterator* result = table_reader->NewIterator(options); + if (handle != nullptr) { + result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); + } + if (table_reader_ptr != nullptr) { + *table_reader_ptr = table_reader; + } + + if (for_compaction) { + table_reader->SetupForCompaction(); + } + + return result; +} + +Status TableCache::Get(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const 
Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*)) { + TableReader* t = file_meta.table_reader; + Status s; + Cache::Handle* handle = nullptr; + if (!t) { + s = FindTable(storage_options_, internal_comparator, file_meta.number, + file_meta.file_size, &handle, table_io, + options.read_tier == kBlockCacheTier); + if (s.ok()) { + t = GetTableReaderFromHandle(handle); + } + } + if (s.ok()) { + s = t->Get(options, k, arg, saver, mark_key_may_exist); + if (handle != nullptr) { + ReleaseHandle(handle); + } + } else if (options.read_tier && s.IsIncomplete()) { + // Couldnt find Table in cache but treat as kFound if no_io set + (*mark_key_may_exist)(arg); + return Status::OK(); + } + return s; +} +Status TableCache::GetTableProperties( + const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr* properties, bool no_io) { + Status s; + auto table_reader = file_meta.table_reader; + // table already been pre-loaded? 
+ if (table_reader) { + *properties = table_reader->GetTableProperties(); + + return s; + } + + bool table_io; + Cache::Handle* table_handle = nullptr; + s = FindTable(toptions, internal_comparator, file_meta.number, + file_meta.file_size, &table_handle, &table_io, no_io); + if (!s.ok()) { + return s; + } + assert(table_handle); + auto table = GetTableReaderFromHandle(table_handle); + *properties = table->GetTableProperties(); + ReleaseHandle(table_handle); + return s; +} + +bool TableCache::PrefixMayMatch(const ReadOptions& options, + const InternalKeyComparator& icomparator, + const FileMetaData& file_meta, + const Slice& internal_prefix, bool* table_io) { + bool may_match = true; + auto table_reader = file_meta.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + // Need to get table handle from file number + Status s = FindTable(storage_options_, icomparator, file_meta.number, + file_meta.file_size, &table_handle, table_io); + if (!s.ok()) { + return may_match; + } + table_reader = GetTableReaderFromHandle(table_handle); + } + + may_match = table_reader->PrefixMayMatch(internal_prefix); + + if (table_handle != nullptr) { + // Need to release handle if it is generated from here. + ReleaseHandle(table_handle); + } + + return may_match; +} + +void TableCache::Evict(uint64_t file_number) { + cache_->Erase(GetSliceForFileNumber(&file_number)); +} + +} // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h new file mode 100644 index 00000000..5f1c29ea --- /dev/null +++ b/db/table_cache.h @@ -0,0 +1,101 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Thread-safe (provides internal synchronization) + +#pragma once +#include +#include + +#include "db/dbformat.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "table/table_reader.h" + +namespace rocksdb { + +class Env; +struct FileMetaData; + +// TODO(sdong): try to come up with a better API to pass the file information +// other than simply passing FileMetaData. +class TableCache { + public: + TableCache(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, int entries); + ~TableCache(); + + // Return an iterator for the specified file number (the corresponding + // file length must be exactly "file_size" bytes). If "tableptr" is + // non-nullptr, also sets "*tableptr" to point to the Table object + // underlying the returned iterator, or nullptr if no Table object underlies + // the returned iterator. The returned "*tableptr" object is owned by + // the cache and should not be deleted, and is valid for as long as the + // returned iterator is live. + Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + TableReader** table_reader_ptr = nullptr, + bool for_compaction = false); + + // If a seek to internal key "k" in specified file finds an entry, + // call (*handle_result)(arg, found_key, found_value) repeatedly until + // it returns false. 
+ Status Get(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, void* arg, + bool (*handle_result)(void*, const ParsedInternalKey&, + const Slice&, bool), + bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); + + // Determine whether the table may contain the specified prefix. If + // the table index or blooms are not in memory, this may cause an I/O + bool PrefixMayMatch(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + const Slice& internal_prefix, bool* table_io); + + // Evict any entry for the specified file number + void Evict(uint64_t file_number); + + // Find table reader + Status FindTable(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_number, uint64_t file_size, Cache::Handle**, + bool* table_io = nullptr, const bool no_io = false); + + // Get TableReader from a cache handle. + TableReader* GetTableReaderFromHandle(Cache::Handle* handle); + + // Get the table properties of a given table. + // @no_io: indicates if we should load table to the cache if it is not present + // in table cache yet. + // @returns: `properties` will be reset on success. Please note that we will + // return Status::Incomplete() if table is not present in cache and + // we set `no_io` to be true. 
+ Status GetTableProperties(const EnvOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr* properties, + bool no_io = false); + + // Release the handle from a cache + void ReleaseHandle(Cache::Handle* handle); + + private: + Env* const env_; + const std::string dbname_; + const Options* options_; + const EnvOptions& storage_options_; + std::shared_ptr cache_; +}; + +} // namespace rocksdb diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc new file mode 100644 index 00000000..25bd7003 --- /dev/null +++ b/db/table_properties_collector.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/table_properties_collector.h" + +#include "db/dbformat.h" +#include "util/coding.h" + +namespace rocksdb { + +Status InternalKeyPropertiesCollector::Add( + const Slice& key, const Slice& value) { + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + return Status::InvalidArgument("Invalid internal key"); + } + + if (ikey.type == ValueType::kTypeDeletion) { + ++deleted_keys_; + } + + return Status::OK(); +} + +Status InternalKeyPropertiesCollector::Finish( + UserCollectedProperties* properties) { + assert(properties); + assert(properties->find( + InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); + std::string val; + + PutVarint64(&val, deleted_keys_); + properties->insert({ InternalKeyTablePropertiesNames::kDeletedKeys, val }); + + return Status::OK(); +} + +UserCollectedProperties +InternalKeyPropertiesCollector::GetReadableProperties() const { + return { + { "kDeletedKeys", std::to_string(deleted_keys_) } + }; +} + + +Status UserKeyTablePropertiesCollector::Add( + const Slice& key, 
const Slice& value) { + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + return Status::InvalidArgument("Invalid internal key"); + } + + return collector_->Add(ikey.user_key, value); +} + +Status UserKeyTablePropertiesCollector::Finish( + UserCollectedProperties* properties) { + return collector_->Finish(properties); +} + +UserCollectedProperties +UserKeyTablePropertiesCollector::GetReadableProperties() const { + return collector_->GetReadableProperties(); +} + + +const std::string InternalKeyTablePropertiesNames::kDeletedKeys + = "rocksdb.deleted.keys"; + +uint64_t GetDeletedKeys( + const UserCollectedProperties& props) { + auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys); + if (pos == props.end()) { + return 0; + } + Slice raw = pos->second; + uint64_t val = 0; + return GetVarint64(&raw, &val) ? val : 0; +} + +} // namespace rocksdb diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h new file mode 100644 index 00000000..6cf56291 --- /dev/null +++ b/db/table_properties_collector.h @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines a collection of statistics collectors. +#pragma once + +#include "rocksdb/table_properties.h" + +#include +#include +#include + +namespace rocksdb { + +struct InternalKeyTablePropertiesNames { + static const std::string kDeletedKeys; +}; + +// Collecting the statistics for internal keys. Visible only by internal +// rocksdb modules. 
+class InternalKeyPropertiesCollector : public TablePropertiesCollector { + public: + virtual Status Add(const Slice& key, const Slice& value) override; + + virtual Status Finish(UserCollectedProperties* properties) override; + + virtual const char* Name() const override { + return "InternalKeyPropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override; + + private: + uint64_t deleted_keys_ = 0; +}; + +// When rocksdb creates a new table, it will encode all "user keys" into +// "internal keys", which contains meta information of a given entry. +// +// This class extracts user key from the encoded internal key when Add() is +// invoked. +class UserKeyTablePropertiesCollector : public TablePropertiesCollector { + public: + explicit UserKeyTablePropertiesCollector( + TablePropertiesCollector* collector) : + UserKeyTablePropertiesCollector( + std::shared_ptr(collector) + ) { + } + + explicit UserKeyTablePropertiesCollector( + std::shared_ptr collector) : + collector_(collector) { + } + + virtual ~UserKeyTablePropertiesCollector() { } + + virtual Status Add(const Slice& key, const Slice& value) override; + + virtual Status Finish(UserCollectedProperties* properties) override; + + virtual const char* Name() const override { return collector_->Name(); } + + UserCollectedProperties GetReadableProperties() const override; + + protected: + std::shared_ptr collector_; +}; + +} // namespace rocksdb diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc new file mode 100644 index 00000000..a9f770ca --- /dev/null +++ b/db/table_properties_collector_test.cc @@ -0,0 +1,306 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include +#include + +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/table_properties_collector.h" +#include "rocksdb/table.h" +#include "table/block_based_table_factory.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/table_builder.h" +#include "util/coding.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class TablePropertiesTest { +}; + +// TODO(kailiu) the following classes should be moved to some more general +// places, so that other tests can also make use of them. +// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system +// and therefore enable us to quickly setup the tests. +class FakeWritableFile : public WritableFile { + public: + ~FakeWritableFile() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class FakeRandomeAccessFile : public RandomAccessFile { + public: + explicit FakeRandomeAccessFile(const Slice& contents) + : contents_(contents.data(), contents.size()) { + } + + virtual ~FakeRandomeAccessFile() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class DumbLogger : public Logger { + public: + virtual void Logv(const char* format, va_list ap) { } + virtual size_t 
GetLogFileSize() const { return 0; } +}; + +// Utilities test functions +void MakeBuilder(const Options& options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr* writable, + std::unique_ptr* builder) { + writable->reset(new FakeWritableFile); + builder->reset(options.table_factory->NewTableBuilder( + options, internal_comparator, writable->get(), options.compression)); +} + +// Collects keys that starts with "A" in a table. +class RegularKeysStartWithA: public TablePropertiesCollector { + public: + const char* Name() const { return "RegularKeysStartWithA"; } + + Status Finish(UserCollectedProperties* properties) { + std::string encoded; + PutVarint32(&encoded, count_); + *properties = UserCollectedProperties { + { "TablePropertiesTest", "Rocksdb" }, + { "Count", encoded } + }; + return Status::OK(); + } + + Status Add(const Slice& user_key, const Slice& value) { + // simply asssume all user keys are not empty. + if (user_key.data()[0] == 'A') { + ++count_; + } + return Status::OK(); + } + + virtual UserCollectedProperties GetReadableProperties() const { + return UserCollectedProperties{}; + } + + + private: + uint32_t count_ = 0; +}; + +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; +void TestCustomizedTablePropertiesCollector( + uint64_t magic_number, bool encode_as_internal, const Options& options, + const InternalKeyComparator& internal_comparator) { + // make sure the entries will be inserted with order. 
+ std::map kvs = { + {"About ", "val5"}, // starts with 'A' + {"Abstract", "val2"}, // starts with 'A' + {"Around ", "val7"}, // starts with 'A' + {"Beyond ", "val3"}, + {"Builder ", "val1"}, + {"Cancel ", "val4"}, + {"Find ", "val6"}, + }; + + // -- Step 1: build table + std::unique_ptr builder; + std::unique_ptr writable; + MakeBuilder(options, internal_comparator, &writable, &builder); + + for (const auto& kv : kvs) { + if (encode_as_internal) { + InternalKey ikey(kv.first, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), kv.second); + } else { + builder->Add(kv.first, kv.second); + } + } + ASSERT_OK(builder->Finish()); + + // -- Step 2: Read properties + FakeRandomeAccessFile readable(writable->contents()); + TableProperties* props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + std::unique_ptr props_guard(props); + ASSERT_OK(s); + + auto user_collected = props->user_collected_properties; + + ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest")); + + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(3u, starts_with_A); +} + +TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) { + // Test properties collectors with internal keys or regular keys + // for block based table + for (bool encode_as_internal : { true, false }) { + Options options; + auto collector = new RegularKeysStartWithA(); + if (encode_as_internal) { + options.table_properties_collectors = { + std::make_shared(collector) + }; + } else { + options.table_properties_collectors.resize(1); + options.table_properties_collectors[0].reset(collector); + } + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber, + encode_as_internal, options, ikc); + } + + // test plain table + Options options; + 
options.table_properties_collectors.push_back( + std::make_shared() + ); + options.table_factory = std::make_shared(8, 8, 0); + test::PlainInternalKeyComparator ikc(options.comparator); + TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options, + ikc); +} + +void TestInternalKeyPropertiesCollector( + uint64_t magic_number, + bool sanitized, + std::shared_ptr table_factory) { + InternalKey keys[] = { + InternalKey("A ", 0, ValueType::kTypeValue), + InternalKey("B ", 0, ValueType::kTypeValue), + InternalKey("C ", 0, ValueType::kTypeValue), + InternalKey("W ", 0, ValueType::kTypeDeletion), + InternalKey("X ", 0, ValueType::kTypeDeletion), + InternalKey("Y ", 0, ValueType::kTypeDeletion), + InternalKey("Z ", 0, ValueType::kTypeDeletion), + }; + + std::unique_ptr builder; + std::unique_ptr writable; + Options options; + test::PlainInternalKeyComparator pikc(options.comparator); + + options.table_factory = table_factory; + if (sanitized) { + options.table_properties_collectors = { + std::make_shared() + }; + // with sanitization, even regular properties collector will be able to + // handle internal keys. + auto comparator = options.comparator; + // HACK: Set options.info_log to avoid writing log in + // SanitizeOptions(). 
+ options.info_log = std::make_shared(); + options = SanitizeOptions("db", // just a place holder + &pikc, nullptr, // don't care filter policy + options); + options.comparator = comparator; + } else { + options.table_properties_collectors = { + std::make_shared() + }; + } + + MakeBuilder(options, pikc, &writable, &builder); + for (const auto& k : keys) { + builder->Add(k.Encode(), "val"); + } + + ASSERT_OK(builder->Finish()); + + FakeRandomeAccessFile readable(writable->contents()); + TableProperties* props; + Status s = ReadTableProperties( + &readable, + writable->contents().size(), + magic_number, + Env::Default(), + nullptr, + &props + ); + ASSERT_OK(s); + + std::unique_ptr props_guard(props); + auto user_collected = props->user_collected_properties; + uint64_t deleted = GetDeletedKeys(user_collected); + ASSERT_EQ(4u, deleted); + + if (sanitized) { + uint32_t starts_with_A = 0; + Slice key(user_collected.at("Count")); + ASSERT_TRUE(GetVarint32(&key, &starts_with_A)); + ASSERT_EQ(1u, starts_with_A); + } +} + +TEST(TablePropertiesTest, InternalKeyPropertiesCollector) { + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kBlockBasedTableMagicNumber, + true /* not sanitize */, + std::make_shared() + ); + TestInternalKeyPropertiesCollector( + kPlainTableMagicNumber, + false /* not sanitize */, + std::make_shared(8, 8, 0) + ); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/tailing_iter.cc b/db/tailing_iter.cc new file mode 100644 index 00000000..7264b43a --- /dev/null +++ b/db/tailing_iter.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/tailing_iter.h" + +#include +#include +#include "db/db_impl.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +TailingIterator::TailingIterator(DBImpl* db, const ReadOptions& options, + const Comparator* comparator) + : db_(db), options_(options), comparator_(comparator), + version_number_(0), current_(nullptr), + status_(Status::InvalidArgument("Seek() not called on this iterator")) {} + +bool TailingIterator::Valid() const { + return current_ != nullptr; +} + +void TailingIterator::SeekToFirst() { + if (!IsCurrentVersion()) { + CreateIterators(); + } + + mutable_->SeekToFirst(); + immutable_->SeekToFirst(); + UpdateCurrent(); +} + +void TailingIterator::Seek(const Slice& target) { + if (!IsCurrentVersion()) { + CreateIterators(); + } + + mutable_->Seek(target); + + // We maintain the interval (prev_key_, immutable_->key()] such that there + // are no records with keys within that range in immutable_ other than + // immutable_->key(). Since immutable_ can't change in this version, we don't + // need to do a seek if 'target' belongs to that interval (i.e. immutable_ is + // already at the correct position)! + // + // If options.prefix_seek is used and immutable_ is not valid, seek if target + // has a different prefix than prev_key. + // + // prev_key_ is updated by Next(). SeekImmutable() sets prev_key_ to + // 'target' -- in this case, prev_key_ is included in the interval, so + // prev_inclusive_ has to be set. 
+ + if (!is_prev_set_ || + comparator_->Compare(prev_key_, target) >= !is_prev_inclusive_ || + (immutable_->Valid() && + comparator_->Compare(target, immutable_->key()) > 0) || + (options_.prefix_seek && !IsSamePrefix(target))) { + SeekImmutable(target); + } + + UpdateCurrent(); +} + +void TailingIterator::Next() { + assert(Valid()); + + if (!IsCurrentVersion()) { + // save the current key, create new iterators and then seek + std::string current_key = key().ToString(); + Slice key_slice(current_key.data(), current_key.size()); + + CreateIterators(); + Seek(key_slice); + + if (!Valid() || key().compare(key_slice) != 0) { + // record with current_key no longer exists + return; + } + + } else if (current_ == immutable_.get()) { + // immutable iterator is advanced -- update prev_key_ + prev_key_ = key().ToString(); + is_prev_inclusive_ = false; + is_prev_set_ = true; + } + + current_->Next(); + UpdateCurrent(); +} + +Slice TailingIterator::key() const { + assert(Valid()); + return current_->key(); +} + +Slice TailingIterator::value() const { + assert(Valid()); + return current_->value(); +} + +Status TailingIterator::status() const { + if (!status_.ok()) { + return status_; + } else if (!mutable_->status().ok()) { + return mutable_->status(); + } else { + return immutable_->status(); + } +} + +void TailingIterator::Prev() { + status_ = Status::NotSupported("This iterator doesn't support Prev()"); +} + +void TailingIterator::SeekToLast() { + status_ = Status::NotSupported("This iterator doesn't support SeekToLast()"); +} + +void TailingIterator::CreateIterators() { + std::pair iters = + db_->GetTailingIteratorPair(options_, &version_number_); + + assert(iters.first && iters.second); + + mutable_.reset(iters.first); + immutable_.reset(iters.second); + current_ = nullptr; + is_prev_set_ = false; +} + +void TailingIterator::UpdateCurrent() { + current_ = nullptr; + + if (mutable_->Valid()) { + current_ = mutable_.get(); + } + if (immutable_->Valid() && + (current_ == 
nullptr || + comparator_->Compare(immutable_->key(), current_->key()) < 0)) { + current_ = immutable_.get(); + } + + if (!status_.ok()) { + // reset status that was set by Prev() or SeekToLast() + status_ = Status::OK(); + } +} + +bool TailingIterator::IsCurrentVersion() const { + return mutable_ != nullptr && immutable_ != nullptr && + version_number_ == db_->CurrentVersionNumber(); +} + +bool TailingIterator::IsSamePrefix(const Slice& target) const { + const SliceTransform* extractor = db_->options_.prefix_extractor.get(); + + assert(extractor); + assert(is_prev_set_); + + return extractor->Transform(target) + .compare(extractor->Transform(prev_key_)) == 0; +} + +void TailingIterator::SeekImmutable(const Slice& target) { + prev_key_ = target.ToString(); + is_prev_inclusive_ = true; + is_prev_set_ = true; + + immutable_->Seek(target); +} + +} // namespace rocksdb diff --git a/db/tailing_iter.h b/db/tailing_iter.h new file mode 100644 index 00000000..3b8343a2 --- /dev/null +++ b/db/tailing_iter.h @@ -0,0 +1,88 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include + +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +class DBImpl; + +/** + * TailingIterator is a special type of iterator that doesn't use an (implicit) + * snapshot. In other words, it can be used to read data that was added to the + * db after the iterator had been created. + * + * TailingIterator is optimized for sequential reading. It doesn't support + * Prev() and SeekToLast() operations. 
+ */ +class TailingIterator : public Iterator { + public: + TailingIterator(DBImpl* db, const ReadOptions& options, + const Comparator* comparator); + virtual ~TailingIterator() {} + + virtual bool Valid() const override; + virtual void SeekToFirst() override; + virtual void SeekToLast() override; + virtual void Seek(const Slice& target) override; + virtual void Next() override; + virtual void Prev() override; + virtual Slice key() const override; + virtual Slice value() const override; + virtual Status status() const override; + + private: + DBImpl* const db_; + const ReadOptions options_; + const Comparator* const comparator_; + uint64_t version_number_; + + // TailingIterator merges the contents of the two iterators below (one using + // mutable memtable contents only, other over SSTs and immutable memtables). + // See DBIter::GetTailingIteratorPair(). + std::unique_ptr mutable_; + std::unique_ptr immutable_; + + // points to either mutable_ or immutable_ + Iterator* current_; + + // key that precedes immutable iterator's current key + std::string prev_key_; + + // unless prev_set is true, prev_key/prev_head is not valid and shouldn't be + // used; reset by createIterators() + bool is_prev_set_; + + // prev_key_ was set by SeekImmutable(), which means that the interval of + // keys covered by immutable_ is [prev_key_, current], i.e. 
it includes the + // left endpoint + bool is_prev_inclusive_; + + // internal iterator status + Status status_; + + // check if this iterator's version matches DB's version + bool IsCurrentVersion() const; + + // check if SeekImmutable() is needed due to target having a different prefix + // than prev_key_ (used when options.prefix_seek is set) + bool IsSamePrefix(const Slice& target) const; + + // creates mutable_ and immutable_ iterators and updates version_number_ + void CreateIterators(); + + // set current_ to be one of the iterators with the smallest key + void UpdateCurrent(); + + // seek on immutable_ and update prev_key + void SeekImmutable(const Slice& target); +}; + +} // namespace rocksdb diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc new file mode 100644 index 00000000..36b8932a --- /dev/null +++ b/db/transaction_log_impl.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "db/transaction_log_impl.h" +#include "db/write_batch_internal.h" + +namespace rocksdb { + +TransactionLogIteratorImpl::TransactionLogIteratorImpl( + const std::string& dir, const Options* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seq, + std::unique_ptr files, DBImpl const* const dbimpl) + : dir_(dir), + options_(options), + read_options_(read_options), + soptions_(soptions), + startingSequenceNumber_(seq), + files_(std::move(files)), + started_(false), + isValid_(false), + currentFileIndex_(0), + currentBatchSeq_(0), + currentLastSeq_(0), + dbimpl_(dbimpl) { + assert(files_ != nullptr); + assert(dbimpl_ != nullptr); + + reporter_.env = options_->env; + reporter_.info_log = options_->info_log.get(); + SeekToStartSequence(); // Seek till starting sequence +} + +Status TransactionLogIteratorImpl::OpenLogFile( + const LogFile* logFile, + unique_ptr* file) { + Env* env = options_->env; + if (logFile->Type() == kArchivedLogFile) { + std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + return env->NewSequentialFile(fname, file, soptions_); + } else { + std::string fname = LogFileName(dir_, logFile->LogNumber()); + Status status = env->NewSequentialFile(fname, file, soptions_); + if (!status.ok()) { + // If cannot open file in DB directory. + // Try the archive dir, as it could have moved in the meanwhile. + fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + status = env->NewSequentialFile(fname, file, soptions_); + } + return status; + } +} + +BatchResult TransactionLogIteratorImpl::GetBatch() { + assert(isValid_); // cannot call in a non valid state. 
+ BatchResult result; + result.sequence = currentBatchSeq_; + result.writeBatchPtr = std::move(currentBatch_); + return result; +} + +Status TransactionLogIteratorImpl::status() { + return currentStatus_; +} + +bool TransactionLogIteratorImpl::Valid() { + return started_ && isValid_; +} + +bool TransactionLogIteratorImpl::RestrictedRead( + Slice* record, + std::string* scratch) { + // Don't read if no more complete entries to read from logs + if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) { + return false; + } + return currentLogReader_->ReadRecord(record, scratch); +} + +void TransactionLogIteratorImpl::SeekToStartSequence( + uint64_t startFileIndex, + bool strict) { + std::string scratch; + Slice record; + started_ = false; + isValid_ = false; + if (files_->size() <= startFileIndex) { + return; + } + Status s = OpenLogReader(files_->at(startFileIndex).get()); + if (!s.ok()) { + currentStatus_ = s; + return; + } + while (RestrictedRead(&record, &scratch)) { + if (record.size() < 12) { + reporter_.Corruption( + record.size(), Status::Corruption("very small log record")); + continue; + } + UpdateCurrentWriteBatch(record); + if (currentLastSeq_ >= startingSequenceNumber_) { + if (strict && currentBatchSeq_ != startingSequenceNumber_) { + currentStatus_ = Status::Corruption("Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(currentStatus_.ToString().c_str()); + return; + } else if (strict) { + reporter_.Info("Could seek required sequence number. Iterator will " + "continue."); + } + isValid_ = true; + started_ = true; // set started_ as we could seek till starting sequence + return; + } else { + isValid_ = false; + } + } + + // Could not find start sequence in first file. Normally this must be the + // only file. 
Otherwise log the error and let the iterator return next entry + // If strict is set, we want to seek exactly till the start sequence and it + // should have been present in the file we scanned above + if (strict) { + currentStatus_ = Status::Corruption("Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(currentStatus_.ToString().c_str()); + } else if (files_->size() != 1) { + currentStatus_ = Status::Corruption("Start sequence was not found, " + "skipping to the next available"); + reporter_.Info(currentStatus_.ToString().c_str()); + // Let NextImpl find the next available entry. started_ remains false + // because we don't want to check for gaps while moving to start sequence + NextImpl(true); + } +} + +void TransactionLogIteratorImpl::Next() { + return NextImpl(false); +} + +void TransactionLogIteratorImpl::NextImpl(bool internal) { + std::string scratch; + Slice record; + isValid_ = false; + if (!internal && !started_) { + // Runs every time until we can seek to the start sequence + return SeekToStartSequence(); + } + while(true) { + assert(currentLogReader_); + if (currentLogReader_->IsEOF()) { + currentLogReader_->UnmarkEOF(); + } + while (RestrictedRead(&record, &scratch)) { + if (record.size() < 12) { + reporter_.Corruption( + record.size(), Status::Corruption("very small log record")); + continue; + } else { + // started_ should be true if called by application + assert(internal || started_); + // started_ should be false if called internally + assert(!internal || !started_); + UpdateCurrentWriteBatch(record); + if (internal && !started_) { + started_ = true; + } + return; + } + } + + // Open the next file + if (currentFileIndex_ < files_->size() - 1) { + ++currentFileIndex_; + Status status =OpenLogReader(files_->at(currentFileIndex_).get()); + if (!status.ok()) { + isValid_ = false; + currentStatus_ = status; + return; + } + } else { + isValid_ = false; + if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) 
{ + currentStatus_ = Status::OK(); + } else { + currentStatus_ = Status::Corruption("NO MORE DATA LEFT"); + } + return; + } + } +} + +bool TransactionLogIteratorImpl::IsBatchExpected( + const WriteBatch* batch, + const SequenceNumber expectedSeq) { + assert(batch); + SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch); + if (batchSeq != expectedSeq) { + char buf[200]; + snprintf(buf, sizeof(buf), + "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, " + "Last flushed seq=%lu.Log iterator will reseek the correct " + "batch.", + (unsigned long)batchSeq, + (unsigned long)expectedSeq, + (unsigned long)dbimpl_->GetLatestSequenceNumber()); + reporter_.Info(buf); + return false; + } + return true; +} + +void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { + std::unique_ptr batch(new WriteBatch()); + WriteBatchInternal::SetContents(batch.get(), record); + + SequenceNumber expectedSeq = currentLastSeq_ + 1; + // If the iterator has started, then confirm that we get continuous batches + if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) { + // Seek to the batch having expected sequence number + if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) { + // Expected batch must lie in the previous log file + // Avoid underflow. 
+ if (currentFileIndex_ != 0) { + currentFileIndex_--; + } + } + startingSequenceNumber_ = expectedSeq; + // currentStatus_ will be set to Ok if reseek succeeds + currentStatus_ = Status::NotFound("Gap in sequence numbers"); + return SeekToStartSequence(currentFileIndex_, true); + } + + currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get()); + currentLastSeq_ = currentBatchSeq_ + + WriteBatchInternal::Count(batch.get()) - 1; + // currentBatchSeq_ can only change here + assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber()); + + currentBatch_ = move(batch); + isValid_ = true; + currentStatus_ = Status::OK(); +} + +Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { + unique_ptr file; + Status status = OpenLogFile(logFile, &file); + if (!status.ok()) { + return status; + } + assert(file); + currentLogReader_.reset(new log::Reader(std::move(file), &reporter_, + read_options_.verify_checksums_, 0)); + return Status::OK(); +} +} // namespace rocksdb diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h new file mode 100644 index 00000000..6454d89e --- /dev/null +++ b/db/transaction_log_impl.h @@ -0,0 +1,118 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "db/db_impl.h" +#include "db/log_reader.h" +#include "db/filename.h" + +namespace rocksdb { + +struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str()); + } + virtual void Info(const char* s) { + Log(info_log, "%s", s); + } +}; + +class LogFileImpl : public LogFile { + public: + LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, + uint64_t sizeBytes) : + logNumber_(logNum), + type_(logType), + startSequence_(startSeq), + sizeFileBytes_(sizeBytes) { + } + + std::string PathName() const { + if (type_ == kArchivedLogFile) { + return ArchivedLogFileName("", logNumber_); + } + return LogFileName("", logNumber_); + } + + uint64_t LogNumber() const { return logNumber_; } + + WalFileType Type() const { return type_; } + + SequenceNumber StartSequence() const { return startSequence_; } + + uint64_t SizeFileBytes() const { return sizeFileBytes_; } + + bool operator < (const LogFile& that) const { + return LogNumber() < that.LogNumber(); + } + + private: + uint64_t logNumber_; + WalFileType type_; + SequenceNumber startSequence_; + uint64_t sizeFileBytes_; + +}; + +class TransactionLogIteratorImpl : public TransactionLogIterator { + public: + TransactionLogIteratorImpl( + const std::string& dir, const Options* options, + const TransactionLogIterator::ReadOptions& read_options, + const EnvOptions& soptions, const SequenceNumber seqNum, + std::unique_ptr files, DBImpl const* const dbimpl); + + virtual bool Valid(); + + virtual void Next(); + + virtual Status status(); + + virtual BatchResult GetBatch(); + + private: + const std::string& dir_; + const Options* options_; + const TransactionLogIterator::ReadOptions read_options_; + const 
EnvOptions& soptions_; + SequenceNumber startingSequenceNumber_; + std::unique_ptr files_; + bool started_; + bool isValid_; // not valid when it starts of. + Status currentStatus_; + size_t currentFileIndex_; + std::unique_ptr currentBatch_; + unique_ptr currentLogReader_; + Status OpenLogFile(const LogFile* logFile, unique_ptr* file); + LogReporter reporter_; + SequenceNumber currentBatchSeq_; // sequence number at start of current batch + SequenceNumber currentLastSeq_; // last sequence in the current batch + DBImpl const * const dbimpl_; // The db on whose log files this iterates + + // Reads from transaction log only if the writebatch record has been written + bool RestrictedRead(Slice* record, std::string* scratch); + // Seeks to startingSequenceNumber reading from startFileIndex in files_. + // If strict is set,then must get a batch starting with startingSequenceNumber + void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false); + // Implementation of Next. SeekToStartSequence calls it internally with + // internal=true to let it find next entry even if it has to jump gaps because + // the iterator may start off from the first available entry but promises to + // be continuous after that + void NextImpl(bool internal = false); + // Check if batch is expected, else return false + bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq); + // Update current batch if a continuous batch is found, else return false + void UpdateCurrentWriteBatch(const Slice& record); + Status OpenLogReader(const LogFile* file); +}; +} // namespace rocksdb diff --git a/db/version_edit.cc b/db/version_edit.cc new file mode 100644 index 00000000..5c532b13 --- /dev/null +++ b/db/version_edit.cc @@ -0,0 +1,289 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" + +#include "db/version_set.h" +#include "util/coding.h" + +namespace rocksdb { + +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. +enum Tag { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + // 8 was used for large value refs + kPrevLogNumber = 9, + + // these are new formats divergent from open source leveldb + kNewFile2 = 100, // store smallest & largest seqno +}; + +void VersionEdit::Clear() { + comparator_.clear(); + max_level_ = 0; + log_number_ = 0; + prev_log_number_ = 0; + last_sequence_ = 0; + next_file_number_ = 0; + has_comparator_ = false; + has_log_number_ = false; + has_prev_log_number_ = false; + has_next_file_number_ = false; + has_last_sequence_ = false; + deleted_files_.clear(); + new_files_.clear(); +} + +void VersionEdit::EncodeTo(std::string* dst) const { + if (has_comparator_) { + PutVarint32(dst, kComparator); + PutLengthPrefixedSlice(dst, comparator_); + } + if (has_log_number_) { + PutVarint32(dst, kLogNumber); + PutVarint64(dst, log_number_); + } + if (has_prev_log_number_) { + PutVarint32(dst, kPrevLogNumber); + PutVarint64(dst, prev_log_number_); + } + if (has_next_file_number_) { + PutVarint32(dst, kNextFileNumber); + PutVarint64(dst, next_file_number_); + } + if (has_last_sequence_) { + PutVarint32(dst, kLastSequence); + PutVarint64(dst, last_sequence_); + } + + for (const auto& deleted : deleted_files_) { + PutVarint32(dst, kDeletedFile); + PutVarint32(dst, deleted.first /* level */); + PutVarint64(dst, deleted.second /* file number */); 
+ } + + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + PutVarint32(dst, kNewFile2); + PutVarint32(dst, new_files_[i].first); // level + PutVarint64(dst, f.number); + PutVarint64(dst, f.file_size); + PutLengthPrefixedSlice(dst, f.smallest.Encode()); + PutLengthPrefixedSlice(dst, f.largest.Encode()); + PutVarint64(dst, f.smallest_seqno); + PutVarint64(dst, f.largest_seqno); + } +} + +static bool GetInternalKey(Slice* input, InternalKey* dst) { + Slice str; + if (GetLengthPrefixedSlice(input, &str)) { + dst->DecodeFrom(str); + return true; + } else { + return false; + } +} + +bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { + uint32_t v; + if (GetVarint32(input, &v)) { + *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } + return true; + } else { + return false; + } +} + +Status VersionEdit::DecodeFrom(const Slice& src) { + Clear(); + Slice input = src; + const char* msg = nullptr; + uint32_t tag; + + // Temporary storage for parsing + int level; + uint64_t number; + FileMetaData f; + Slice str; + InternalKey key; + + while (msg == nullptr && GetVarint32(&input, &tag)) { + switch (tag) { + case kComparator: + if (GetLengthPrefixedSlice(&input, &str)) { + comparator_ = str.ToString(); + has_comparator_ = true; + } else { + msg = "comparator name"; + } + break; + + case kLogNumber: + if (GetVarint64(&input, &log_number_)) { + has_log_number_ = true; + } else { + msg = "log number"; + } + break; + + case kPrevLogNumber: + if (GetVarint64(&input, &prev_log_number_)) { + has_prev_log_number_ = true; + } else { + msg = "previous log number"; + } + break; + + case kNextFileNumber: + if (GetVarint64(&input, &next_file_number_)) { + has_next_file_number_ = true; + } else { + msg = "next file number"; + } + break; + + case kLastSequence: + if (GetVarint64(&input, &last_sequence_)) { + has_last_sequence_ = true; + } else { + msg = "last sequence number"; + } + break; + + case 
kCompactPointer: + if (GetLevel(&input, &level, &msg) && + GetInternalKey(&input, &key)) { + // we don't use compact pointers anymore, + // but we should not fail if they are still + // in manifest + } else { + if (!msg) { + msg = "compaction pointer"; + } + } + break; + + case kDeletedFile: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &number)) { + deleted_files_.insert(std::make_pair(level, number)); + } else { + if (!msg) { + msg = "deleted file"; + } + } + break; + + case kNewFile: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest)) { + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file entry"; + } + } + break; + + case kNewFile2: + if (GetLevel(&input, &level, &msg) && + GetVarint64(&input, &f.number) && + GetVarint64(&input, &f.file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest) && + GetVarint64(&input, &f.smallest_seqno) && + GetVarint64(&input, &f.largest_seqno) ) { + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file2 entry"; + } + } + break; + + default: + msg = "unknown tag"; + break; + } + } + + if (msg == nullptr && !input.empty()) { + msg = "invalid tag"; + } + + Status result; + if (msg != nullptr) { + result = Status::Corruption("VersionEdit", msg); + } + return result; +} + +std::string VersionEdit::DebugString(bool hex_key) const { + std::string r; + r.append("VersionEdit {"); + if (has_comparator_) { + r.append("\n Comparator: "); + r.append(comparator_); + } + if (has_log_number_) { + r.append("\n LogNumber: "); + AppendNumberTo(&r, log_number_); + } + if (has_prev_log_number_) { + r.append("\n PrevLogNumber: "); + AppendNumberTo(&r, prev_log_number_); + } + if (has_next_file_number_) { + r.append("\n NextFile: "); + AppendNumberTo(&r, next_file_number_); + } + if 
(has_last_sequence_) { + r.append("\n LastSeq: "); + AppendNumberTo(&r, last_sequence_); + } + for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); + iter != deleted_files_.end(); + ++iter) { + r.append("\n DeleteFile: "); + AppendNumberTo(&r, iter->first); + r.append(" "); + AppendNumberTo(&r, iter->second); + } + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + r.append("\n AddFile: "); + AppendNumberTo(&r, new_files_[i].first); + r.append(" "); + AppendNumberTo(&r, f.number); + r.append(" "); + AppendNumberTo(&r, f.file_size); + r.append(" "); + r.append(f.smallest.DebugString(hex_key)); + r.append(" .. "); + r.append(f.largest.DebugString(hex_key)); + } + r.append("\n}\n"); + return r; +} + +} // namespace rocksdb diff --git a/db/version_edit.h b/db/version_edit.h new file mode 100644 index 00000000..2bbd0754 --- /dev/null +++ b/db/version_edit.h @@ -0,0 +1,131 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include "rocksdb/cache.h" +#include "db/dbformat.h" + +namespace rocksdb { + +class VersionSet; + +struct FileMetaData { + int refs; + int allowed_seeks; // Seeks allowed until compaction + uint64_t number; + uint64_t file_size; // File size in bytes + InternalKey smallest; // Smallest internal key served by table + InternalKey largest; // Largest internal key served by table + bool being_compacted; // Is this file undergoing compaction? 
+ SequenceNumber smallest_seqno;// The smallest seqno in this file + SequenceNumber largest_seqno; // The largest seqno in this file + + // Needs to be disposed when refs becomes 0. + Cache::Handle* table_reader_handle; + // Table reader in table_reader_handle + TableReader* table_reader; + + FileMetaData(uint64_t number, uint64_t file_size) : + refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size), + being_compacted(false), table_reader_handle(nullptr), + table_reader(nullptr) { + } + FileMetaData() : FileMetaData(0, 0) { } +}; + +class VersionEdit { + public: + VersionEdit() { Clear(); } + ~VersionEdit() { } + + void Clear(); + + void SetComparatorName(const Slice& name) { + has_comparator_ = true; + comparator_ = name.ToString(); + } + void SetLogNumber(uint64_t num) { + has_log_number_ = true; + log_number_ = num; + } + void SetPrevLogNumber(uint64_t num) { + has_prev_log_number_ = true; + prev_log_number_ = num; + } + void SetNextFile(uint64_t num) { + has_next_file_number_ = true; + next_file_number_ = num; + } + void SetLastSequence(SequenceNumber seq) { + has_last_sequence_ = true; + last_sequence_ = seq; + } + + // Add the specified file at the specified number. + // REQUIRES: This version has not been saved (see VersionSet::SaveTo) + // REQUIRES: "smallest" and "largest" are smallest and largest keys in file + void AddFile(int level, uint64_t file, + uint64_t file_size, + const InternalKey& smallest, + const InternalKey& largest, + const SequenceNumber& smallest_seqno, + const SequenceNumber& largest_seqno) { + assert(smallest_seqno <= largest_seqno); + FileMetaData f; + f.number = file; + f.file_size = file_size; + f.smallest = smallest; + f.largest = largest; + f.smallest_seqno = smallest_seqno; + f.largest_seqno = largest_seqno; + new_files_.push_back(std::make_pair(level, f)); + } + + // Delete the specified "file" from the specified "level". 
+ void DeleteFile(int level, uint64_t file) { + deleted_files_.insert({level, file}); + } + + // Number of edits + int NumEntries() { + return new_files_.size() + deleted_files_.size(); + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(const Slice& src); + + std::string DebugString(bool hex_key = false) const; + + private: + friend class VersionSet; + + typedef std::set< std::pair> DeletedFileSet; + + bool GetLevel(Slice* input, int* level, const char** msg); + + int max_level_; + std::string comparator_; + uint64_t log_number_; + uint64_t prev_log_number_; + uint64_t next_file_number_; + SequenceNumber last_sequence_; + bool has_comparator_; + bool has_log_number_; + bool has_prev_log_number_; + bool has_next_file_number_; + bool has_last_sequence_; + + DeletedFileSet deleted_files_; + std::vector > new_files_; +}; + +} // namespace rocksdb diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc new file mode 100644 index 00000000..110b422f --- /dev/null +++ b/db/version_edit_test.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_edit.h" +#include "util/testharness.h" + +namespace rocksdb { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest { }; + +TEST(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, + InternalKey("foo", kBig + 500 + i, kTypeValue), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion), + kBig + 500 + i, + kBig + 600 + i); + edit.DeleteFile(4, kBig + 700 + i); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/version_set.cc b/db/version_set.cc new file mode 100644 index 00000000..61cdfab2 --- /dev/null +++ b/db/version_set.cc @@ -0,0 +1,2392 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#define __STDC_FORMAT_MACROS +#include "db/version_set.h" + +#include +#include +#include +#include + +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/table_cache.h" +#include "db/compaction.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "table/table_reader.h" +#include "table/merger.h" +#include "table/two_level_iterator.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Version::~Version() { + assert(refs_ == 0); + + // Remove from linked list + prev_->next_ = next_; + next_->prev_ = prev_; + + // Drop references to files + for (int level = 0; level < num_levels_; level++) { + for (size_t i = 0; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + assert(f->refs > 0); + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + vset_->table_cache_->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } + vset_->obsolete_files_.push_back(f); + } + } + } + delete[] files_; +} + +int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key) { + uint32_t left = 0; + uint32_t right = files.size(); + while (left < right) { + uint32_t mid = (left + right) / 2; + const FileMetaData* f = files[mid]; + if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) { + // Key at "mid.largest" is < "target". Therefore all + // files at or before "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "mid.largest" is >= "target". Therefore all files + // after "mid" are uninteresting. 
+ right = mid; + } + } + return right; +} + +static bool AfterFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // nullptr user_key occurs before all keys and is therefore never after *f + return (user_key != nullptr && + ucmp->Compare(*user_key, f->largest.user_key()) > 0); +} + +static bool BeforeFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // nullptr user_key occurs after all keys and is therefore never before *f + return (user_key != nullptr && + ucmp->Compare(*user_key, f->smallest.user_key()) < 0); +} + +bool SomeFileOverlapsRange( + const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const std::vector& files, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + const Comparator* ucmp = icmp.user_comparator(); + if (!disjoint_sorted_files) { + // Need to check against all files + for (size_t i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + if (AfterFile(ucmp, smallest_user_key, f) || + BeforeFile(ucmp, largest_user_key, f)) { + // No overlap + } else { + return true; // Overlap + } + } + return false; + } + + // Binary search over file list + uint32_t index = 0; + if (smallest_user_key != nullptr) { + // Find the earliest possible internal key for smallest_user_key + InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek); + index = FindFile(icmp, files, small.Encode()); + } + + if (index >= files.size()) { + // beginning of range is after all files, so no overlap. + return false; + } + + return !BeforeFile(ucmp, largest_user_key, files[index]); +} + +namespace { +// Used for LevelFileNumIterator to pass "block handle" value, +// which actually means file information in this iterator. +// It contains subset of fields of FileMetaData, that is sufficient +// for table cache to use. 
+struct EncodedFileMetaData { + uint64_t number; // file number + uint64_t file_size; // file size + TableReader* table_reader; // cached table reader +}; +} // namespace + +// An internal iterator. For a given version/level pair, yields +// information about the files in the level. For a given entry, key() +// is the largest key that occurs in the file, and value() is an +// 16-byte value containing the file number and file size, both +// encoded using EncodeFixed64. +class Version::LevelFileNumIterator : public Iterator { + public: + LevelFileNumIterator(const InternalKeyComparator& icmp, + const std::vector* flist) + : icmp_(icmp), + flist_(flist), + index_(flist->size()) { // Marks as invalid + } + virtual bool Valid() const { + return index_ < flist_->size(); + } + virtual void Seek(const Slice& target) { + index_ = FindFile(icmp_, *flist_, target); + } + virtual void SeekToFirst() { index_ = 0; } + virtual void SeekToLast() { + index_ = flist_->empty() ? 0 : flist_->size() - 1; + } + virtual void Next() { + assert(Valid()); + index_++; + } + virtual void Prev() { + assert(Valid()); + if (index_ == 0) { + index_ = flist_->size(); // Marks as invalid + } else { + index_--; + } + } + Slice key() const { + assert(Valid()); + return (*flist_)[index_]->largest.Encode(); + } + Slice value() const { + assert(Valid()); + auto* file_meta = (*flist_)[index_]; + current_value_.number = file_meta->number; + current_value_.file_size = file_meta->file_size; + current_value_.table_reader = file_meta->table_reader; + return Slice(reinterpret_cast(¤t_value_), + sizeof(EncodedFileMetaData)); + } + virtual Status status() const { return Status::OK(); } + private: + const InternalKeyComparator icmp_; + const std::vector* const flist_; + uint32_t index_; + mutable EncodedFileMetaData current_value_; +}; + +static Iterator* GetFileIterator(void* arg, const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice& file_value, bool 
for_compaction) { + TableCache* cache = reinterpret_cast(arg); + if (file_value.size() != sizeof(EncodedFileMetaData)) { + return NewErrorIterator( + Status::Corruption("FileReader invoked with unexpected value")); + } else { + ReadOptions options_copy; + if (options.prefix) { + // suppress prefix filtering since we have already checked the + // filters once at this point + options_copy = options; + options_copy.prefix = nullptr; + } + + const EncodedFileMetaData* encoded_meta = + reinterpret_cast(file_value.data()); + FileMetaData meta(encoded_meta->number, encoded_meta->file_size); + meta.table_reader = encoded_meta->table_reader; + return cache->NewIterator( + options.prefix ? options_copy : options, soptions, icomparator, meta, + nullptr /* don't need reference to table*/, for_compaction); + } +} + +bool Version::PrefixMayMatch(const ReadOptions& options, + const EnvOptions& soptions, + const Slice& internal_prefix, + Iterator* level_iter) const { + bool may_match = true; + level_iter->Seek(internal_prefix); + if (!level_iter->Valid()) { + // we're past end of level + may_match = false; + } else if (ExtractUserKey(level_iter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // TODO(tylerharter): do we need this case? Or are we guaranteed + // key() will always be the biggest value for this SST? 
+ may_match = true; + } else { + const EncodedFileMetaData* encoded_meta = + reinterpret_cast( + level_iter->value().data()); + FileMetaData meta(encoded_meta->number, encoded_meta->file_size); + meta.table_reader = encoded_meta->table_reader; + may_match = vset_->table_cache_->PrefixMayMatch(options, vset_->icmp_, meta, + internal_prefix, nullptr); + } + return may_match; +} + +Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { + auto table_cache = vset_->table_cache_; + auto options = vset_->options_; + for (int level = 0; level < num_levels_; level++) { + for (const auto& file_meta : files_[level]) { + auto fname = TableFileName(vset_->dbname_, file_meta->number); + // 1. If the table is already present in table cache, load table + // properties from there. + std::shared_ptr table_properties; + Status s = table_cache->GetTableProperties( + vset_->storage_options_, vset_->icmp_, *file_meta, &table_properties, + true /* no io */); + if (s.ok()) { + props->insert({fname, table_properties}); + continue; + } + + // We only ignore error type `Incomplete` since it's by design that we + // disallow table when it's not in table cache. + if (!s.IsIncomplete()) { + return s; + } + + // 2. Table is not present in table cache, we'll read the table properties + // directly from the properties block in the file. + std::unique_ptr file; + s = vset_->env_->NewRandomAccessFile(fname, &file, + vset_->storage_options_); + if (!s.ok()) { + return s; + } + + TableProperties* raw_table_properties; + // By setting the magic number to kInvalidTableMagicNumber, we can by + // pass the magic number check in the footer. 
+ s = ReadTableProperties( + file.get(), file_meta->file_size, + Footer::kInvalidTableMagicNumber /* table's magic number */, + vset_->env_, options->info_log.get(), &raw_table_properties); + if (!s.ok()) { + return s; + } + RecordTick(options->statistics.get(), + NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + + props->insert({fname, std::shared_ptr( + raw_table_properties)}); + } + } + + return Status::OK(); +} + +Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, + const EnvOptions& soptions, + int level) const { + Iterator* level_iter = new LevelFileNumIterator(vset_->icmp_, &files_[level]); + if (options.prefix) { + InternalKey internal_prefix(*options.prefix, 0, kTypeValue); + if (!PrefixMayMatch(options, soptions, + internal_prefix.Encode(), level_iter)) { + delete level_iter; + // nothing in this level can match the prefix + return NewEmptyIterator(); + } + } + return NewTwoLevelIterator(level_iter, &GetFileIterator, vset_->table_cache_, + options, soptions, vset_->icmp_); +} + +void Version::AddIterators(const ReadOptions& options, + const EnvOptions& soptions, + std::vector* iters) { + // Merge all level zero files together since they may overlap + for (const FileMetaData* file : files_[0]) { + iters->push_back(vset_->table_cache_->NewIterator(options, soptions, + vset_->icmp_, *file)); + } + + // For levels > 0, we can use a concatenating iterator that sequentially + // walks through the non-overlapping files in the level, opening them + // lazily. + for (int level = 1; level < num_levels_; level++) { + if (!files_[level].empty()) { + iters->push_back(NewConcatenatingIterator(options, soptions, level)); + } + } +} + +// Callback from TableCache::Get() +namespace { +enum SaverState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) +}; +struct Saver { + SaverState state; + const Comparator* ucmp; + Slice user_key; + bool* value_found; // Is value set correctly? 
Used by KeyMayExist + std::string* value; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + Logger* logger; + bool didIO; // did we do any disk io? + Statistics* statistics; +}; +} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +static void MarkKeyMayExist(void* arg) { + Saver* s = reinterpret_cast(arg); + s->state = kFound; + if (s->value_found != nullptr) { + *(s->value_found) = false; + } +} + +static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v, bool didIO) { + Saver* s = reinterpret_cast(arg); + MergeContext* merge_contex = s->merge_context; + std::string merge_result; // temporary area for merge results later + + assert(s != nullptr && merge_contex != nullptr); + + // TODO: didIO and Merge? + s->didIO = didIO; + if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { + // Key matches. 
Process it + switch (parsed_key.type) { + case kTypeValue: + if (kNotFound == s->state) { + s->state = kFound; + s->value->assign(v.data(), v.size()); + } else if (kMerge == s->state) { + assert(s->merge_operator != nullptr); + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, &v, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; + } + } else { + assert(false); + } + return false; + + case kTypeDeletion: + if (kNotFound == s->state) { + s->state = kDeleted; + } else if (kMerge == s->state) { + s->state = kFound; + if (!s->merge_operator->FullMerge(s->user_key, nullptr, + merge_contex->GetOperands(), + s->value, s->logger)) { + RecordTick(s->statistics, NUMBER_MERGE_FAILURES); + s->state = kCorrupt; + } + } else { + assert(false); + } + return false; + + case kTypeMerge: + assert(s->state == kNotFound || s->state == kMerge); + s->state = kMerge; + merge_contex->PushOperand(v); + return true; + + default: + assert(false); + break; + } + } + + // s->state could be Corrupt, merge or notfound + + return false; +} + +namespace { +bool NewestFirst(FileMetaData* a, FileMetaData* b) { + return a->number > b->number; +} +bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { + if (a->smallest_seqno != b->smallest_seqno) { + return a->smallest_seqno > b->smallest_seqno; + } + if (a->largest_seqno != b->largest_seqno) { + return a->largest_seqno > b->largest_seqno; + } + // Break ties by file number + return NewestFirst(a, b); +} +bool BySmallestKey(FileMetaData* a, FileMetaData* b, + const InternalKeyComparator* cmp) { + int r = cmp->Compare(a->smallest, b->smallest); + if (r != 0) { + return (r < 0); + } + // Break ties by file number + return (a->number < b->number); +} +} // anonymous namespace + +Version::Version(VersionSet* vset, uint64_t version_number) + : vset_(vset), + internal_comparator_(&(vset->icmp_)), + 
user_comparator_(internal_comparator_->user_comparator()), + table_cache_(vset->table_cache_), + merge_operator_(vset->options_->merge_operator.get()), + info_log_(vset->options_->info_log.get()), + db_statistics_(vset->options_->statistics.get()), + next_(this), + prev_(this), + refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), + file_to_compact_(nullptr), + file_to_compact_level_(-1), + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} + +void Version::Get(const ReadOptions& options, + const LookupKey& k, + std::string* value, + Status* status, + MergeContext* merge_context, + GetStats* stats, + bool* value_found) { + Slice ikey = k.internal_key(); + Slice user_key = k.user_key(); + + assert(status->ok() || status->IsMergeInProgress()); + Saver saver; + saver.state = status->ok()? kNotFound : kMerge; + saver.ucmp = user_comparator_; + saver.user_key = user_key; + saver.value_found = value_found; + saver.value = value; + saver.merge_operator = merge_operator_; + saver.merge_context = merge_context; + saver.logger = info_log_; + saver.didIO = false; + saver.statistics = db_statistics_; + + stats->seek_file = nullptr; + stats->seek_file_level = -1; + FileMetaData* last_file_read = nullptr; + int last_file_read_level = -1; + + // We can search level-by-level since entries never hop across + // levels. Therefore we are guaranteed that if we find data + // in an smaller level, later levels are irrelevant (unless we + // are MergeInProgress). + for (int level = 0; level < num_levels_; level++) { + size_t num_files = files_[level].size(); + if (num_files == 0) continue; + + // Get the list of files to search in this level + FileMetaData* const* files = &files_[level][0]; + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. 
In the context of merge-operator, + // this can occur at any level. Otherwise, it only occurs + // at Level-0 (since Put/Deletes are always compacted into a single entry). + uint32_t start_index; + if (level == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + } else { + // On Level-n (n>=1), files are sorted. + // Binary search to find earliest index whose largest key >= ikey. + // We will also stop when the file no longer overlaps ikey + start_index = FindFile(*internal_comparator_, files_[level], ikey); + } + + // Traverse each relevant file to find the desired key +#ifndef NDEBUG + FileMetaData* prev_file = nullptr; +#endif + for (uint32_t i = start_index; i < num_files; ++i) { + FileMetaData* f = files[i]; + // Skip key range filtering for levle 0 if there are few level 0 files. + if ((level > 0 || num_files > 2) && + (user_comparator_->Compare(user_key, f->smallest.user_key()) < 0 || + user_comparator_->Compare(user_key, f->largest.user_key()) > 0)) { + // Only process overlapping files. + if (level > 0) { + // If on Level-n (n>=1) then the files are sorted. + // So we can stop looking when we are past the ikey. + break; + } + // TODO: do we want to check file ranges for level0 files at all? + // For new SST format where Get() is fast, we might want to consider + // to avoid those two comparisons, if it can filter out too few files. + continue; + } +#ifndef NDEBUG + // Sanity check to make sure that the files are correctly sorted + if (prev_file) { + if (level != 0) { + int comp_sign = + internal_comparator_->Compare(prev_file->largest, f->smallest); + assert(comp_sign < 0); + } else { + // level == 0, the current file cannot be newer than the previous one. 
+ if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + assert(!NewestFirstBySeqNo(f, prev_file)); + } else { + assert(!NewestFirst(f, prev_file)); + } + } + } + prev_file = f; +#endif + bool tableIO = false; + *status = table_cache_->Get(options, *internal_comparator_, *f, ikey, + &saver, SaveValue, &tableIO, MarkKeyMayExist); + // TODO: examine the behavior for corrupted key + if (!status->ok()) { + return; + } + + if (last_file_read != nullptr && stats->seek_file == nullptr) { + // We have had more than one seek for this read. Charge the 1st file. + stats->seek_file = last_file_read; + stats->seek_file_level = last_file_read_level; + } + + // If we did any IO as part of the read, then we remember it because + // it is a possible candidate for seek-based compaction. saver.didIO + // is true if the block had to be read in from storage and was not + // pre-exisiting in the block cache. Also, if this file was not pre- + // existing in the table cache and had to be freshly opened that needed + // the index blocks to be read-in, then tableIO is true. One thing + // to note is that the index blocks are not part of the block cache. 
+ if (saver.didIO || tableIO) { + last_file_read = f; + last_file_read_level = level; + } + + switch (saver.state) { + case kNotFound: + break; // Keep searching in other files + case kFound: + return; + case kDeleted: + *status = Status::NotFound(); // Use empty error message for speed + return; + case kCorrupt: + *status = Status::Corruption("corrupted key for ", user_key); + return; + case kMerge: + break; + } + } + } + + + if (kMerge == saver.state) { + // merge_operands are in saver and we hit the beginning of the key history + // do a final merge of nullptr and operands; + if (merge_operator_->FullMerge(user_key, nullptr, + saver.merge_context->GetOperands(), value, + info_log_)) { + *status = Status::OK(); + } else { + RecordTick(db_statistics_, NUMBER_MERGE_FAILURES); + *status = Status::Corruption("could not perform end-of-key merge for ", + user_key); + } + } else { + *status = Status::NotFound(); // Use an empty error message for speed + } +} + +bool Version::UpdateStats(const GetStats& stats) { + FileMetaData* f = stats.seek_file; + if (f != nullptr) { + f->allowed_seeks--; + if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) { + file_to_compact_ = f; + file_to_compact_level_ = stats.seek_file_level; + return true; + } + } + return false; +} + +void Version::ComputeCompactionScore( + std::vector& size_being_compacted) { + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. 
+ // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int numfiles = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = static_cast(numfiles) / + vset_->options_->level0_file_num_compaction_trigger; + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. + const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. 
+ for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + // No need to sort the highest level because it is never compacted. + int max_level = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? 
NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + +void Version::Ref() { + ++refs_; +} + +bool Version::Unref() { + assert(this != &vset_->dummy_versions_); + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + delete this; + return true; + } + return false; +} + +bool Version::NeedsCompaction() const { + if (file_to_compact_ != nullptr) { + return true; + } + // In universal compaction case, this check doesn't really + // check the compaction condition, but checks num of files threshold + // only. We are not going to miss any compaction opportunity + // but it's likely that more compactions are scheduled but + // ending up with nothing to do. We can improve it later. + // TODO(sdong): improve this function to be accurate for universal + // compactions. + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) ? 
+ NumberLevels() - 1 : 1; + for (int i = 0; i < num_levels_to_check; i++) { + if (compaction_score_[i] >= 1) { + return true; + } + } + return false; +} + +bool Version::OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], + smallest_user_key, largest_user_key); +} + +int Version::PickLevelForMemTableOutput( + const Slice& smallest_user_key, + const Slice& largest_user_key) { + int level = 0; + if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { + // Push to next level if there is no overlap in next level, + // and the #bytes overlapping in the level after that are limited. + InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey limit(largest_user_key, 0, static_cast(0)); + std::vector overlaps; + int max_mem_compact_level = vset_->options_->max_mem_compaction_level; + while (max_mem_compact_level > 0 && level < max_mem_compact_level) { + if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { + break; + } + if (level + 2 >= num_levels_) { + level++; + break; + } + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > vset_->compaction_picker_->MaxGrandParentOverlapBytes(level)) { + break; + } + level++; + } + } + + return level; +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// If hint_index is specified, then it points to a file in the +// overlapping range. +// The file_index returns a pointer to any file in an overlapping range. 
+void Version::GetOverlappingInputs( + int level, + const InternalKey* begin, + const InternalKey* end, + std::vector* inputs, + int hint_index, + int* file_index) { + inputs->clear(); + Slice user_begin, user_end; + if (begin != nullptr) { + user_begin = begin->user_key(); + } + if (end != nullptr) { + user_end = end->user_key(); + } + if (file_index) { + *file_index = -1; + } + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + if (begin != nullptr && end != nullptr && level > 0) { + GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs, + hint_index, file_index); + return; + } + for (size_t i = 0; i < files_[level].size(); ) { + FileMetaData* f = files_[level][i++]; + const Slice file_start = f->smallest.user_key(); + const Slice file_limit = f->largest.user_key(); + if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + // "f" is completely before specified range; skip it + } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { + // "f" is completely after specified range; skip it + } else { + inputs->push_back(f); + if (level == 0) { + // Level-0 files may overlap each other. So check if the newly + // added file has expanded the range. If so, restart search. + if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { + user_begin = file_start; + inputs->clear(); + i = 0; + } else if (end != nullptr + && user_cmp->Compare(file_limit, user_end) > 0) { + user_end = file_limit; + inputs->clear(); + i = 0; + } + } else if (file_index) { + *file_index = i-1; + } + } + } +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// Employ binary search to find at least one file that overlaps the +// specified range. From that file, iterate backwards and +// forwards to find all overlapping files. 
+void Version::GetOverlappingInputsBinarySearch( + int level, + const Slice& user_begin, + const Slice& user_end, + std::vector* inputs, + int hint_index, + int* file_index) { + assert(level > 0); + int min = 0; + int mid = 0; + int max = files_[level].size() -1; + bool foundOverlap = false; + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + + // if the caller already knows the index of a file that has overlap, + // then we can skip the binary search. + if (hint_index != -1) { + mid = hint_index; + foundOverlap = true; + } + + while (!foundOverlap && min <= max) { + mid = (min + max)/2; + FileMetaData* f = files_[level][mid]; + const Slice file_start = f->smallest.user_key(); + const Slice file_limit = f->largest.user_key(); + if (user_cmp->Compare(file_limit, user_begin) < 0) { + min = mid + 1; + } else if (user_cmp->Compare(user_end, file_start) < 0) { + max = mid - 1; + } else { + foundOverlap = true; + break; + } + } + + // If there were no overlapping files, return immediately. + if (!foundOverlap) { + return; + } + // returns the index where an overlap is found + if (file_index) { + *file_index = mid; + } + ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid); +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// The midIndex specifies the index of at least one file that +// overlaps the specified range. From that file, iterate backward +// and forward to find all overlapping files. 
+void Version::ExtendOverlappingInputs( + int level, + const Slice& user_begin, + const Slice& user_end, + std::vector* inputs, + unsigned int midIndex) { + + const Comparator* user_cmp = vset_->icmp_.user_comparator(); +#ifndef NDEBUG + { + // assert that the file at midIndex overlaps with the range + assert(midIndex < files_[level].size()); + FileMetaData* f = files_[level][midIndex]; + const Slice fstart = f->smallest.user_key(); + const Slice flimit = f->largest.user_key(); + if (user_cmp->Compare(fstart, user_begin) >= 0) { + assert(user_cmp->Compare(fstart, user_end) <= 0); + } else { + assert(user_cmp->Compare(flimit, user_begin) >= 0); + } + } +#endif + int startIndex = midIndex + 1; + int endIndex = midIndex; + int count __attribute__((unused)) = 0; + + // check backwards from 'mid' to lower indices + for (int i = midIndex; i >= 0 ; i--) { + FileMetaData* f = files_[level][i]; + const Slice file_limit = f->largest.user_key(); + if (user_cmp->Compare(file_limit, user_begin) >= 0) { + startIndex = i; + assert((count++, true)); + } else { + break; + } + } + // check forward from 'mid+1' to higher indices + for (unsigned int i = midIndex+1; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + const Slice file_start = f->smallest.user_key(); + if (user_cmp->Compare(file_start, user_end) <= 0) { + assert((count++, true)); + endIndex = i; + } else { + break; + } + } + assert(count == endIndex - startIndex + 1); + + // insert overlapping files into vector + for (int i = startIndex; i <= endIndex; i++) { + FileMetaData* f = files_[level][i]; + inputs->push_back(f); + } +} + +// Returns true iff the first or last file in inputs contains +// an overlapping user key to the file "just outside" of it (i.e. 
+// just after the last file, or just before the first file) +// REQUIRES: "*inputs" is a sorted list of non-overlapping files +bool Version::HasOverlappingUserKey( + const std::vector* inputs, + int level) { + + // If inputs empty, there is no overlap. + // If level == 0, it is assumed that all needed files were already included. + if (inputs->empty() || level == 0){ + return false; + } + + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + const std::vector& files = files_[level]; + const size_t kNumFiles = files.size(); + + // Check the last file in inputs against the file after it + size_t last_file = FindFile(vset_->icmp_, files, + inputs->back()->largest.Encode()); + assert(0 <= last_file && last_file < kNumFiles); // File should exist! + if (last_file < kNumFiles-1) { // If not the last file + const Slice last_key_in_input = files[last_file]->largest.user_key(); + const Slice first_key_after = files[last_file+1]->smallest.user_key(); + if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) { + // The last user key in input overlaps with the next file's first key + return true; + } + } + + // Check the first file in inputs against the file just before it + size_t first_file = FindFile(vset_->icmp_, files, + inputs->front()->smallest.Encode()); + assert(0 <= first_file && first_file <= last_file); // File should exist! 
+ if (first_file > 0) { // If not first file + const Slice& first_key_in_input = files[first_file]->smallest.user_key(); + const Slice& last_key_before = files[first_file-1]->largest.user_key(); + if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) { + // The first user key in input overlaps with the previous file's last key + return true; + } + } + + return false; +} + +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + 
return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + +std::string Version::DebugString(bool hex) const { + std::string r; + for (int level = 0; level < num_levels_; level++) { + // E.g., + // --- level 1 --- + // 17:123['a' .. 'd'] + // 20:43['e' .. 'g'] + r.append("--- level "); + AppendNumberTo(&r, level); + r.append(" --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + const std::vector& files = files_[level]; + for (size_t i = 0; i < files.size(); i++) { + r.push_back(' '); + AppendNumberTo(&r, files[i]->number); + r.push_back(':'); + AppendNumberTo(&r, files[i]->file_size); + r.append("["); + r.append(files[i]->smallest.DebugString(hex)); + r.append(" .. "); + r.append(files[i]->largest.DebugString(hex)); + r.append("]\n"); + } + } + return r; +} + +// this is used to batch writes to the manifest file +struct VersionSet::ManifestWriter { + Status status; + bool done; + port::CondVar cv; + VersionEdit* edit; + + explicit ManifestWriter(port::Mutex* mu, VersionEdit* e) : + done(false), cv(mu), edit(e) {} +}; + +// A helper class so we can efficiently apply a whole sequence +// of edits to a particular state without creating intermediate +// Versions that contain full copies of the intermediate state. 
+class VersionSet::Builder { + private: + // Helper to sort v->files_ + // kLevel0LevelCompaction -- NewestFirst + // kLevel0UniversalCompaction -- NewestFirstBySeqNo + // kLevelNon0 -- BySmallestKey + struct FileComparator { + enum SortMethod { + kLevel0LevelCompaction = 0, + kLevel0UniversalCompaction = 1, + kLevelNon0 = 2, + } sort_method; + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + switch (sort_method) { + case kLevel0LevelCompaction: + return NewestFirst(f1, f2); + case kLevel0UniversalCompaction: + return NewestFirstBySeqNo(f1, f2); + case kLevelNon0: + return BySmallestKey(f1, f2, internal_comparator); + } + assert(false); + return false; + } + }; + + typedef std::set FileSet; + struct LevelState { + std::set deleted_files; + FileSet* added_files; + }; + + VersionSet* vset_; + Version* base_; + LevelState* levels_; + FileComparator level_zero_cmp_; + FileComparator level_nonzero_cmp_; + + public: + // Initialize a builder with the files from *base and other info from *vset + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { + base_->Ref(); + levels_ = new LevelState[base->NumberLevels()]; + level_zero_cmp_.sort_method = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? 
FileComparator::kLevel0UniversalCompaction + : FileComparator::kLevel0LevelCompaction; + level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; + level_nonzero_cmp_.internal_comparator = &vset_->icmp_; + + levels_[0].added_files = new FileSet(level_zero_cmp_); + for (int level = 1; level < base->NumberLevels(); level++) { + levels_[level].added_files = new FileSet(level_nonzero_cmp_); + } + } + + ~Builder() { + for (int level = 0; level < base_->NumberLevels(); level++) { + const FileSet* added = levels_[level].added_files; + std::vector to_unref; + to_unref.reserve(added->size()); + for (FileSet::const_iterator it = added->begin(); + it != added->end(); ++it) { + to_unref.push_back(*it); + } + delete added; + for (uint32_t i = 0; i < to_unref.size(); i++) { + FileMetaData* f = to_unref[i]; + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + vset_->table_cache_->ReleaseHandle( + f->table_reader_handle); + f->table_reader_handle = nullptr; + } + delete f; + } + } + } + + delete[] levels_; + base_->Unref(); + } + + void CheckConsistency(Version* v) { +#ifndef NDEBUG + // make sure the files are sorted correctly + for (int level = 0; level < v->NumberLevels(); level++) { + for (size_t i = 1; i < v->files_[level].size(); i++) { + auto f1 = v->files_[level][i - 1]; + auto f2 = v->files_[level][i]; + if (level == 0) { + assert(level_zero_cmp_(f1, f2)); + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + assert(f1->largest_seqno > f2->largest_seqno); + } + } else { + assert(level_nonzero_cmp_(f1, f2)); + + // Make sure there is no overlap in levels > 0 + if (vset_->icmp_.Compare(f1->largest, f2->smallest) >= 0) { + fprintf(stderr, "overlapping ranges in same level %s vs. 
%s\n", + (f1->largest).DebugString().c_str(), + (f2->smallest).DebugString().c_str()); + abort(); + } + } + } + } +#endif + } + + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { +#ifndef NDEBUG + // a file to be deleted better exist in the previous version + bool found = false; + for (int l = 0; !found && l < base_->NumberLevels(); l++) { + const std::vector& base_files = base_->files_[l]; + for (unsigned int i = 0; i < base_files.size(); i++) { + FileMetaData* f = base_files[i]; + if (f->number == number) { + found = true; + break; + } + } + } + // if the file did not exist in the previous version, then it + // is possibly moved from lower level to higher level in current + // version + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { + const FileSet* added = levels_[l].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->number == number) { + found = true; + break; + } + } + } + + // maybe this file was added in a previous edit that was Applied + if (!found) { + const FileSet* added = levels_[level].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->number == number) { + found = true; + break; + } + } + } + assert(found); +#endif + } + + // Apply all of the edits in *edit to the current state. 
+ void Apply(VersionEdit* edit) { + CheckConsistency(base_); + + // Delete files + const VersionEdit::DeletedFileSet& del = edit->deleted_files_; + for (const auto& del_file : del) { + const auto level = del_file.first; + const auto number = del_file.second; + levels_[level].deleted_files.insert(number); + CheckConsistencyForDeletes(edit, number, level); + } + + // Add new files + for (const auto& new_file : edit->new_files_) { + const int level = new_file.first; + FileMetaData* f = new FileMetaData(new_file.second); + f->refs = 1; + + // We arrange to automatically compact this file after + // a certain number of seeks. Let's assume: + // (1) One seek costs 10ms + // (2) Writing or reading 1MB costs 10ms (100MB/s) + // (3) A compaction of 1MB does 25MB of IO: + // 1MB read from this level + // 10-12MB read from next level (boundaries may be misaligned) + // 10-12MB written to next level + // This implies that 25 seeks cost the same as the compaction + // of 1MB of data. I.e., one seek costs approximately the + // same as the compaction of 40KB of data. We are a little + // conservative and allow approximately one seek for every 16KB + // of data before triggering a compaction. + f->allowed_seeks = (f->file_size / 16384); + if (f->allowed_seeks < 100) f->allowed_seeks = 100; + + levels_[level].deleted_files.erase(f->number); + levels_[level].added_files->insert(f); + } + } + + // Save the current state in *v. + void SaveTo(Version* v) { + CheckConsistency(base_); + CheckConsistency(v); + for (int level = 0; level < base_->NumberLevels(); level++) { + const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *v. 
+ const auto& base_files = base_->files_[level]; + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + const auto& added_files = *levels_[level].added_files; + v->files_[level].reserve(base_files.size() + added_files.size()); + + for (const auto& added : added_files) { + // Add all smaller files listed in base_ + for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); + base_iter != bpos; + ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + + MaybeAddFile(v, level, added); + } + + // Add remaining base files + for (; base_iter != base_end; ++base_iter) { + MaybeAddFile(v, level, *base_iter); + } + } + + CheckConsistency(v); + } + + void LoadTableHandlers() { + for (int level = 0; level < vset_->NumberLevels(); level++) { + for (auto& file_meta : *(levels_[level].added_files)) { + assert (!file_meta->table_reader_handle); + bool table_io; + vset_->table_cache_->FindTable(vset_->storage_options_, vset_->icmp_, + file_meta->number, file_meta->file_size, + &file_meta->table_reader_handle, + &table_io, false); + if (file_meta->table_reader_handle != nullptr) { + // Load table_reader + file_meta->table_reader = + vset_->table_cache_->GetTableReaderFromHandle( + file_meta->table_reader_handle); + } + } + } + } + + void MaybeAddFile(Version* v, int level, FileMetaData* f) { + if (levels_[level].deleted_files.count(f->number) > 0) { + // File is deleted: do nothing + } else { + auto* files = &v->files_[level]; + if (level > 0 && !files->empty()) { + // Must not overlap + assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest, + f->smallest) < 0); + } + f->refs++; + files->push_back(f); + } + } +}; + +VersionSet::VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, + TableCache* table_cache, + const InternalKeyComparator* cmp) + : env_(options->env), + dbname_(dbname), + options_(options), + table_cache_(table_cache), + icmp_(*cmp), + next_file_number_(2), + 
manifest_file_number_(0), // Filled by Recover() + pending_manifest_file_number_(0), + last_sequence_(0), + log_number_(0), + prev_log_number_(0), + num_levels_(options_->num_levels), + dummy_versions_(this), + current_(nullptr), + need_slowdown_for_num_level0_files_(false), + current_version_number_(0), + manifest_file_size_(0), + storage_options_(storage_options), + storage_options_compactions_(storage_options_) { + if (options_->compaction_style == kCompactionStyleUniversal) { + compaction_picker_.reset(new UniversalCompactionPicker(options_, &icmp_)); + } else { + compaction_picker_.reset(new LevelCompactionPicker(options_, &icmp_)); + } + AppendVersion(new Version(this, current_version_number_++)); +} + +VersionSet::~VersionSet() { + current_->Unref(); + assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty + for (auto file : obsolete_files_) { + delete file; + } + obsolete_files_.clear(); +} + +void VersionSet::AppendVersion(Version* v) { + // Make "v" current + assert(v->refs_ == 0); + assert(v != current_); + if (current_ != nullptr) { + assert(current_->refs_ > 0); + current_->Unref(); + } + current_ = v; + need_slowdown_for_num_level0_files_ = + (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && + v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); + v->Ref(); + + // Append to linked list + v->prev_ = dummy_versions_.prev_; + v->next_ = &dummy_versions_; + v->prev_->next_ = v; + v->next_->prev_ = v; +} + +Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, + Directory* db_directory, + bool new_descriptor_log) { + mu->AssertHeld(); + + // queue our request + ManifestWriter w(mu, edit); + manifest_writers_.push_back(&w); + while (!w.done && &w != manifest_writers_.front()) { + w.cv.Wait(); + } + if (w.done) { + return w.status; + } + + std::vector batch_edits; + Version* v = new Version(this, current_version_number_++); + Builder builder(this, current_); + + // process all requests in the 
queue + ManifestWriter* last_writer = &w; + assert(!manifest_writers_.empty()); + assert(manifest_writers_.front() == &w); + + uint64_t max_log_number_in_batch = 0; + for (const auto& writer : manifest_writers_) { + last_writer = writer; + LogAndApplyHelper(&builder, v, writer->edit, mu); + if (writer->edit->has_log_number_) { + // When batch commit of manifest writes, we could have multiple flush and + // compaction edits. A flush edit has a bigger log number than what + // VersionSet has while a compaction edit does not have a log number. + // In this case, we want to make sure the largest log number is updated + // to VersionSet + max_log_number_in_batch = + std::max(max_log_number_in_batch, writer->edit->log_number_); + } + batch_edits.push_back(writer->edit); + } + builder.SaveTo(v); + + // Initialize new descriptor log file if necessary by creating + // a temporary file that contains a snapshot of the current version. + uint64_t new_manifest_file_size = 0; + Status s; + + assert(pending_manifest_file_number_ == 0); + if (!descriptor_log_ || + manifest_file_size_ > options_->max_manifest_file_size) { + pending_manifest_file_number_ = NewFileNumber(); + batch_edits.back()->SetNextFile(next_file_number_); + new_descriptor_log = true; + } else { + pending_manifest_file_number_ = manifest_file_number_; + } + + // Unlock during expensive operations. New writes cannot get here + // because &w is ensuring that all new writes get queued. + { + // calculate the amount of data being compacted at every level + std::vector size_being_compacted(v->NumberLevels() - 1); + compaction_picker_->SizeBeingCompacted(size_being_compacted); + + mu->Unlock(); + + if (options_->max_open_files == -1) { + // unlimited table cache. Pre-load table handle now. + // Need to do it out of the mutex. 
+ builder.LoadTableHandlers(); + } + + // This is fine because everything inside of this block is serialized -- + // only one thread can be here at the same time + if (new_descriptor_log) { + unique_ptr descriptor_file; + s = env_->NewWritableFile( + DescriptorFileName(dbname_, pending_manifest_file_number_), + &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); + if (s.ok()) { + descriptor_file->SetPreallocationBlockSize( + options_->manifest_preallocation_size); + descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); + s = WriteSnapshot(descriptor_log_.get()); + } + } + + // The calls to ComputeCompactionScore and UpdateFilesBySize are cpu-heavy + // and is best called outside the mutex. + v->ComputeCompactionScore(size_being_compacted); + v->UpdateFilesBySize(); + + // Write new record to MANIFEST log + if (s.ok()) { + for (auto& e : batch_edits) { + std::string record; + e->EncodeTo(&record); + s = descriptor_log_->AddRecord(record); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + if (options_->use_fsync) { + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); + s = descriptor_log_->file()->Fsync(); + } else { + StopWatch sw(env_, options_->statistics.get(), + MANIFEST_FILE_SYNC_MICROS); + s = descriptor_log_->file()->Sync(); + } + } + if (!s.ok()) { + Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); + bool all_records_in = true; + for (auto& e : batch_edits) { + std::string record; + e->EncodeTo(&record); + if (!ManifestContains(pending_manifest_file_number_, record)) { + all_records_in = false; + break; + } + } + if (all_records_in) { + Log(options_->info_log, + "MANIFEST contains log record despite error; advancing to new " + "version to prevent mismatch between in-memory and logged state" + " If paranoid is set, then the db is now in readonly mode."); + s = Status::OK(); + } + } + } + + // If we just created a new descriptor file, install it by writing a + // new CURRENT file 
that points to it. + if (s.ok() && new_descriptor_log) { + s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_); + if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { + // delete old manifest file + Log(options_->info_log, + "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", + manifest_file_number_, pending_manifest_file_number_); + // we don't care about an error here, PurgeObsoleteFiles will take care + // of it later + env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_)); + } + if (!options_->disableDataSync && db_directory != nullptr) { + db_directory->Fsync(); + } + } + + if (s.ok()) { + // find offset in manifest file where this version is stored. + new_manifest_file_size = descriptor_log_->file()->GetFileSize(); + } + + LogFlush(options_->info_log); + mu->Lock(); + } + + // Install the new version + if (s.ok()) { + manifest_file_number_ = pending_manifest_file_number_; + manifest_file_size_ = new_manifest_file_size; + AppendVersion(v); + if (max_log_number_in_batch != 0) { + assert(log_number_ < max_log_number_in_batch); + log_number_ = max_log_number_in_batch; + } + prev_log_number_ = edit->prev_log_number_; + } else { + Log(options_->info_log, "Error in committing version %lu", + (unsigned long)v->GetVersionNumber()); + delete v; + if (new_descriptor_log) { + descriptor_log_.reset(); + env_->DeleteFile( + DescriptorFileName(dbname_, pending_manifest_file_number_)); + } + } + pending_manifest_file_number_ = 0; + + // wake up all the waiting writers + while (true) { + ManifestWriter* ready = manifest_writers_.front(); + manifest_writers_.pop_front(); + if (ready != &w) { + ready->status = s; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + // Notify new head of write queue + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } + return s; +} + +void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, + VersionEdit* edit, 
port::Mutex* mu) { + mu->AssertHeld(); + + if (edit->has_log_number_) { + assert(edit->log_number_ >= log_number_); + assert(edit->log_number_ < next_file_number_); + } + // If the edit does not have log number, it must be generated + // from a compaction + + if (!edit->has_prev_log_number_) { + edit->SetPrevLogNumber(prev_log_number_); + } + + edit->SetNextFile(next_file_number_); + edit->SetLastSequence(last_sequence_); + + builder->Apply(edit); +} + +Status VersionSet::Recover() { + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string manifest_filename; + Status s = ReadFileToString( + env_, CurrentFileName(dbname_), &manifest_filename + ); + if (!s.ok()) { + return s; + } + if (manifest_filename.empty() || + manifest_filename.back() != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + // remove the trailing '\n' + manifest_filename.resize(manifest_filename.size() - 1); + + Log(options_->info_log, "Recovering from manifest file:%s\n", + manifest_filename.c_str()); + + manifest_filename = dbname_ + "/" + manifest_filename; + unique_ptr manifest_file; + s = env_->NewSequentialFile( + manifest_filename, &manifest_file, storage_options_ + ); + if (!s.ok()) { + return s; + } + uint64_t manifest_file_size; + s = env_->GetFileSize(manifest_filename, &manifest_file_size); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; + Builder builder(this, current_); + + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, 
+ 0 /*initial_offset*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + if (edit.max_level_ >= current_->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + edit.comparator_); + break; + } + + builder.Apply(&edit); + + // Only a flush's edit or a new snapshot can write log number during + // LogAndApply. Since memtables are flushed and inserted into + // manifest_writers_ queue in order, the log number in MANIFEST file + // should be monotonically increasing. + if (edit.has_log_number_) { + if (have_log_number && log_number >= edit.log_number_) { + Log(options_->info_log, + "decreasing of log_number is detected " + "in MANIFEST\n"); + } else { + log_number = edit.log_number_; + have_log_number = true; + } + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!have_prev_log_number) { + prev_log_number = 0; + } + + MarkFileNumberUsed(prev_log_number); + MarkFileNumberUsed(log_number); + } + + if (s.ok()) { + if (options_->max_open_files == -1) { + // unlimited table cache. 
Pre-load table handle now. + // Need to do it out of the mutex. + builder.LoadTableHandlers(); + } + + Version* v = new Version(this, current_version_number_++); + builder.SaveTo(v); + + // Install recovered version + std::vector size_being_compacted(v->NumberLevels() - 1); + compaction_picker_->SizeBeingCompacted(size_being_compacted); + v->ComputeCompactionScore(size_being_compacted); + + manifest_file_size_ = manifest_file_size; + AppendVersion(v); + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; + + Log(options_->info_log, "Recovered from manifest file:%s succeeded," + "manifest_file_number is %lu, next_file_number is %lu, " + "last_sequence is %lu, log_number is %lu," + "prev_log_number is %lu\n", + manifest_filename.c_str(), + (unsigned long)manifest_file_number_, + (unsigned long)next_file_number_, + (unsigned long)last_sequence_, + (unsigned long)log_number_, + (unsigned long)prev_log_number_); + } + + return s; +} + +Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const EnvOptions& storage_options, + int new_levels) { + if (new_levels <= 1) { + return Status::InvalidArgument( + "Number of levels needs to be bigger than 1"); + } + + const InternalKeyComparator cmp(options->comparator); + TableCache tc(dbname, options, storage_options, 10); + VersionSet versions(dbname, options, storage_options, &tc, &cmp); + Status status; + + status = versions.Recover(); + if (!status.ok()) { + return status; + } + + Version* current_version = versions.current(); + int current_levels = current_version->NumberLevels(); + + if (current_levels <= new_levels) { + return Status::OK(); + } + + // Make sure there are file only on one level from + // (new_levels-1) to (current_levels-1) + int first_nonempty_level = -1; + int first_nonempty_level_filenum = 0; + for (int i = new_levels - 1; i < current_levels; 
i++) { + int file_num = current_version->NumLevelFiles(i); + if (file_num != 0) { + if (first_nonempty_level < 0) { + first_nonempty_level = i; + first_nonempty_level_filenum = file_num; + } else { + char msg[255]; + snprintf(msg, sizeof(msg), + "Found at least two levels containing files: " + "[%d:%d],[%d:%d].\n", + first_nonempty_level, first_nonempty_level_filenum, i, + file_num); + return Status::InvalidArgument(msg); + } + } + } + + std::vector* old_files_list = current_version->files_; + // we need to allocate an array with the old number of levels size to + // avoid SIGSEGV in WriteSnapshot() + // however, all levels bigger or equal to new_levels will be empty + std::vector* new_files_list = + new std::vector[current_levels]; + for (int i = 0; i < new_levels - 1; i++) { + new_files_list[i] = old_files_list[i]; + } + + if (first_nonempty_level > 0) { + new_files_list[new_levels - 1] = old_files_list[first_nonempty_level]; + } + + delete[] current_version->files_; + current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; + + VersionEdit ve; + port::Mutex dummy_mutex; + MutexLock l(&dummy_mutex); + return versions.LogAndApply(&ve, &dummy_mutex, nullptr, true); +} + +Status VersionSet::DumpManifest(Options& options, std::string& dscname, + bool verbose, bool hex) { + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + + // Open the specified manifest file. 
+ unique_ptr file; + Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); + if (!s.ok()) { + return s; + } + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; + int count = 0; + VersionSet::Builder builder(this, current_); + + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(std::move(file), &reporter, true/*checksum*/, + 0/*initial_offset*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (s.ok()) { + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument(icmp_.user_comparator()->Name(), + "does not match existing comparator " + + edit.comparator_); + } + } + + // Write out each individual edit + if (verbose) { + printf("*************************Edit[%d] = %s\n", + count, edit.DebugString(hex).c_str()); + } + count++; + + if (s.ok()) { + builder.Apply(&edit); + } + + if (edit.has_log_number_) { + log_number = edit.log_number_; + have_log_number = true; + } + + if (edit.has_prev_log_number_) { + prev_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + } + } + file.reset(); + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + printf("no meta-nextfile entry in descriptor"); + } else if (!have_log_number) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + printf("no meta-lognumber entry in descriptor"); + } else if (!have_last_sequence) { + printf("no 
last-sequence-number entry in descriptor"); + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!have_prev_log_number) { + prev_log_number = 0; + } + + MarkFileNumberUsed(prev_log_number); + MarkFileNumberUsed(log_number); + } + + if (s.ok()) { + Version* v = new Version(this, 0); + builder.SaveTo(v); + + // Install recovered version + std::vector size_being_compacted(v->NumberLevels() - 1); + compaction_picker_->SizeBeingCompacted(size_being_compacted); + v->ComputeCompactionScore(size_being_compacted); + + AppendVersion(v); + manifest_file_number_ = next_file; + next_file_number_ = next_file + 1; + last_sequence_ = last_sequence; + log_number_ = log_number; + prev_log_number_ = prev_log_number; + + printf("manifest_file_number %lu next_file_number %lu last_sequence " + "%lu log_number %lu prev_log_number %lu\n", + (unsigned long)manifest_file_number_, + (unsigned long)next_file_number_, + (unsigned long)last_sequence, + (unsigned long)log_number, + (unsigned long)prev_log_number); + printf("%s \n", v->DebugString(hex).c_str()); + } + + return s; +} + +void VersionSet::MarkFileNumberUsed(uint64_t number) { + if (next_file_number_ <= number) { + next_file_number_ = number + 1; + } +} + +Status VersionSet::WriteSnapshot(log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? 
+ + // Save metadata + VersionEdit edit; + edit.SetComparatorName(icmp_.user_comparator()->Name()); + + // Save files + for (int level = 0; level < current_->NumberLevels(); level++) { + const auto& files = current_->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + const auto f = files[i]; + edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno); + } + } + edit.SetLogNumber(log_number_); + + std::string record; + edit.EncodeTo(&record); + return log->AddRecord(record); +} + +// Opens the mainfest file and reads all records +// till it finds the record we are looking for. +bool VersionSet::ManifestContains(uint64_t manifest_file_number, + const std::string& record) const { + std::string fname = + DescriptorFileName(dbname_, manifest_file_number); + Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + unique_ptr file; + Status s = env_->NewSequentialFile(fname, &file, storage_options_); + if (!s.ok()) { + Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); + Log(options_->info_log, + "ManifestContains: is unable to reopen the manifest file %s", + fname.c_str()); + return false; + } + log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0); + Slice r; + std::string scratch; + bool result = false; + while (reader.ReadRecord(&r, &scratch)) { + if (r == Slice(record)) { + result = true; + break; + } + } + Log(options_->info_log, "ManifestContains: result = %d\n", result ? 
1 : 0); + return result; +} + + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { + uint64_t result = 0; + for (int level = 0; level < v->NumberLevels(); level++) { + const std::vector& files = v->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + if (icmp_.Compare(files[i]->largest, ikey) <= 0) { + // Entire file is before "ikey", so just add the file size + result += files[i]->file_size; + } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { + // Entire file is after "ikey", so ignore + if (level > 0) { + // Files other than level 0 are sorted by meta->smallest, so + // no further files in this level will contain data for + // "ikey". + break; + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + TableReader* table_reader_ptr; + Iterator* iter = + table_cache_->NewIterator(ReadOptions(), storage_options_, icmp_, + *(files[i]), &table_reader_ptr); + if (table_reader_ptr != nullptr) { + result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); + } + delete iter; + } + } + } + return result; +} + +void VersionSet::AddLiveFiles(std::vector* live_list) { + // pre-calculate space requirement + int64_t total_files = 0; + for (Version* v = dummy_versions_.next_; + v != &dummy_versions_; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + total_files += v->files_[level].size(); + } + } + + // just one time extension to the right size + live_list->reserve(live_list->size() + total_files); + + for (Version* v = dummy_versions_.next_; + v != &dummy_versions_; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + for (const auto& f : v->files_[level]) { + live_list->push_back(f->number); + } + } + } +} + +Compaction* VersionSet::PickCompaction(LogBuffer* log_buffer) { + return compaction_picker_->PickCompaction(current_, log_buffer); +} + +Compaction* VersionSet::CompactRange(int input_level, int 
output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + return compaction_picker_->CompactRange(current_, input_level, output_level, + begin, end, compaction_end); +} + +Iterator* VersionSet::MakeInputIterator(Compaction* c) { + ReadOptions options; + options.verify_checksums = options_->verify_checksums_in_compaction; + options.fill_cache = false; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const int space = (c->level() == 0 ? c->inputs(0)->size() + 1 : 2); + Iterator** list = new Iterator*[space]; + int num = 0; + for (int which = 0; which < 2; which++) { + if (!c->inputs(which)->empty()) { + if (c->level() + which == 0) { + for (const auto& file : *c->inputs(which)) { + list[num++] = table_cache_->NewIterator( + options, storage_options_compactions_, icmp_, *file, nullptr, + true /* for compaction */); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = NewTwoLevelIterator( + new Version::LevelFileNumIterator(icmp_, c->inputs(which)), + &GetFileIterator, table_cache_, options, storage_options_, icmp_, + true /* for compaction */); + } + } + } + assert(num <= space); + Iterator* result = NewMergingIterator(env_, &icmp_, list, num); + delete[] list; + return result; +} + +double VersionSet::MaxBytesForLevel(int level) { + return compaction_picker_->MaxBytesForLevel(level); +} + +uint64_t VersionSet::MaxFileSizeForLevel(int level) { + return compaction_picker_->MaxFileSizeForLevel(level); +} + +// verify that the files listed in this compaction are present +// in the current version +bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { +#ifndef NDEBUG + if (c->input_version() != current_) { + Log(options_->info_log, "VerifyCompactionFileConsistency version mismatch"); + } + + // verify files in level + int 
level = c->level(); + for (int i = 0; i < c->num_input_files(0); i++) { + uint64_t number = c->input(0,i)->number; + + // look for this file in the current version + bool found = false; + for (unsigned int j = 0; j < current_->files_[level].size(); j++) { + FileMetaData* f = current_->files_[level][j]; + if (f->number == number) { + found = true; + break; + } + } + if (!found) { + return false; // input files non existant in current version + } + } + // verify level+1 files + level++; + for (int i = 0; i < c->num_input_files(1); i++) { + uint64_t number = c->input(1,i)->number; + + // look for this file in the current version + bool found = false; + for (unsigned int j = 0; j < current_->files_[level].size(); j++) { + FileMetaData* f = current_->files_[level][j]; + if (f->number == number) { + found = true; + break; + } + } + if (!found) { + return false; // input files non existant in current version + } + } +#endif + return true; // everything good +} + +void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { + compaction_picker_->ReleaseCompactionFiles(c, status); +} + +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** meta) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = current_->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + if (files[i]->number == number) { + *meta = files[i]; + *filelevel = level; + return Status::OK(); + } + } + } + return Status::NotFound("File not present in any level"); +} + +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = current_->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + LiveFileMetaData filemetadata; + filemetadata.name = TableFileName("", files[i]->number); + filemetadata.level = level; + filemetadata.size = files[i]->file_size; + filemetadata.smallestkey = files[i]->smallest.user_key().ToString(); + 
filemetadata.largestkey = files[i]->largest.user_key().ToString(); + filemetadata.smallest_seqno = files[i]->smallest_seqno; + filemetadata.largest_seqno = files[i]->largest_seqno; + metadata->push_back(filemetadata); + } + } +} + +void VersionSet::GetObsoleteFiles(std::vector* files) { + files->insert(files->end(), + obsolete_files_.begin(), + obsolete_files_.end()); + obsolete_files_.clear(); +} + +} // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h new file mode 100644 index 00000000..38b4bada --- /dev/null +++ b/db/version_set.h @@ -0,0 +1,505 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The representation of a DBImpl consists of a set of Versions. The +// newest version is called "current". Older versions may be kept +// around to provide a consistent view to live iterators. +// +// Each Version keeps track of a set of Table files per level. The +// entire set of versions is maintained in a VersionSet. +// +// Version,VersionSet are thread-compatible, but require external +// synchronization on all accesses. 
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/port.h" +#include "db/table_cache.h" +#include "db/compaction.h" +#include "db/compaction_picker.h" + +namespace rocksdb { + +namespace log { class Writer; } + +class Compaction; +class CompactionPicker; +class Iterator; +class LogBuffer; +class LookupKey; +class MemTable; +class MergeContext; +class TableCache; +class Version; +class VersionSet; + +// Return the smallest index i such that files[i]->largest >= key. +// Return files.size() if there is no such file. +// REQUIRES: "files" contains a sorted list of non-overlapping files. +extern int FindFile(const InternalKeyComparator& icmp, + const std::vector& files, + const Slice& key); + +// Returns true iff some file in "files" overlaps the user key range +// [*smallest,*largest]. +// smallest==nullptr represents a key smaller than all keys in the DB. +// largest==nullptr represents a key largest than all keys in the DB. +// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges +// in sorted order. +extern bool SomeFileOverlapsRange( + const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const std::vector& files, + const Slice* smallest_user_key, + const Slice* largest_user_key); + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, const EnvOptions& soptions, + std::vector* iters); + + // Lookup the value for key. If found, store it in *val and + // return OK. Else return a non-OK status. Fills *stats. 
+ // Uses *operands to store merge_operator operations to apply later + // REQUIRES: lock is not held + struct GetStats { + FileMetaData* seek_file; + int seek_file_level; + }; + void Get(const ReadOptions&, const LookupKey& key, std::string* val, + Status* status, MergeContext* merge_context, GetStats* stats, + bool* value_found = nullptr); + + // Adds "stats" into the current state. Returns true if a new + // compaction may need to be triggered, false otherwise. + // REQUIRES: lock is held + bool UpdateStats(const GetStats& stats); + + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // REQUIRES: If Version is not yet saved to current_, it can be called without + // a lock. Once a version is saved to current_, call only with mutex held + void ComputeCompactionScore(std::vector& size_being_compacted); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + // Decrease reference count. Delete the object if no reference left + // and return true. Otherwise, return false. + bool Unref(); + + // Returns true iff some level needs a compaction. 
+ bool NeedsCompaction() const; + + // Returns the maxmimum compaction score for levels 1 to max + double MaxCompactionScore() const { return max_compaction_score_; } + + // See field declaration + int MaxCompactionScoreLevel() const { return max_compaction_score_level_; } + + void GetOverlappingInputs( + int level, + const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index = -1, // index of overlap file + int* file_index = nullptr); // return index of overlap file + + void GetOverlappingInputsBinarySearch( + int level, + const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys + std::vector* inputs, + int hint_index, // index of overlap file + int* file_index); // return index of overlap file + + void ExtendOverlappingInputs( + int level, + const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys + std::vector* inputs, + unsigned int index); // start extending from this index + + // Returns true iff some file in the specified level overlaps + // some part of [*smallest_user_key,*largest_user_key]. + // smallest_user_key==NULL represents a key smaller than all keys in the DB. + // largest_user_key==NULL represents a key largest than all keys in the DB. + bool OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key); + + // Returns true iff the first or last file in inputs contains + // an overlapping user key to the file "just outside" of it (i.e. + // just after the last file, or just before the first file) + // REQUIRES: "*inputs" is a sorted list of non-overlapping files + bool HasOverlappingUserKey(const std::vector* inputs, + int level); + + + // Return the level at which we should place a new memtable compaction + // result that covers the range [smallest_user_key,largest_user_key]. 
+ int PickLevelForMemTableOutput(const Slice& smallest_user_key, + const Slice& largest_user_key); + + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. + const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::set* live); + + // Return a human readable string that describes this version's contents. + std::string DebugString(bool hex = false) const; + + // Returns the version nuber of this version + uint64_t GetVersionNumber() const { return version_number_; } + + // REQUIRES: lock is held + // On success, *props will be populated with all SSTables' table properties. + // The keys of `props` are the sst file name, the values of `props` are the + // tables' propertis, represented as shared_ptr. 
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props); + + // used to sort files by size + struct Fsize { + int index; + FileMetaData* file; + }; + + private: + friend class Compaction; + friend class VersionSet; + friend class DBImpl; + friend class CompactionPicker; + friend class LevelCompactionPicker; + friend class UniversalCompactionPicker; + + class LevelFileNumIterator; + Iterator* NewConcatenatingIterator(const ReadOptions&, + const EnvOptions& soptions, + int level) const; + bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, + const Slice& internal_prefix, Iterator* level_iter) const; + + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. + void UpdateFilesBySize(); + + VersionSet* vset_; // VersionSet to which this Version belongs + const InternalKeyComparator* internal_comparator_; + const Comparator* user_comparator_; + TableCache* table_cache_; + const MergeOperator* merge_operator_; + Logger* info_log_; + Statistics* db_statistics_; + Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list + int refs_; // Number of live refs to this version + int num_levels_; // Number of levels + + // List of files per level, files in each level are arranged + // in increasing order of keys + std::vector* files_; + + // A list for the same set of files that are stored in files_, + // but files in each level are now sorted based on file + // size. The file with the largest size is at the front. + // This vector stores the index of the file from files_. + std::vector> files_by_size_; + + // An index into files_by_size_ that specifies the first + // file that is not yet compacted + std::vector next_file_to_compact_by_size_; + + // Only the first few entries of files_by_size_ are sorted. 
+ // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const int number_of_files_to_sort_ = 50; + + // Next file to compact based on seek stats. + FileMetaData* file_to_compact_; + int file_to_compact_level_; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). + // The most critical level to be compacted is listed first + // These are used to pick the best compaction level + std::vector compaction_score_; + std::vector compaction_level_; + double max_compaction_score_; // max score in l1 to ln-1 + int max_compaction_score_level_; // level on which max score occurs + + // A version number that uniquely represents this version. This is + // used for debugging and logging purposes only. + uint64_t version_number_; + + explicit Version(VersionSet* vset, uint64_t version_number = 0); + + ~Version(); + + // re-initializes the index that is used to offset into files_by_size_ + // to find the next compaction candidate file. + void ResetNextCompactionIndex(int level) { + next_file_to_compact_by_size_[level] = 0; + } + + // No copying allowed + Version(const Version&); + void operator=(const Version&); +}; + +class VersionSet { + public: + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, + const InternalKeyComparator*); + ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Will release *mu while actually writing to the file. + // REQUIRES: *mu is held on entry. 
+ // REQUIRES: no other thread concurrently calls LogAndApply() + Status LogAndApply(VersionEdit* edit, port::Mutex* mu, + Directory* db_directory = nullptr, + bool new_descriptor_log = false); + + // Recover the last saved descriptor from persistent storage. + Status Recover(); + + // Try to reduce the number of levels. This call is valid when + // only one level from the new max level to the old + // max level containing files. + // The call is static, since number of levels is immutable during + // the lifetime of a RocksDB instance. It reduces number of levels + // in a DB by applying changes to manifest. + // For example, a db currently has 7 levels [0-6], and a call to + // to reduce to 5 [0-4] can only be executed when only one level + // among [4-6] contains files. + static Status ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const EnvOptions& storage_options, + int new_levels); + + // Return the current version. + Version* current() const { return current_; } + + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files_; + } + + // Return the current manifest file number + uint64_t ManifestFileNumber() const { return manifest_file_number_; } + + uint64_t PendingManifestFileNumber() const { + return pending_manifest_file_number_; + } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_++; } + + // Arrange to reuse "file_number" unless a newer file number has + // already been allocated. + // REQUIRES: "file_number" was returned by a call to NewFileNumber(). + void ReuseFileNumber(uint64_t file_number) { + if (next_file_number_ == file_number + 1) { + next_file_number_ = file_number; + } + } + + // Return the last sequence number. 
+ uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + last_sequence_.store(s, std::memory_order_release); + } + + // Mark the specified file number as used. + void MarkFileNumberUsed(uint64_t number); + + // Return the current log file number. + uint64_t LogNumber() const { return log_number_; } + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t PrevLogNumber() const { return prev_log_number_; } + + int NumberLevels() const { return num_levels_; } + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + Compaction* PickCompaction(LogBuffer* log_buffer); + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns nullptr if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. 
+ Iterator* MakeInputIterator(Compaction* c); + + // Add all files listed in any live version to *live. + void AddLiveFiles(std::vector* live_list); + + // Return the approximate offset in the database of the data for + // "key" as of version "v". + uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); + + // printf contents (for debugging) + Status DumpManifest(Options& options, std::string& manifestFileName, + bool verbose, bool hex = false); + + // Return the size of the current manifest file + uint64_t ManifestFileSize() const { return manifest_file_size_; } + + // verify that the files that we started with for a compaction + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact. + bool VerifyCompactionFileConsistency(Compaction* c); + + double MaxBytesForLevel(int level); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level); + + void ReleaseCompactionFiles(Compaction* c, Status status); + + Status GetMetadataForFile( + uint64_t number, int *filelevel, FileMetaData **metadata); + + void GetLiveFilesMetaData( + std::vector *metadata); + + void GetObsoleteFiles(std::vector* files); + + private: + class Builder; + struct ManifestWriter; + + friend class Compaction; + friend class Version; + + // Save current contents to *log + Status WriteSnapshot(log::Writer* log); + + void AppendVersion(Version* v); + + bool ManifestContains(uint64_t manifest_file_number, + const std::string& record) const; + + Env* const env_; + const std::string dbname_; + const Options* const options_; + TableCache* const table_cache_; + const InternalKeyComparator icmp_; + uint64_t next_file_number_; + uint64_t manifest_file_number_; + uint64_t pending_manifest_file_number_; + std::atomic last_sequence_; + uint64_t log_number_; + uint64_t prev_log_number_; // 0 or backing store for memtable being compacted + + int num_levels_; 
+ + // Opened lazily + unique_ptr descriptor_log_; + Version dummy_versions_; // Head of circular doubly-linked list of versions. + Version* current_; // == dummy_versions_.prev_ + + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; + + // An object that keeps all the compaction stats + // and picks the next compaction + std::unique_ptr compaction_picker_; + + // generates a increasing version number for every new version + uint64_t current_version_number_; + + // Queue of writers to the manifest file + std::deque manifest_writers_; + + // Current size of manifest file + uint64_t manifest_file_size_; + + std::vector obsolete_files_; + + // storage options for all reads and writes except compactions + const EnvOptions& storage_options_; + + // storage options used for compactions. This is a copy of + // storage_options_ but with readaheads set to readahead_compactions_. + const EnvOptions storage_options_compactions_; + + // No copying allowed + VersionSet(const VersionSet&); + void operator=(const VersionSet&); + + void LogAndApplyHelper(Builder*b, Version* v, + VersionEdit* edit, port::Mutex* mu); +}; + +} // namespace rocksdb diff --git a/db/version_set_test.cc b/db/version_set_test.cc new file mode 100644 index 00000000..1af95dd3 --- /dev/null +++ b/db/version_set_test.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class FindFileTest { + public: + std::vector files_; + bool disjoint_sorted_files_; + + FindFileTest() : disjoint_sorted_files_(true) { } + + ~FindFileTest() { + for (unsigned int i = 0; i < files_.size(); i++) { + delete files_[i]; + } + } + + void Add(const char* smallest, const char* largest, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + FileMetaData* f = new FileMetaData; + f->number = files_.size() + 1; + f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); + f->largest = InternalKey(largest, largest_seq, kTypeValue); + files_.push_back(f); + } + + int Find(const char* key) { + InternalKey target(key, 100, kTypeValue); + InternalKeyComparator cmp(BytewiseComparator()); + return FindFile(cmp, files_, target.Encode()); + } + + bool Overlaps(const char* smallest, const char* largest) { + InternalKeyComparator cmp(BytewiseComparator()); + Slice s(smallest != nullptr ? smallest : ""); + Slice l(largest != nullptr ? largest : ""); + return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_, + (smallest != nullptr ? &s : nullptr), + (largest != nullptr ? &l : nullptr)); + } +}; + +TEST(FindFileTest, Empty) { + ASSERT_EQ(0, Find("foo")); + ASSERT_TRUE(! Overlaps("a", "z")); + ASSERT_TRUE(! Overlaps(nullptr, "z")); + ASSERT_TRUE(! Overlaps("a", nullptr)); + ASSERT_TRUE(! Overlaps(nullptr, nullptr)); +} + +TEST(FindFileTest, Single) { + Add("p", "q"); + ASSERT_EQ(0, Find("a")); + ASSERT_EQ(0, Find("p")); + ASSERT_EQ(0, Find("p1")); + ASSERT_EQ(0, Find("q")); + ASSERT_EQ(1, Find("q1")); + ASSERT_EQ(1, Find("z")); + + ASSERT_TRUE(! Overlaps("a", "b")); + ASSERT_TRUE(! 
Overlaps("z1", "z2")); + ASSERT_TRUE(Overlaps("a", "p")); + ASSERT_TRUE(Overlaps("a", "q")); + ASSERT_TRUE(Overlaps("a", "z")); + ASSERT_TRUE(Overlaps("p", "p1")); + ASSERT_TRUE(Overlaps("p", "q")); + ASSERT_TRUE(Overlaps("p", "z")); + ASSERT_TRUE(Overlaps("p1", "p2")); + ASSERT_TRUE(Overlaps("p1", "z")); + ASSERT_TRUE(Overlaps("q", "q")); + ASSERT_TRUE(Overlaps("q", "q1")); + + ASSERT_TRUE(! Overlaps(nullptr, "j")); + ASSERT_TRUE(! Overlaps("r", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "p")); + ASSERT_TRUE(Overlaps(nullptr, "p1")); + ASSERT_TRUE(Overlaps("q", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); +} + + +TEST(FindFileTest, Multiple) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_EQ(0, Find("100")); + ASSERT_EQ(0, Find("150")); + ASSERT_EQ(0, Find("151")); + ASSERT_EQ(0, Find("199")); + ASSERT_EQ(0, Find("200")); + ASSERT_EQ(1, Find("201")); + ASSERT_EQ(1, Find("249")); + ASSERT_EQ(1, Find("250")); + ASSERT_EQ(2, Find("251")); + ASSERT_EQ(2, Find("299")); + ASSERT_EQ(2, Find("300")); + ASSERT_EQ(2, Find("349")); + ASSERT_EQ(2, Find("350")); + ASSERT_EQ(3, Find("351")); + ASSERT_EQ(3, Find("400")); + ASSERT_EQ(3, Find("450")); + ASSERT_EQ(4, Find("451")); + + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! Overlaps("251", "299")); + ASSERT_TRUE(! Overlaps("451", "500")); + ASSERT_TRUE(! Overlaps("351", "399")); + + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); +} + +TEST(FindFileTest, MultipleNullBoundaries) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_TRUE(! Overlaps(nullptr, "149")); + ASSERT_TRUE(! 
Overlaps("451", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "150")); + ASSERT_TRUE(Overlaps(nullptr, "199")); + ASSERT_TRUE(Overlaps(nullptr, "200")); + ASSERT_TRUE(Overlaps(nullptr, "201")); + ASSERT_TRUE(Overlaps(nullptr, "400")); + ASSERT_TRUE(Overlaps(nullptr, "800")); + ASSERT_TRUE(Overlaps("100", nullptr)); + ASSERT_TRUE(Overlaps("200", nullptr)); + ASSERT_TRUE(Overlaps("449", nullptr)); + ASSERT_TRUE(Overlaps("450", nullptr)); +} + +TEST(FindFileTest, OverlapSequenceChecks) { + Add("200", "200", 5000, 3000); + ASSERT_TRUE(! Overlaps("199", "199")); + ASSERT_TRUE(! Overlaps("201", "300")); + ASSERT_TRUE(Overlaps("200", "200")); + ASSERT_TRUE(Overlaps("190", "200")); + ASSERT_TRUE(Overlaps("200", "210")); +} + +TEST(FindFileTest, OverlappingFiles) { + Add("150", "600"); + Add("400", "500"); + disjoint_sorted_files_ = false; + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! Overlaps("601", "700")); + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); + ASSERT_TRUE(Overlaps("450", "700")); + ASSERT_TRUE(Overlaps("600", "700")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/write_batch.cc b/db/write_batch.cc new file mode 100644 index 00000000..352f5789 --- /dev/null +++ b/db/write_batch.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch::rep_ := +// sequence: fixed64 +// count: fixed32 +// data: record[count] +// record := +// kTypeValue varstring varstring +// kTypeMerge varstring varstring +// kTypeDeletion varstring +// varstring := +// len: varint32 +// data: uint8[len] + +#include "rocksdb/write_batch.h" +#include "rocksdb/options.h" +#include "rocksdb/merge_operator.h" +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/memtable.h" +#include "db/snapshot.h" +#include "db/write_batch_internal.h" +#include "util/coding.h" +#include "util/statistics.h" +#include + +namespace rocksdb { + +// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. +static const size_t kHeader = 12; + +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); + Clear(); +} + +WriteBatch::~WriteBatch() { } + +WriteBatch::Handler::~Handler() { } + +void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { + throw std::runtime_error("Handler::Merge not implemented!"); +} + +void WriteBatch::Handler::LogData(const Slice& blob) { + // If the user has not specified something to do with blobs, then we ignore + // them. 
+} + +bool WriteBatch::Handler::Continue() { + return true; +} + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(kHeader); +} + +int WriteBatch::Count() const { + return WriteBatchInternal::Count(this); +} + +Status WriteBatch::Iterate(Handler* handler) const { + Slice input(rep_); + if (input.size() < kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + input.remove_prefix(kHeader); + Slice key, value, blob; + int found = 0; + while (!input.empty() && handler->Continue()) { + char tag = input[0]; + input.remove_prefix(1); + switch (tag) { + case kTypeValue: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Put(key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeDeletion: + if (GetLengthPrefixedSlice(&input, &key)) { + handler->Delete(key); + found++; + } else { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeMerge: + if (GetLengthPrefixedSlice(&input, &key) && + GetLengthPrefixedSlice(&input, &value)) { + handler->Merge(key, value); + found++; + } else { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeLogData: + if (GetLengthPrefixedSlice(&input, &blob)) { + handler->LogData(blob); + } else { + return Status::Corruption("bad WriteBatch Blob"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (found != WriteBatchInternal::Count(this)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +int WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, int n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, 
SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +void WriteBatch::Put(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeValue)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatch::Put(const SliceParts& key, const SliceParts& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeValue)); + PutLengthPrefixedSliceParts(&rep_, key); + PutLengthPrefixedSliceParts(&rep_, value); +} + +void WriteBatch::Delete(const Slice& key) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeDeletion)); + PutLengthPrefixedSlice(&rep_, key); +} + +void WriteBatch::Merge(const Slice& key, const Slice& value) { + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); + rep_.push_back(static_cast(kTypeMerge)); + PutLengthPrefixedSlice(&rep_, key); + PutLengthPrefixedSlice(&rep_, value); +} + +void WriteBatch::PutLogData(const Slice& blob) { + rep_.push_back(static_cast(kTypeLogData)); + PutLengthPrefixedSlice(&rep_, blob); +} + +namespace { +class MemTableInserter : public WriteBatch::Handler { + public: + SequenceNumber sequence_; + MemTable* mem_; + const Options* options_; + DBImpl* db_; + const bool filter_deletes_; + + MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts, + DB* db, const bool filter_deletes) + : sequence_(sequence), + mem_(mem), + options_(opts), + db_(reinterpret_cast(db)), + filter_deletes_(filter_deletes) { + assert(mem_); + if (filter_deletes_) { + assert(options_); + assert(db_); + } + } + + virtual void Put(const Slice& key, const Slice& value) { + if (!options_->inplace_update_support) { + mem_->Add(sequence_, kTypeValue, key, value); + } else if (options_->inplace_callback == nullptr) { + mem_->Update(sequence_, key, value); + 
RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); + } else { + if (mem_->UpdateCallback(sequence_, key, value, *options_)) { + } else { + // key not found in memtable. Do sst get, update, add + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + + std::string prev_value; + std::string merged_value; + Status s = db_->Get(ropts, key, &prev_value); + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = prev_value.size(); + auto status = + options_->inplace_callback(s.ok() ? prev_buffer: nullptr, + s.ok() ? &prev_size: nullptr, + value, &merged_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // prev_value is updated in-place with final value. + mem_->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); + RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); + } else if (status == UpdateStatus::UPDATED) { + // merged_value contains the final value. + mem_->Add(sequence_, kTypeValue, key, Slice(merged_value)); + RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); + } + } + } + // Since all Puts are logged in trasaction logs (if enabled), always bump + // sequence number. Even if the update eventually fails and does not result + // in memtable add/update. + sequence_++; + } + + virtual void Merge(const Slice& key, const Slice& value) { + bool perform_merge = false; + + if (options_->max_successive_merges > 0 && db_ != nullptr) { + LookupKey lkey(key, sequence_); + + // Count the number of successive merges at the head + // of the key in the memtable + size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); + + if (num_merges >= options_->max_successive_merges) { + perform_merge = true; + } + } + + if (perform_merge) { + // 1) Get the existing value + std::string get_value; + + // Pass in the sequence number so that we also include previous merge + // operations in the same batch. 
+ SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions read_options; + read_options.snapshot = &read_from_snapshot; + + db_->Get(read_options, key, &get_value); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = options_->merge_operator.get(); + assert(merge_operator); + + std::deque operands; + operands.push_front(value.ToString()); + std::string new_value; + if (!merge_operator->FullMerge(key, + &get_value_slice, + operands, + &new_value, + options_->info_log.get())) { + // Failed to merge! + RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); + + // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + mem_->Add(sequence_, kTypeValue, key, new_value); + } + } + + if (!perform_merge) { + // Add merge operator to memtable + mem_->Add(sequence_, kTypeMerge, key, value); + } + + sequence_++; + } + virtual void Delete(const Slice& key) { + if (filter_deletes_) { + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + std::string value; + if (!db_->KeyMayExist(ropts, key, &value)) { + RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES); + return; + } + } + mem_->Add(sequence_, kTypeDeletion, key, Slice()); + sequence_++; + } +}; +} // namespace + +Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem, + const Options* opts, DB* db, + const bool filter_deletes) { + MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db, + filter_deletes); + return b->Iterate(&inserter); +} + +void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { + assert(contents.size() >= kHeader); + b->rep_.assign(contents.data(), contents.size()); +} + +void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { + SetCount(dst, Count(dst) + Count(src)); + assert(src->rep_.size() >= kHeader); + 
dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); +} + +} // namespace rocksdb diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h new file mode 100644 index 00000000..b8991732 --- /dev/null +++ b/db/write_batch_internal.h @@ -0,0 +1,57 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +class MemTable; + +// WriteBatchInternal provides static methods for manipulating a +// WriteBatch that we don't want in the public WriteBatch interface. +class WriteBatchInternal { + public: + // Return the number of entries in the batch. + static int Count(const WriteBatch* batch); + + // Set the count for the number of entries in the batch. + static void SetCount(WriteBatch* batch, int n); + + // Return the seqeunce number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the seqeunce number for the start of + // this batch. 
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static void SetContents(WriteBatch* batch, const Slice& contents); + + // Inserts batch entries into memtable + // Drops deletes in batch if filter_del is set to true and + // db->KeyMayExist returns false + static Status InsertInto(const WriteBatch* batch, MemTable* memtable, + const Options* opts, DB* db = nullptr, + const bool filter_del = false); + + static void Append(WriteBatch* dst, const WriteBatch* src); +}; + +} // namespace rocksdb diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc new file mode 100644 index 00000000..d3454c34 --- /dev/null +++ b/db/write_batch_test.cc @@ -0,0 +1,263 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/db.h" + +#include +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "util/logging.h" +#include "util/testharness.h" + +namespace rocksdb { + +static std::string PrintContents(WriteBatch* b) { + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); + mem->Ref(); + std::string state; + Status s = WriteBatchInternal::InsertInto(b, mem, &options); + int count = 0; + Iterator* iter = mem->NewIterator(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + memset((void *)&ikey, 0, sizeof(ikey)); + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case kTypeMerge: + state.append("Merge("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + default: + assert(false); + break; + } + state.append("@"); + state.append(NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append(s.ToString()); + } else if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + delete mem->Unref(); + return state; +} + +class WriteBatchTest { }; + +TEST(WriteBatchTest, Empty) { + WriteBatch batch; + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(0, batch.Count()); +} + +TEST(WriteBatchTest, Multiple) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + 
batch.Delete(Slice("box")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); + ASSERT_EQ("Put(baz, boo)@102" + "Delete(box)@101" + "Put(foo, bar)@100", + PrintContents(&batch)); + ASSERT_EQ(3, batch.Count()); +} + +TEST(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "Corruption: bad WriteBatch Delete", + PrintContents(&batch)); +} + +TEST(WriteBatchTest, Append) { + WriteBatch b1, b2; + WriteBatchInternal::SetSequence(&b1, 200); + WriteBatchInternal::SetSequence(&b2, 300); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("", + PrintContents(&b1)); + ASSERT_EQ(0, b1.Count()); + b2.Put("a", "va"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200", + PrintContents(&b1)); + ASSERT_EQ(1, b1.Count()); + b2.Clear(); + b2.Put("b", "vb"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@201", + PrintContents(&b1)); + ASSERT_EQ(2, b1.Count()); + b2.Delete("foo"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203", + PrintContents(&b1)); + ASSERT_EQ(4, b1.Count()); +} + +namespace { + struct TestHandler : public WriteBatch::Handler { + std::string seen; + virtual void Put(const Slice& key, const Slice& value) { + seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + } + virtual void Merge(const Slice& key, const Slice& value) { + seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; + } + virtual void LogData(const Slice& blob) { + seen += "LogData(" + blob.ToString() + ")"; + } + 
virtual void Delete(const Slice& key) { + seen += "Delete(" + key.ToString() + ")"; + } + }; +} + +TEST(WriteBatchTest, Blob) { + WriteBatch batch; + batch.Put(Slice("k1"), Slice("v1")); + batch.Put(Slice("k2"), Slice("v2")); + batch.Put(Slice("k3"), Slice("v3")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k2")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_EQ(5, batch.Count()); + ASSERT_EQ("Merge(foo, bar)@4" + "Put(k1, v1)@0" + "Delete(k2)@3" + "Put(k2, v2)@1" + "Put(k3, v3)@2", + PrintContents(&batch)); + + TestHandler handler; + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "Put(k2, v2)" + "Put(k3, v3)" + "LogData(blob1)" + "Delete(k2)" + "LogData(blob2)" + "Merge(foo, bar)", + handler.seen); +} + +TEST(WriteBatchTest, Continue) { + WriteBatch batch; + + struct Handler : public TestHandler { + int num_seen = 0; + virtual void Put(const Slice& key, const Slice& value) { + ++num_seen; + TestHandler::Put(key, value); + } + virtual void Merge(const Slice& key, const Slice& value) { + ++num_seen; + TestHandler::Merge(key, value); + } + virtual void LogData(const Slice& blob) { + ++num_seen; + TestHandler::LogData(blob); + } + virtual void Delete(const Slice& key) { + ++num_seen; + TestHandler::Delete(key); + } + virtual bool Continue() override { + return num_seen < 3; + } + } handler; + + batch.Put(Slice("k1"), Slice("v1")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k1")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), Slice("bar")); + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "LogData(blob1)" + "Delete(k1)", + handler.seen); +} + +TEST(WriteBatchTest, PutGatherSlices) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + + { + // Try a write where the key is one slice but the value is two + Slice key_slice("baz"); + Slice value_slices[2] = { Slice("header"), Slice("payload") }; + batch.Put(SliceParts(&key_slice, 1), + 
SliceParts(value_slices, 2)); + } + + { + // One where the key is composite but the value is a single slice + Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; + Slice value_slice("value"); + batch.Put(SliceParts(key_slices, 3), + SliceParts(&value_slice, 1)); + } + + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ("Put(baz, headerpayload)@101" + "Put(foo, bar)@100" + "Put(keypart2part3, value)@102", + PrintContents(&batch)); + ASSERT_EQ(3, batch.Count()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/doc/doc.css b/doc/doc.css new file mode 100644 index 00000000..700c564e --- /dev/null +++ b/doc/doc.css @@ -0,0 +1,89 @@ +body { + margin-left: 0.5in; + margin-right: 0.5in; + background: white; + color: black; +} + +h1 { + margin-left: -0.2in; + font-size: 14pt; +} +h2 { + margin-left: -0in; + font-size: 12pt; +} +h3 { + margin-left: -0in; +} +h4 { + margin-left: -0in; +} +hr { + margin-left: -0in; +} + +/* Definition lists: definition term bold */ +dt { + font-weight: bold; +} + +address { + text-align: center; +} +code,samp,var { + color: blue; +} +kbd { + color: #600000; +} +div.note p { + float: right; + width: 3in; + margin-right: 0%; + padding: 1px; + border: 2px solid #6060a0; + background-color: #fffff0; +} + +ul { + margin-top: -0em; + margin-bottom: -0em; +} + +ol { + margin-top: -0em; + margin-bottom: -0em; +} + +UL.nobullets { + list-style-type: none; + list-style-image: none; + margin-left: -1em; +} + +p { + margin: 1em 0 1em 0; + padding: 0 0 0 0; +} + +pre { + line-height: 1.3em; + padding: 0.4em 0 0.8em 0; + margin: 0 0 0 0; + border: 0 0 0 0; + color: blue; +} + +.datatable { + margin-left: auto; + margin-right: auto; + margin-top: 2em; + margin-bottom: 2em; + border: 1px solid; +} + +.datatable td,th { + padding: 0 0.5em 0 0.5em; + text-align: right; +} diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 00000000..71f515e7 
--- /dev/null +++ b/doc/index.html @@ -0,0 +1,831 @@ + + + + +RocksDB + + + +

RocksDB

+
The Facebook Database Engineering Team
+
Build on earlier work on leveldb by Sanjay Ghemawat + (sanjay@google.com) and Jeff Dean (jeff@google.com)
+

+The rocksdb library provides a persistent key value store. Keys and +values are arbitrary byte arrays. The keys are ordered within the key +value store according to a user-specified comparator function. + +

+

Opening A Database

+

+A rocksdb database has a name which corresponds to a file system +directory. All of the contents of database are stored in this +directory. The following example shows how to open a database, +creating it if necessary: +

+

+  #include <cassert>
+  #include "rocksdb/db.h"
+
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+If you want to raise an error if the database already exists, add +the following line before the rocksdb::DB::Open call: +
+  options.error_if_exists = true;
+
+

Status

+

+You may have noticed the rocksdb::Status type above. Values of this +type are returned by most functions in rocksdb that may encounter an +error. You can check if such a result is ok, and also print an +associated error message: +

+

+   rocksdb::Status s = ...;
+   if (!s.ok()) cerr << s.ToString() << endl;
+
+

Closing A Database

+

+When you are done with a database, just delete the database object. +Example: +

+

+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+
+

Reads And Writes

+

+The database provides Put, Delete, and Get methods to +modify/query the database. For example, the following code +moves the value stored under key1 to key2. +

+  std::string value;
+  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+  if (s.ok()) s = db->Put(rocksdb::WriteOptions(), key2, value);
+  if (s.ok()) s = db->Delete(rocksdb::WriteOptions(), key1);
+
+ +

Atomic Updates

+

+Note that if the process dies after the Put of key2 but before the +delete of key1, the same value may be left stored under multiple keys. +Such problems can be avoided by using the WriteBatch class to +atomically apply a set of updates: +

+

+  #include "rocksdb/write_batch.h"
+  ...
+  std::string value;
+  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+  if (s.ok()) {
+    rocksdb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db->Write(rocksdb::WriteOptions(), &batch);
+  }
+
+The WriteBatch holds a sequence of edits to be made to the database, +and these edits within the batch are applied in order. Note that we +called Delete before Put so that if key1 is identical to key2, +we do not end up erroneously dropping the value entirely. +

+Apart from its atomicity benefits, WriteBatch may also be used to +speed up bulk updates by placing lots of individual mutations into the +same batch. + +

Synchronous Writes

+By default, each write to rocksdb is asynchronous: it +returns after pushing the write from the process into the operating +system. The transfer from operating system memory to the underlying +persistent storage happens asynchronously. The sync flag +can be turned on for a particular write to make the write operation +not return until the data being written has been pushed all the way to +persistent storage. (On Posix systems, this is implemented by calling +either fsync(...) or fdatasync(...) or +msync(..., MS_SYNC) before the write operation returns.)
+  rocksdb::WriteOptions write_options;
+  write_options.sync = true;
+  db->Put(write_options, ...);
+
+Asynchronous writes are often more than a thousand times as fast as +synchronous writes. The downside of asynchronous writes is that a +crash of the machine may cause the last few updates to be lost. Note +that a crash of just the writing process (i.e., not a reboot) will not +cause any loss since even when sync is false, an update +is pushed from the process memory into the operating system before it +is considered done. + +

+Asynchronous writes can often be used safely. For example, when +loading a large amount of data into the database you can handle lost +updates by restarting the bulk load after a crash. A hybrid scheme is +also possible where every Nth write is synchronous, and in the event +of a crash, the bulk load is restarted just after the last synchronous +write finished by the previous run. (The synchronous write can update +a marker that describes where to restart on a crash.) + +

+WriteBatch provides an alternative to asynchronous writes. +Multiple updates may be placed in the same WriteBatch and +applied together using a synchronous write (i.e., +write_options.sync is set to true). The extra cost of +the synchronous write will be amortized across all of the writes in +the batch. + +

+We also provide a way to completely disable Write Ahead Log for a +particular write. If you set write_option.disableWAL to true, the +write will not go to the log at all and may be lost in an event of +process crash. + +

+When opening a DB, you can disable syncing of data files by setting +Options::disableDataSync to true. This can be useful when doing +bulk-loading or big idempotent operations. Once the operation is +finished, you can manually call sync() to flush all dirty buffers +to stable storage. + +

+RocksDB by default uses faster fdatasync() to sync files. If you want +to use fsync(), you can set Options::use_fsync to true. You should set +this to true on filesystems like ext3 that can lose files after a +reboot. + +

+

Concurrency

+

+A database may only be opened by one process at a time. +The rocksdb implementation acquires a lock from the +operating system to prevent misuse. Within a single process, the +same rocksdb::DB object may be safely shared by multiple +concurrent threads. I.e., different threads may write into or fetch +iterators or call Get on the same database without any +external synchronization (the leveldb implementation will +automatically do the required synchronization). However other objects +(like Iterator and WriteBatch) may require external synchronization. +If two threads share such an object, they must protect access to it +using their own locking protocol. More details are available in +the public header files. + +

+

Merge operators

+

+Merge operators provide efficient support for read-modify-write operation. +More on the interface and implementation can be found on: +

+ + Merge Operator +

+ + Merge Operator Implementation + +

+

Iteration

+

+The following example demonstrates how to print all key,value pairs +in a database. +

+

+  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    cout << it->key().ToString() << ": "  << it->value().ToString() << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+  delete it;
+
+The following variation shows how to process just the keys in the +range [start,limit): +

+

+  for (it->Seek(start);
+       it->Valid() && it->key().ToString() < limit;
+       it->Next()) {
+    ...
+  }
+
+You can also process entries in reverse order. (Caveat: reverse +iteration may be somewhat slower than forward iteration.) +

+

+  for (it->SeekToLast(); it->Valid(); it->Prev()) {
+    ...
+  }
+
+

Snapshots

+

+Snapshots provide consistent read-only views over the entire state of +the key-value store. ReadOptions::snapshot may be non-NULL to indicate +that a read should operate on a particular version of the DB state. +If ReadOptions::snapshot is NULL, the read will operate on an +implicit snapshot of the current state. +

+Snapshots are created by the DB::GetSnapshot() method: +

+

+  rocksdb::ReadOptions options;
+  options.snapshot = db->GetSnapshot();
+  ... apply some updates to db ...
+  rocksdb::Iterator* iter = db->NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db->ReleaseSnapshot(options.snapshot);
+
+Note that when a snapshot is no longer needed, it should be released +using the DB::ReleaseSnapshot interface. This allows the +implementation to get rid of state that was being maintained just to +support reading as of that snapshot. +

Slice

+

+The return value of the it->key() and it->value() calls above +are instances of the rocksdb::Slice type. Slice is a simple +structure that contains a length and a pointer to an external byte +array. Returning a Slice is a cheaper alternative to returning a +std::string since we do not need to copy potentially large keys and +values. In addition, rocksdb methods do not return null-terminated +C-style strings since rocksdb keys and values are allowed to +contain '\0' bytes. +

+C++ strings and null-terminated C-style strings can be easily converted +to a Slice: +

+

+   rocksdb::Slice s1 = "hello";
+
+   std::string str("world");
+   rocksdb::Slice s2 = str;
+
+A Slice can be easily converted back to a C++ string: +
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+
+Be careful when using Slices since it is up to the caller to ensure that +the external byte array into which the Slice points remains live while +the Slice is in use. For example, the following is buggy: +

+

+   rocksdb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+
+When the if statement goes out of scope, str will be destroyed and the +backing storage for slice will disappear. +

+

Comparators

+

+The preceding examples used the default ordering function for key, +which orders bytes lexicographically. You can however supply a custom +comparator when opening a database. For example, suppose each +database key consists of two numbers and we should sort by the first +number, breaking ties by the second number. First, define a proper +subclass of rocksdb::Comparator that expresses these rules: +

+

+  class TwoPartComparator : public rocksdb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a < b: negative result
+    //   if a > b: positive result
+    //   else: zero result
+    int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &a1, &a2);
+      ParseKey(b, &b1, &b2);
+      if (a1 < b1) return -1;
+      if (a1 > b1) return +1;
+      if (a2 < b2) return -1;
+      if (a2 > b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const rocksdb::Slice&) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+
+Now create a database using this custom comparator: +

+

+  TwoPartComparator cmp;
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &cmp;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  ...
+
+

Backwards compatibility

+

+The result of the comparator's Name method is attached to the +database when it is created, and is checked on every subsequent +database open. If the name changes, the rocksdb::DB::Open call will +fail. Therefore, change the name if and only if the new key format +and comparison function are incompatible with existing databases, and +it is ok to discard the contents of all existing databases. +

+You can however still gradually evolve your key format over time with +a little bit of pre-planning. For example, you could store a version +number at the end of each key (one byte should suffice for most uses). +When you wish to switch to a new key format (e.g., adding an optional +third part to the keys processed by TwoPartComparator), +(a) keep the same comparator name (b) increment the version number +for new keys (c) change the comparator function so it uses the +version numbers found in the keys to decide how to interpret them. + + +

+

MemTable and Table factories

+

+By default, we keep the data in memory in skiplist memtable and the data +on disk in a table format described here: + + RocksDB Table Format. +

+Since one of the goals of RocksDB is to have +different parts of the system easily pluggable, we support different +implementations of both memtable and table format. You can supply +your own memtable factory by setting Options::memtable_factory +and your own table factory by setting Options::table_factory. +For available memtable factories, please refer to +rocksdb/memtablerep.h and for table factores to +rocksdb/table.h. These features are both in active development +and please be wary of any API changes that might break your application +going forward. +

+You can also read more about memtables here: + +Memtables wiki + + +

+

Performance

+

+Performance can be tuned by changing the default values of the +types defined in include/rocksdb/options.h. + +

+

Block size

+

+rocksdb groups adjacent keys together into the same block and such a +block is the unit of transfer to and from persistent storage. The +default block size is approximately 4096 uncompressed bytes. +Applications that mostly do bulk scans over the contents of the +database may wish to increase this size. Applications that do a lot +of point reads of small values may wish to switch to a smaller block +size if performance measurements indicate an improvement. There isn't +much benefit in using blocks smaller than one kilobyte, or larger than +a few megabytes. Also note that compression will be more effective +with larger block sizes. To change block size parameter, use +Options::block_size. +

+

Write buffer

+

+Options::write_buffer_size specifies the amount of data +to build up in memory before converting to a sorted on-disk file. +Larger values increase performance, especially during bulk loads. +Up to max_write_buffer_number write buffers may be held in memory +at the same time, +so you may wish to adjust this parameter to control memory usage. +Also, a larger write buffer will result in a longer recovery time +the next time the database is opened. +Related option is +Options::max_write_buffer_number, which is maximum number +of write buffers that are built up in memory. The default is 2, so that +when 1 write buffer is being flushed to storage, new writes can continue +to the other write buffer. +Options::min_write_buffer_number_to_merge is the minimum number +of write buffers that will be merged together before writing to storage. +If set to 1, then all write buffers are flushed to L0 as individual files and +this increases read amplification because a get request has to check in all +of these files. Also, an in-memory merge may result in writing less +data to storage if there are duplicate records in each of these +individual write buffers. Default: 1

+

Compression

+

+Each block is individually compressed before being written to +persistent storage. Compression is on by default since the default +compression method is very fast, and is automatically disabled for +uncompressible data. In rare cases, applications may want to disable +compression entirely, but should only do so if benchmarks show a +performance improvement: +

+

+  rocksdb::Options options;
+  options.compression = rocksdb::kNoCompression;
+  ... rocksdb::DB::Open(options, name, ...) ....
+
+

Cache

+

+The contents of the database are stored in a set of files in the +filesystem and each file stores a sequence of compressed blocks. If +options.block_cache is non-NULL, it is used to cache frequently +used uncompressed block contents. If options.block_cache_compressed +is non-NULL, it is used to cache frequently used compressed blocks. Compressed +cache is an alternative to OS cache, which also caches compressed blocks. If +compressed cache is used, the OS cache will be disabled automatically by setting +options.allow_os_buffer to false. +

+

+  #include "rocksdb/cache.h"
+
+  rocksdb::Options options;
+  options.block_cache = rocksdb::NewLRUCache(100 * 1048576);  // 100MB uncompressed cache
+  options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576);  // 100MB compressed cache
+  rocksdb::DB* db;
+  rocksdb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db;
+  delete options.block_cache;
+  delete options.block_cache_compressed;
+
+

+When performing a bulk read, the application may wish to disable +caching so that the data processed by the bulk read does not end up +displacing most of the cached contents. A per-iterator option can be +used to achieve this: +

+

+  rocksdb::ReadOptions options;
+  options.fill_cache = false;
+  rocksdb::Iterator* it = db->NewIterator(options);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ...
+  }
+
+

+You can also disable block cache by setting options.no_block_cache +to true. +

Key Layout

+

+Note that the unit of disk transfer and caching is a block. Adjacent +keys (according to the database sort order) will usually be placed in +the same block. Therefore the application can improve its performance +by placing keys that are accessed together near each other and placing +infrequently used keys in a separate region of the key space. +

+For example, suppose we are implementing a simple file system on top +of rocksdb. The types of entries we might wish to store are: +

+

+   filename -> permission-bits, length, list of file_block_ids
+   file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the +file_block_id keys with a different letter (say '0') so that scans +over just the metadata do not force us to fetch and cache bulky file +contents. +

+

Filters

+

+Because of the way rocksdb data is organized on disk, +a single Get() call may involve multiple reads from disk. +The optional FilterPolicy mechanism can be used to reduce +the number of disk reads substantially. +

+   rocksdb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   rocksdb::DB* db;
+   rocksdb::DB::Open(options, "/tmp/testdb", &db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+
+The preceding code associates a +Bloom filter +based filtering policy with the database. Bloom filter based +filtering relies on keeping some number of bits of data in memory per +key (in this case 10 bits per key since that is the argument we passed +to NewBloomFilter). This filter will reduce the number of unnecessary +disk reads needed for Get() calls by a factor of +approximately a 100. Increasing the bits per key will lead to a +larger reduction at the cost of more memory usage. We recommend that +applications whose working set does not fit in memory and that do a +lot of random reads set a filter policy. +

+If you are using a custom comparator, you should ensure that the filter +policy you are using is compatible with your comparator. For example, +consider a comparator that ignores trailing spaces when comparing keys. +NewBloomFilter must not be used with such a comparator. +Instead, the application should provide a custom filter policy that +also ignores trailing spaces. For example: +

+  class CustomFilterPolicy : public rocksdb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector<Slice> trimmed(n);
+      for (int i = 0; i < n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_->CreateFilter(&trimmed[0], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+
+

+Advanced applications may provide a filter policy that does not use +a bloom filter but uses some other mechanism for summarizing a set +of keys. See rocksdb/filter_policy.h for detail. +

+

Checksums

+

+rocksdb associates checksums with all data it stores in the file system. +There are two separate controls provided over how aggressively these +checksums are verified: +

+

    +
  • ReadOptions::verify_checksums may be set to true to force + checksum verification of all data that is read from the file system on + behalf of a particular read. By default, no such verification is + done. +

    +

  • Options::paranoid_checks may be set to true before opening a + database to make the database implementation raise an error as soon as + it detects an internal corruption. Depending on which portion of the + database has been corrupted, the error may be raised when the database + is opened, or later by another database operation. By default, + paranoid checking is off so that the database can be used even if + parts of its persistent storage have been corrupted. +

    + If a database is corrupted (perhaps it cannot be opened when + paranoid checking is turned on), the rocksdb::RepairDB function + may be used to recover as much of the data as possible. +

    +

+ +

+

Compaction

+

+You can read more on Compactions here: + + Multi-threaded compactions + +

+Here we give overview of the options that impact behavior of Compactions: +

    +

    +

  • Options::compaction_style - RocksDB currently supports two +compaction algorithms - Universal style and Level style. This option switches +between the two. Can be kCompactionStyleUniversal or kCompactionStyleLevel. +If this is kCompactionStyleUniversal, then you can configure universal style +parameters with Options::compaction_options_universal. +

    +

  • Options::disable_auto_compactions - Disable automatic compactions. +Manual compactions can still be issued on this database. +

    +

  • Options::compaction_filter - Allows an application to modify/delete +a key-value during background compaction. The client must provide +compaction_filter_factory if it requires a new compaction filter to be used +for different compaction processes. Client should specify only one of filter +or factory. +

    +

  • Options::compaction_filter_factory - a factory that provides +compaction filter objects which allow an application to modify/delete a +key-value during background compaction. +
+

+Other options impacting performance of compactions and when they get triggered +are: +

    +

    +

  • Options::access_hint_on_compaction_start - Specify the file access +pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL +

    +

  • Options::level0_file_num_compaction_trigger - Number of files to trigger level-0 compaction. +A negative value means that level-0 compaction will not be triggered by number of files at all. +

    +

  • Options::max_mem_compaction_level - Maximum level to which a new compacted memtable is pushed if it +does not create overlap. We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some +expensive manifest file operations. We do not push all the way to the largest level since that can generate a lot of wasted disk +space if the same key space is being repeatedly overwritten. +

    +

  • Options::target_file_size_base and Options::target_file_size_multiplier - +Target file size for compaction. target_file_size_base is per-file size for level-1. +Target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1)) +For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will +be 2MB, and each file on level 2 will be 20MB, and each file on level-3 will be 200MB. Default target_file_size_base is 2MB +and default target_file_size_multiplier is 1. +

    +

  • Options::expanded_compaction_factor - Maximum number of bytes in all compacted files. We avoid expanding +the lower level file set of a compaction if it would make the total compaction cover more than +(expanded_compaction_factor * targetFileSizeLevel()) many bytes. +

    +

  • Options::source_compaction_factor - Maximum number of bytes in all source files to be compacted in a +single compaction run. We avoid picking too many files in the source level so that we do not exceed the total source bytes +for compaction to exceed (source_compaction_factor * targetFileSizeLevel()) many bytes. +Default:1, i.e. pick maxfilesize amount of data as the source of a compaction. +

    +

  • Options::max_grandparent_overlap_factor - Control maximum bytes of overlaps in grandparent (i.e., level+2) before we +stop building a single file in a level->level+1 compaction. +

    +

  • Options::disable_seek_compaction - Disable compaction triggered by seek. +With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache +(which is true if max_open_files is large). +

    +

  • Options::max_background_compactions - Maximum number of concurrent background jobs, submitted to +the default LOW priority thread pool +
+ +

+You can learn more about all of those options in rocksdb/options.h + +

Universal style compaction specific settings

+

+If you're using Universal style compaction, there is an object CompactionOptionsUniversal +that hold all the different options for that compaction. The exact definition is in +rocksdb/universal_compaction.h and you can set it in Options::compaction_options_universal. +Here we give short overview of options in CompactionOptionsUniversal: +

    +

    +

  • CompactionOptionsUniversal::size_ratio - Percentage flexibility while comparing file size. If the candidate file(s) + size is 1% smaller than the next file's size, then include next file into + this candidate set. Default: 1 +

    +

  • CompactionOptionsUniversal::min_merge_width - The minimum number of files in a single compaction run. Default: 2 +

    +

  • CompactionOptionsUniversal::max_merge_width - The maximum number of files in a single compaction run. Default: UINT_MAX +

    +

  • CompactionOptionsUniversal::max_size_amplification_percent - The size amplification is defined as the amount (in percentage) of +additional storage needed to store a single byte of data in the database. For example, a size amplification of 2% means that a database that +contains 100 bytes of user-data may occupy upto 102 bytes of physical storage. By this definition, a fully compacted database has +a size amplification of 0%. Rocksdb uses the following heuristic to calculate size amplification: it assumes that all files excluding +the earliest file contribute to the size amplification. Default: 200, which means that a 100 byte database could require upto +300 bytes of storage. +

    +

  • CompactionOptionsUniversal::compression_size_percent - If this option is set to be -1 (the default value), all the output files +will follow compression type specified. If this option is not negative, we will try to make sure compressed +size is just above this value. In normal cases, at least this percentage +of data will be compressed. +When we are compacting to a new file, here is the criteria whether +it needs to be compressed: assuming here are the list of files sorted +by generation time: [ A1...An B1...Bm C1...Ct ], +where A1 is the newest and Ct is the oldest, and we are going to compact +B1...Bm, we calculate the total size of all the files as total_size, as +well as the total size of C1...Ct as total_C, the compaction output file +will be compressed iff total_C / total_size < this percentage +

    +

  • CompactionOptionsUniversal::stop_style - The algorithm used to stop picking files into a single compaction run. +Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file). +Default: kCompactionStopStyleTotalSize +
+ +

Thread pools

+

+A thread pool is associated with Env environment object. The client has to create a thread pool by setting the number of background +threads using method Env::SetBackgroundThreads() defined in rocksdb/env.h. +We use the thread pool for compactions and memtable flushes. +Since memtable flushes are in critical code path (stalling memtable flush can stall writes, increasing p99), we suggest +having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on HIGH thread pool. +There are two options available for configuration of background compactions and flushes: +

    +

    +

  • Options::max_background_compactions - Maximum number of concurrent background jobs, +submitted to the default LOW priority thread pool +

    +

  • Options::max_background_flushes - Maximum number of concurrent background memtable flush jobs, submitted to +the HIGH priority thread pool. By default, all background jobs (major compaction and memtable flush) go +to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool. +It is important when the same Env is shared by multiple db instances. Without a separate pool, long running major compaction jobs could +potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls. +
+

+

+  #include "rocksdb/env.h"
+  #include "rocksdb/db.h"
+
+  auto env = rocksdb::Env::Default();
+  env->SetBackgroundThreads(2, rocksdb::Env::LOW);
+  env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.env = env;
+  options.max_background_compactions = 2;
+  options.max_background_flushes = 1;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
+  assert(status.ok());
+  ...
+
+

Approximate Sizes

+

+The GetApproximateSizes method can be used to get the approximate +number of bytes of file system space used by one or more key ranges.

+

+   rocksdb::Range ranges[2];
+   ranges[0] = rocksdb::Range("a", "c");
+   ranges[1] = rocksdb::Range("x", "z");
+   uint64_t sizes[2];
+   rocksdb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+
+The preceding call will set sizes[0] to the approximate number of +bytes of file system space used by the key range [a..c) and +sizes[1] to the approximate number of bytes used by the key range +[x..z). +

+

Environment

+

+All file operations (and other operating system calls) issued by the +rocksdb implementation are routed through a rocksdb::Env object. +Sophisticated clients may wish to provide their own Env +implementation to get better control. For example, an application may +introduce artificial delays in the file IO paths to limit the impact +of rocksdb on other activities in the system. +

+

+  class SlowEnv : public rocksdb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  rocksdb::Options options;
+  options.env = &env;
+  Status s = rocksdb::DB::Open(options, ...);
+
+

Porting

+

+rocksdb may be ported to a new platform by providing platform +specific implementations of the types/methods/functions exported by +rocksdb/port/port.h. See rocksdb/port/port_example.h for more +details. +

+In addition, the new platform may need a new default rocksdb::Env +implementation. See rocksdb/util/env_posix.h for an example. + +

Statistics

+

+To be able to efficiently tune your application, it is always helpful if you +have access to usage statistics. You can collect those statistics by setting +Options::table_properties_collectors or +Options::statistics. For more information, refer to +rocksdb/table_properties.h and rocksdb/statistics.h. +These should not add significant overhead to your application and we +recommend exporting them to other monitoring tools. + +

Purging WAL files

+

+By default, old write-ahead logs are deleted automatically when they fall out +of scope and application doesn't need them anymore. There are options that +enable the user to archive the logs and then delete them lazily, either in +TTL fashion or based on size limit. + +The options are Options::WAL_ttl_seconds and +Options::WAL_size_limit_MB. Here is how they can be used: +

    +
  • +

    +If both are set to 0, logs will be deleted as soon as possible and will never get into the archive. +

  • +

    +If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, WAL +files will be checked every 10 min and if the total size is greater than +WAL_size_limit_MB, they will be deleted starting with the +earliest until size_limit is met. All empty files will be deleted. +

  • +

    +If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then +WAL files will be checked every WAL_ttl_seconds / 2 and those +that are older than WAL_ttl_seconds will be deleted. +

  • +

    +If both are not 0, WAL files will be checked every 10 min and both +checks will be performed with ttl being first. +

+ +

Other Information

+

+Details about the rocksdb implementation may be found in +the following documents: +

+ + + diff --git a/doc/log_format.txt b/doc/log_format.txt new file mode 100644 index 00000000..3a0414b6 --- /dev/null +++ b/doc/log_format.txt @@ -0,0 +1,75 @@ +The log file contents are a sequence of 32KB blocks. The only +exception is that the tail of the file may contain a partial block. + +Each block consists of a sequence of records: + block := record* trailer? + record := + checksum: uint32 // crc32c of type and data[] + length: uint16 + type: uint8 // One of FULL, FIRST, MIDDLE, LAST + data: uint8[length] + +A record never starts within the last six bytes of a block (since it +won't fit). Any leftover bytes here form the trailer, which must +consist entirely of zero bytes and must be skipped by readers. + +Aside: if exactly seven bytes are left in the current block, and a new +non-zero length record is added, the writer must emit a FIRST record +(which contains zero bytes of user data) to fill up the trailing seven +bytes of the block and then emit all of the user data in subsequent +blocks. + +More types may be added in the future. Some Readers may skip record +types they do not understand, others may report that some data was +skipped. + +FULL == 1 +FIRST == 2 +MIDDLE == 3 +LAST == 4 + +The FULL record contains the contents of an entire user record. + +FIRST, MIDDLE, LAST are types used for user records that have been +split into multiple fragments (typically because of block boundaries). +FIRST is the type of the first fragment of a user record, LAST is the +type of the last fragment of a user record, and MID is the type of all +interior fragments of a user record. + +Example: consider a sequence of user records: + A: length 1000 + B: length 97270 + C: length 8000 +A will be stored as a FULL record in the first block. + +B will be split into three fragments: first fragment occupies the rest +of the first block, second fragment occupies the entirety of the +second block, and the third fragment occupies a prefix of the third +block. 
This will leave six bytes free in the third block, which will +be left empty as the trailer. + +C will be stored as a FULL record in the fourth block. + +=================== + +Some benefits over the recordio format: + +(1) We do not need any heuristics for resyncing - just go to next +block boundary and scan. If there is a corruption, skip to the next +block. As a side-benefit, we do not get confused when part of the +contents of one log file are embedded as a record inside another log +file. + +(2) Splitting at approximate boundaries (e.g., for mapreduce) is +simple: find the next block boundary and skip records until we +hit a FULL or FIRST record. + +(3) We do not need extra buffering for large records. + +Some downsides compared to recordio format: + +(1) No packing of tiny records. This could be fixed by adding a new +record type, so it is a shortcoming of the current implementation, +not necessarily the format. + +(2) No compression. Again, this could be fixed by adding new record types. diff --git a/doc/rockslogo.jpg b/doc/rockslogo.jpg new file mode 100644 index 00000000..363905af Binary files /dev/null and b/doc/rockslogo.jpg differ diff --git a/doc/rockslogo.png b/doc/rockslogo.png new file mode 100644 index 00000000..19613607 Binary files /dev/null and b/doc/rockslogo.png differ diff --git a/hdfs/README b/hdfs/README new file mode 100644 index 00000000..9b7d0a64 --- /dev/null +++ b/hdfs/README @@ -0,0 +1,26 @@ +This directory contains the hdfs extensions needed to make rocksdb store +files in HDFS. + +The hdfs.h file is copied from the Apache Hadoop 1.0 source code. +It defines the libhdfs library +(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access +data in HDFS. The libhdfs.a is copied from the Apache Hadoop 1.0 build. +It implements the API defined in hdfs.h. If your hadoop cluster is running +a different hadoop release, then install these two files manually from your +hadoop distribution and then recompile rocksdb. 
+ +The env_hdfs.h file defines the rocksdb objects that are needed to talk to an +underlying filesystem. + +If you want to compile rocksdb with hdfs support, please set the following +enviroment variables appropriately: + USE_HDFS=1 + JAVA_HOME=/usr/local/jdk-6u22-64 + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs + make clean all db_bench + +To run dbbench, + set CLASSPATH to include your hadoop distribution + db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000" + + diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h new file mode 100644 index 00000000..303cd81c --- /dev/null +++ b/hdfs/env_hdfs.h @@ -0,0 +1,323 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +#ifdef USE_HDFS +#include "hdfs/hdfs.h" + +namespace rocksdb { + +static const std::string kProto = "hdfs://"; +static const std::string pathsep = "/"; + +// Thrown during execution when there is an issue with the supplied +// arguments. +class HdfsUsageException : public std::exception { }; + +// A simple exception that indicates something went wrong that is not +// recoverable. The intention is for the message to be printed (with +// nothing else) and the process terminate. +class HdfsFatalException : public std::exception { +public: + explicit HdfsFatalException(const std::string& s) : what_(s) { } + virtual ~HdfsFatalException() throw() { } + virtual const char* what() const throw() { + return what_.c_str(); + } +private: + const std::string what_; +}; + +// +// The HDFS environment for rocksdb. 
This class overrides all the +// file/dir access methods and delegates the thread-mgmt methods to the +// default posix environment. +// +class HdfsEnv : public Env { + + public: + explicit HdfsEnv(const std::string& fsname) : fsname_(fsname) { + posixEnv = Env::Default(); + fileSys_ = connectToPath(fsname_); + } + + virtual ~HdfsEnv() { + fprintf(stderr, "Destroying HdfsEnv::Default()\n"); + hdfsDisconnect(fileSys_); + } + + virtual Status NewSequentialFile(const std::string& fname, + SequentialFile** result); + + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result); + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result); + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options); + + virtual Status NewDirectory(const std::string& name, + unique_ptr* result); + + virtual bool FileExists(const std::string& fname); + + virtual Status GetChildren(const std::string& path, + std::vector* result); + + virtual Status DeleteFile(const std::string& fname); + + virtual Status CreateDir(const std::string& name); + + virtual Status CreateDirIfMissing(const std::string& name); + + virtual Status DeleteDir(const std::string& name); + + virtual Status GetFileSize(const std::string& fname, uint64_t* size); + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime); + + virtual Status RenameFile(const std::string& src, const std::string& target); + + virtual Status LockFile(const std::string& fname, FileLock** lock); + + virtual Status UnlockFile(FileLock* lock); + + virtual Status NewLogger(const std::string& fname, Logger** result); + + virtual void Schedule(void (*function)(void* arg), void* arg, + Priority pri = LOW) { + posixEnv->Schedule(function, arg, pri); + } + + virtual void StartThread(void (*function)(void* arg), void* arg) { + posixEnv->StartThread(function, arg); + } + + virtual void WaitForJoin() { 
posixEnv->WaitForJoin(); } + + virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const + override { + return posixEnv->GetThreadPoolQueueLen(pri); + } + + virtual Status GetTestDirectory(std::string* path) { + return posixEnv->GetTestDirectory(path); + } + + virtual uint64_t NowMicros() { + return posixEnv->NowMicros(); + } + + virtual void SleepForMicroseconds(int micros) { + posixEnv->SleepForMicroseconds(micros); + } + + virtual Status GetHostName(char* name, uint64_t len) { + return posixEnv->GetHostName(name, len); + } + + virtual Status GetCurrentTime(int64_t* unix_time) { + return posixEnv->GetCurrentTime(unix_time); + } + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + return posixEnv->GetAbsolutePath(db_path, output_path); + } + + virtual void SetBackgroundThreads(int number, Priority pri = LOW) { + posixEnv->SetBackgroundThreads(number, pri); + } + + virtual std::string TimeToString(uint64_t number) { + return posixEnv->TimeToString(number); + } + + static uint64_t gettid() { + assert(sizeof(pthread_t) <= sizeof(uint64_t)); + return (uint64_t)pthread_self(); + } + + private: + std::string fsname_; // string of the form "hdfs://hostname:port/" + hdfsFS fileSys_; // a single FileSystem object for all files + Env* posixEnv; // This object is derived from Env, but not from + // posixEnv. We have posixnv as an encapsulated + // object here so that we can use posix timers, + // posix threads, etc. + + /** + * If the URI is specified of the form hdfs://server:port/path, + * then connect to the specified cluster + * else connect to default. + */ + hdfsFS connectToPath(const std::string& uri) { + if (uri.empty()) { + return nullptr; + } + if (uri.find(kProto) != 0) { + // uri doesn't start with hdfs:// -> use default:0, which is special + // to libhdfs. 
+ return hdfsConnectNewInstance("default", 0); + } + const std::string hostport = uri.substr(kProto.length()); + + std::vector parts; + split(hostport, ':', parts); + if (parts.size() != 2) { + throw HdfsFatalException("Bad uri for hdfs " + uri); + } + // parts[0] = hosts, parts[1] = port/xxx/yyy + std::string host(parts[0]); + std::string remaining(parts[1]); + + int rem = remaining.find(pathsep); + std::string portStr = (rem == 0 ? remaining : + remaining.substr(0, rem)); + + tPort port; + port = atoi(portStr.c_str()); + if (port == 0) { + throw HdfsFatalException("Bad host-port for hdfs " + uri); + } + hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port); + return fs; + } + + void split(const std::string &s, char delim, + std::vector &elems) { + elems.clear(); + size_t prev = 0; + size_t pos = s.find(delim); + while (pos != std::string::npos) { + elems.push_back(s.substr(prev, pos)); + prev = pos + 1; + pos = s.find(delim, prev); + } + elems.push_back(s.substr(prev, s.size())); + } +}; + +} // namespace rocksdb + +#else // USE_HDFS + + +namespace rocksdb { + +static const Status notsup; + +class HdfsEnv : public Env { + + public: + explicit HdfsEnv(const std::string& fsname) { + fprintf(stderr, "You have not build rocksdb with HDFS support\n"); + fprintf(stderr, "Please see hdfs/README for details\n"); + throw std::exception(); + } + + virtual ~HdfsEnv() { + } + + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options); + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return notsup; + } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return notsup; + } + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return notsup; + } + + virtual Status NewDirectory(const std::string& name, + unique_ptr* result) { + 
return notsup; + } + + virtual bool FileExists(const std::string& fname){return false;} + + virtual Status GetChildren(const std::string& path, + std::vector* result){return notsup;} + + virtual Status DeleteFile(const std::string& fname){return notsup;} + + virtual Status CreateDir(const std::string& name){return notsup;} + + virtual Status CreateDirIfMissing(const std::string& name){return notsup;} + + virtual Status DeleteDir(const std::string& name){return notsup;} + + virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;} + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* time) { + return notsup; + } + + virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;} + + virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;} + + virtual Status UnlockFile(FileLock* lock){return notsup;} + + virtual Status NewLogger(const std::string& fname, + shared_ptr* result){return notsup;} + + virtual void Schedule(void (*function)(void* arg), void* arg, + Priority pri = LOW) {} + + virtual void StartThread(void (*function)(void* arg), void* arg) {} + + virtual void WaitForJoin() {} + + virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const { + return 0; + } + + virtual Status GetTestDirectory(std::string* path) {return notsup;} + + virtual uint64_t NowMicros() {return 0;} + + virtual void SleepForMicroseconds(int micros) {} + + virtual Status GetHostName(char* name, uint64_t len) {return notsup;} + + virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;} + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* outputpath) {return notsup;} + + virtual void SetBackgroundThreads(int number, Priority pri = LOW) {} + + virtual std::string TimeToString(uint64_t number) { return "";} +}; +} + +#endif // USE_HDFS diff --git a/hdfs/hdfs.h b/hdfs/hdfs.h new file mode 100644 index 00000000..8e8dfecb --- /dev/null +++ 
b/hdfs/hdfs.h @@ -0,0 +1,477 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifndef LIBHDFS_HDFS_H +#define LIBHDFS_HDFS_H + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef O_RDONLY +#define O_RDONLY 1 +#endif + +#ifndef O_WRONLY +#define O_WRONLY 2 +#endif + +#ifndef EINTERNAL +#define EINTERNAL 255 +#endif + + +/** All APIs set errno to meaningful values */ +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Some utility decls used in libhdfs. + */ + + typedef int32_t tSize; /// size of data for read/write io ops + typedef time_t tTime; /// time type in seconds + typedef int64_t tOffset;/// offset within the file + typedef uint16_t tPort; /// port + typedef enum tObjectKind { + kObjectKindFile = 'F', + kObjectKindDirectory = 'D', + } tObjectKind; + + + /** + * The C reflection of org.apache.org.hadoop.FileSystem . + */ + typedef void* hdfsFS; + + + /** + * The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream . + */ + enum hdfsStreamType + { + UNINITIALIZED = 0, + INPUT = 1, + OUTPUT = 2, + }; + + + /** + * The 'file-handle' to a file in hdfs. + */ + struct hdfsFile_internal { + void* file; + enum hdfsStreamType type; + }; + typedef struct hdfsFile_internal* hdfsFile; + + + /** + * hdfsConnectAsUser - Connect to a hdfs file system as a specific user + * Connect to the hdfs. + * @param host A string containing either a host name, or an ip address + * of the namenode of a hdfs cluster. 'host' should be passed as NULL if + * you want to connect to local filesystem. 'host' should be passed as + * 'default' (and port as 0) to used the 'configured' filesystem + * (core-site/core-default.xml). 
+ * @param port The port on which the server is listening. + * @param user the user name (this is hadoop domain user). Or NULL is equivelant to hhdfsConnect(host, port) + * @param groups the groups (these are hadoop domain groups) + * @return Returns a handle to the filesystem or NULL on error. + */ + hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size ); + + + /** + * hdfsConnect - Connect to a hdfs file system. + * Connect to the hdfs. + * @param host A string containing either a host name, or an ip address + * of the namenode of a hdfs cluster. 'host' should be passed as NULL if + * you want to connect to local filesystem. 'host' should be passed as + * 'default' (and port as 0) to used the 'configured' filesystem + * (core-site/core-default.xml). + * @param port The port on which the server is listening. + * @return Returns a handle to the filesystem or NULL on error. + */ + hdfsFS hdfsConnect(const char* host, tPort port); + + + /** + * This are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle. + * Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance. + */ + hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size ); + hdfsFS hdfsConnectNewInstance(const char* host, tPort port); + hdfsFS hdfsConnectPath(const char* uri); + + /** + * hdfsDisconnect - Disconnect from the hdfs file system. + * Disconnect from hdfs. + * @param fs The configured filesystem handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsDisconnect(hdfsFS fs); + + + /** + * hdfsOpenFile - Open a hdfs file in given mode. + * @param fs The configured filesystem handle. + * @param path The full path to the file. + * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNCAT), + * O_WRONLY|O_APPEND. 
Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP. + * @param bufferSize Size of buffer for read/write - pass 0 if you want + * to use the default configured values. + * @param replication Block replication - pass 0 if you want to use + * the default configured values. + * @param blocksize Size of block - pass 0 if you want to use the + * default configured values. + * @return Returns the handle to the open file or NULL on error. + */ + hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags, + int bufferSize, short replication, tSize blocksize); + + + /** + * hdfsCloseFile - Close an open file. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsCloseFile(hdfsFS fs, hdfsFile file); + + + /** + * hdfsExists - Checks if a given path exsits on the filesystem + * @param fs The configured filesystem handle. + * @param path The path to look for + * @return Returns 0 on exists, 1 on non-exists, -1/-2 on error. + */ + int hdfsExists(hdfsFS fs, const char *path); + + + /** + * hdfsSeek - Seek to given offset in file. + * This works only for files opened in read-only mode. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param desiredPos Offset into the file to seek into. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos); + + + /** + * hdfsTell - Get the current offset in the file, in bytes. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Current offset, -1 on error. + */ + tOffset hdfsTell(hdfsFS fs, hdfsFile file); + + + /** + * hdfsRead - Read data from an open file. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param buffer The buffer to copy read bytes into. + * @param length The length of the buffer. 
+ * @return Returns the number of bytes actually read, possibly less + * than than length;-1 on error. + */ + tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length); + + + /** + * hdfsPread - Positional read of data from an open file. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param position Position from which to read + * @param buffer The buffer to copy read bytes into. + * @param length The length of the buffer. + * @return Returns the number of bytes actually read, possibly less than + * than length;-1 on error. + */ + tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position, + void* buffer, tSize length); + + + /** + * hdfsWrite - Write data into an open file. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param buffer The data. + * @param length The no. of bytes to write. + * @return Returns the number of bytes written, -1 on error. + */ + tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer, + tSize length); + + + /** + * hdfsWrite - Flush the data. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsFlush(hdfsFS fs, hdfsFile file); + + /** + * hdfsSync - Sync the data to persistent store. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSync(hdfsFS fs, hdfsFile file); + + /** + * hdfsGetNumReplicasInPipeline - get number of remaining replicas in + * pipeline + * @param fs The configured filesystem handle + * @param file the file handle + * @return returns the # of datanodes in the write pipeline; -1 on error + */ + int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file); + + /** + * hdfsAvailable - Number of bytes that can be read from this + * input stream without blocking. + * @param fs The configured filesystem handle. + * @param file The file handle. 
+ * @return Returns available bytes; -1 on error. + */ + int hdfsAvailable(hdfsFS fs, hdfsFile file); + + + /** + * hdfsCopy - Copy file from one filesystem to another. + * @param srcFS The handle to source filesystem. + * @param src The path of source file. + * @param dstFS The handle to destination filesystem. + * @param dst The path of destination file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + + + /** + * hdfsMove - Move file from one filesystem to another. + * @param srcFS The handle to source filesystem. + * @param src The path of source file. + * @param dstFS The handle to destination filesystem. + * @param dst The path of destination file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + + + /** + * hdfsDelete - Delete file. + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsDelete(hdfsFS fs, const char* path); + + + /** + * hdfsRename - Rename file. + * @param fs The configured filesystem handle. + * @param oldPath The path of the source file. + * @param newPath The path of the destination file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath); + + + /** + * hdfsGetWorkingDirectory - Get the current working directory for + * the given filesystem. + * @param fs The configured filesystem handle. + * @param buffer The user-buffer to copy path of cwd into. + * @param bufferSize The length of user-buffer. + * @return Returns buffer, NULL on error. + */ + char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize); + + + /** + * hdfsSetWorkingDirectory - Set the working directory. All relative + * paths will be resolved relative to it. + * @param fs The configured filesystem handle. 
+ * @param path The path of the new 'cwd'. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSetWorkingDirectory(hdfsFS fs, const char* path); + + + /** + * hdfsCreateDirectory - Make the given file and all non-existent + * parents into directories. + * @param fs The configured filesystem handle. + * @param path The path of the directory. + * @return Returns 0 on success, -1 on error. + */ + int hdfsCreateDirectory(hdfsFS fs, const char* path); + + + /** + * hdfsSetReplication - Set the replication of the specified + * file to the supplied value + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns 0 on success, -1 on error. + */ + int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication); + + + /** + * hdfsFileInfo - Information about a file/directory. + */ + typedef struct { + tObjectKind mKind; /* file or directory */ + char *mName; /* the name of the file */ + tTime mLastMod; /* the last modification time for the file in seconds */ + tOffset mSize; /* the size of the file in bytes */ + short mReplication; /* the count of replicas */ + tOffset mBlockSize; /* the block size for the file */ + char *mOwner; /* the owner of the file */ + char *mGroup; /* the group associated with the file */ + short mPermissions; /* the permissions associated with the file */ + tTime mLastAccess; /* the last access time for the file in seconds */ + } hdfsFileInfo; + + + /** + * hdfsListDirectory - Get list of files/directories for a given + * directory-path. hdfsFreeFileInfo should be called to deallocate memory if + * the function returns non-NULL value. + * @param fs The configured filesystem handle. + * @param path The path of the directory. + * @param numEntries Set to the number of files/directories in path. + * @return Returns a dynamically-allocated array of hdfsFileInfo + * objects; NULL if empty or on error. + * on error, numEntries will be -1. 
+ */ + hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path, + int *numEntries); + + + /** + * hdfsGetPathInfo - Get information about a path as a (dynamically + * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be + * called when the pointer is no longer needed. + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns a dynamically-allocated hdfsFileInfo object; + * NULL on error. + */ + hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path); + + + /** + * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields) + * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo + * objects. + * @param numEntries The size of the array. + */ + void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries); + + + /** + * hdfsGetHosts - Get hostnames where a particular block (determined by + * pos & blocksize) of a file is stored. The last element in the array + * is NULL. Due to replication, a single block could be present on + * multiple hosts. + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @param start The start of the block. + * @param length The length of the block. + * @return Returns a dynamically-allocated 2-d array of blocks-hosts; + * NULL on error. + */ + char*** hdfsGetHosts(hdfsFS fs, const char* path, + tOffset start, tOffset length); + + + /** + * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts + * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo + * objects. + * @param numEntries The size of the array. + */ + void hdfsFreeHosts(char ***blockHosts); + + + /** + * hdfsGetDefaultBlockSize - Get the optimum blocksize. + * @param fs The configured filesystem handle. + * @return Returns the blocksize; -1 on error. + */ + tOffset hdfsGetDefaultBlockSize(hdfsFS fs); + + + /** + * hdfsGetCapacity - Return the raw capacity of the filesystem. + * @param fs The configured filesystem handle. 
+ * @return Returns the raw-capacity; -1 on error. + */ + tOffset hdfsGetCapacity(hdfsFS fs); + + + /** + * hdfsGetUsed - Return the total raw size of all files in the filesystem. + * @param fs The configured filesystem handle. + * @return Returns the total-size; -1 on error. + */ + tOffset hdfsGetUsed(hdfsFS fs); + + /** + * hdfsChown + * @param fs The configured filesystem handle. + * @param path the path to the file or directory + * @param owner this is a string in Hadoop land. Set to null or "" if only setting group + * @param group this is a string in Hadoop land. Set to null or "" if only setting user + * @return 0 on success else -1 + */ + int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group); + + /** + * hdfsChmod + * @param fs The configured filesystem handle. + * @param path the path to the file or directory + * @param mode the bitmask to set it to + * @return 0 on success else -1 + */ + int hdfsChmod(hdfsFS fs, const char* path, short mode); + + /** + * hdfsUtime + * @param fs The configured filesystem handle. + * @param path the path to the file or directory + * @param mtime new modification time or 0 for only set access time in seconds + * @param atime new access time or 0 for only set modification time in seconds + * @return 0 on success else -1 + */ + int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime); + +#ifdef __cplusplus +} +#endif + +#endif /*LIBHDFS_HDFS_H*/ + +/** + * vim: ts=4: sw=4: et + */ diff --git a/hdfs/libhdfs.a b/hdfs/libhdfs.a new file mode 100644 index 00000000..4d1f19f0 Binary files /dev/null and b/hdfs/libhdfs.a differ diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc new file mode 100644 index 00000000..34d97fa7 --- /dev/null +++ b/helpers/memenv/memenv.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "helpers/memenv/memenv.h" + +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include +#include +#include +#include + +namespace rocksdb { + +namespace { + +class FileState { + public: + // FileStates are reference counted. The initial reference count is zero + // and the caller must call Ref() at least once. + FileState() : refs_(0), size_(0) {} + + // Increase the reference count. + void Ref() { + MutexLock lock(&refs_mutex_); + ++refs_; + } + + // Decrease the reference count. Delete if this is the last reference. + void Unref() { + bool do_delete = false; + + { + MutexLock lock(&refs_mutex_); + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + do_delete = true; + } + } + + if (do_delete) { + delete this; + } + } + + uint64_t Size() const { return size_; } + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + if (offset > size_) { + return Status::IOError("Offset greater than file size."); + } + const uint64_t available = size_ - offset; + if (n > available) { + n = available; + } + if (n == 0) { + *result = Slice(); + return Status::OK(); + } + + size_t block = offset / kBlockSize; + size_t block_offset = offset % kBlockSize; + + if (n <= kBlockSize - block_offset) { + // The requested bytes are all in the first block. 
+ *result = Slice(blocks_[block] + block_offset, n); + return Status::OK(); + } + + size_t bytes_to_copy = n; + char* dst = scratch; + + while (bytes_to_copy > 0) { + size_t avail = kBlockSize - block_offset; + if (avail > bytes_to_copy) { + avail = bytes_to_copy; + } + memcpy(dst, blocks_[block] + block_offset, avail); + + bytes_to_copy -= avail; + dst += avail; + block++; + block_offset = 0; + } + + *result = Slice(scratch, n); + return Status::OK(); + } + + Status Append(const Slice& data) { + const char* src = data.data(); + size_t src_len = data.size(); + + while (src_len > 0) { + size_t avail; + size_t offset = size_ % kBlockSize; + + if (offset != 0) { + // There is some room in the last block. + avail = kBlockSize - offset; + } else { + // No room in the last block; push new one. + blocks_.push_back(new char[kBlockSize]); + avail = kBlockSize; + } + + if (avail > src_len) { + avail = src_len; + } + memcpy(blocks_.back() + offset, src, avail); + src_len -= avail; + src += avail; + size_ += avail; + } + + return Status::OK(); + } + + private: + // Private since only Unref() should be used to delete it. + ~FileState() { + for (std::vector::iterator i = blocks_.begin(); i != blocks_.end(); + ++i) { + delete [] *i; + } + } + + // No copying allowed. + FileState(const FileState&); + void operator=(const FileState&); + + port::Mutex refs_mutex_; + int refs_; // Protected by refs_mutex_; + + // The following fields are not protected by any mutex. They are only mutable + // while the file is being written, and concurrent access is not allowed + // to writable files. 
+  std::vector<char*> blocks_;
+  uint64_t size_;
+
+  enum { kBlockSize = 8 * 1024 };
+};
+
+class SequentialFileImpl : public SequentialFile {
+ public:
+  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
+    file_->Ref();
+  }
+
+  ~SequentialFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s = file_->Read(pos_, n, result, scratch);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) {
+    if (pos_ > file_->Size()) {
+      return Status::IOError("pos_ > file_->Size()");
+    }
+    const size_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += n;
+    return Status::OK();
+  }
+
+ private:
+  FileState* file_;
+  size_t pos_;
+};
+
+class RandomAccessFileImpl : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~RandomAccessFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    return file_->Read(offset, n, result, scratch);
+  }
+
+ private:
+  FileState* file_;
+};
+
+class WritableFileImpl : public WritableFile {
+ public:
+  WritableFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~WritableFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Append(const Slice& data) {
+    return file_->Append(data);
+  }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+ private:
+  FileState* file_;
+};
+
+class InMemoryDirectory : public Directory {
+ public:
+  virtual Status Fsync() { return Status::OK(); }
+};
+
+class InMemoryEnv : public EnvWrapper {
+ public:
+  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
+
+  virtual ~InMemoryEnv() {
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      i->second->Unref();
+    }
+  }
+
+  // Partial
implementation of the Env interface.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new SequentialFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new RandomAccessFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) != file_map_.end()) {
+      DeleteFileInternal(fname);
+    }
+
+    FileState* file = new FileState();
+    file->Ref();
+    file_map_[fname] = file;
+
+    result->reset(new WritableFileImpl(file));
+    return Status::OK();
+  }
+
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) {
+    result->reset(new InMemoryDirectory());
+    return Status::OK();
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    return file_map_.find(fname) != file_map_.end();
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    MutexLock lock(&mutex_);
+    result->clear();
+
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      const std::string& filename = i->first;
+
+      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
+          Slice(filename).starts_with(Slice(dir))) {
+        result->push_back(filename.substr(dir.size() + 1));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  void DeleteFileInternal(const std::string& fname) {
+    if (file_map_.find(fname) == file_map_.end()) {
+      return;
+    }
+
+    file_map_[fname]->Unref();
+    file_map_.erase(fname);
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    DeleteFileInternal(fname);
+    return Status::OK();
+  }
+
+  virtual Status CreateDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status CreateDirIfMissing(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status DeleteDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    *file_size = file_map_[fname]->Size();
+    return Status::OK();
+  }
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
+  }
+
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(src) == file_map_.end()) {
+      return Status::IOError(src, "File not found");
+    }
+
+    DeleteFileInternal(target);
+    file_map_[target] = file_map_[src];
+    file_map_.erase(src);
+    return Status::OK();
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = new FileLock;
+    return Status::OK();
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    delete lock;
+    return Status::OK();
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    *path = "/test";
+    return Status::OK();
+  }
+
+ private:
+  // Map from filenames to FileState objects, representing a simple file system.
+  typedef std::map<std::string, FileState*> FileSystem;
+  port::Mutex mutex_;
+  FileSystem file_map_;  // Protected by mutex_.
+};
+
+}  // namespace
+
+Env* NewMemEnv(Env* base_env) {
+  return new InMemoryEnv(base_env);
+}
+
+}  // namespace rocksdb
diff --git a/helpers/memenv/memenv.h b/helpers/memenv/memenv.h
new file mode 100644
index 00000000..21264411
--- /dev/null
+++ b/helpers/memenv/memenv.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
+#define STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
+namespace rocksdb {
+
+class Env;
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc
new file mode 100644
index 00000000..19fc8ff2
--- /dev/null
+++ b/helpers/memenv/memenv_test.cc
@@ -0,0 +1,233 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "helpers/memenv/memenv.h"
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+class MemEnvTest {
+ public:
+  Env* env_;
+  const EnvOptions soptions_;
+
+  MemEnvTest()
+      : env_(NewMemEnv(Env::Default())) {
+  }
+  ~MemEnvTest() {
+    delete env_;
+  }
+};
+
+TEST(MemEnvTest, Basics) {
+  uint64_t file_size;
+  unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  // Check that the directory is empty.
+  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+
+  // Write to the file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/f"));
+  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that opening non-existent file fails.
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
+                                       soptions_).ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
+                                         soptions_).ok());
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile("/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+TEST(MemEnvTest, ReadWrite) {
+  unique_ptr<WritableFile> writable_file;
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100));  // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch));  // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST(MemEnvTest, Locks) {
+  FileLock* lock;
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(env_->LockFile("some file", &lock));
+  ASSERT_OK(env_->UnlockFile(lock));
+}
+
+TEST(MemEnvTest, Misc) {
+  std::string test_dir;
+  ASSERT_OK(env_->GetTestDirectory(&test_dir));
+  ASSERT_TRUE(!test_dir.empty());
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+TEST(MemEnvTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch));  // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete [] scratch;
+}
+
+TEST(MemEnvTest, DBTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
new file mode 100644
index 00000000..7d4a374d
--- /dev/null
+++ b/include/rocksdb/c.h
@@ -0,0 +1,573 @@
+/* Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+   This source code is licensed under the BSD-style license found in the
+   LICENSE file in the root directory of this source tree. An additional grant
+   of patent rights can be found in the PATENTS file in the same directory.
+ Copyright (c) 2011 The LevelDB Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. + + C bindings for leveldb. May be useful as a stable ABI that can be + used by programs that keep leveldb in a shared library, or for + a JNI api. + + Does not support: + . getters for the option types + . custom comparators that implement key shortening + . capturing post-write-snapshot + . custom iter, db, env, cache implementations using just the C bindings + + Some conventions: + + (1) We expose just opaque struct pointers and functions to clients. + This allows us to change internal representations without having to + recompile clients. + + (2) For simplicity, there is no equivalent to the Slice type. Instead, + the caller has to pass the pointer and length as separate + arguments. + + (3) Errors are represented by a null-terminated c string. NULL + means no error. All operations that can raise an error are passed + a "char** errptr" as the last argument. One of the following must + be true on entry: + *errptr == NULL + *errptr points to a malloc()ed null-terminated error message + On success, a leveldb routine leaves *errptr unchanged. + On failure, leveldb frees the old value of *errptr and + set *errptr to a malloc()ed error message. + + (4) Bools have the type unsigned char (0 == false; rest == true) + + (5) All of the pointer arguments must be non-NULL. 
+*/
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
+#define STORAGE_ROCKSDB_INCLUDE_C_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_cache_t rocksdb_cache_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t rocksdb_options_t;
+typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
+typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
+
+/* DB operations */
+
+extern rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void rocksdb_close(rocksdb_t* db);
+
+extern void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr);
+
+extern void rocksdb_merge(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const
char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr); + +extern void rocksdb_write( + rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, + char** errptr); + +/* Returns NULL if not found. A malloc()ed array otherwise. + Stores the length of the array in *vallen. */ +extern char* rocksdb_get( + rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t keylen, + size_t* vallen, + char** errptr); + +extern rocksdb_iterator_t* rocksdb_create_iterator( + rocksdb_t* db, + const rocksdb_readoptions_t* options); + +extern const rocksdb_snapshot_t* rocksdb_create_snapshot( + rocksdb_t* db); + +extern void rocksdb_release_snapshot( + rocksdb_t* db, + const rocksdb_snapshot_t* snapshot); + +/* Returns NULL if property name is unknown. + Else returns a pointer to a malloc()-ed null-terminated value. */ +extern char* rocksdb_property_value( + rocksdb_t* db, + const char* propname); + +extern void rocksdb_approximate_sizes( + rocksdb_t* db, + int num_ranges, + const char* const* range_start_key, const size_t* range_start_key_len, + const char* const* range_limit_key, const size_t* range_limit_key_len, + uint64_t* sizes); + +extern void rocksdb_compact_range( + rocksdb_t* db, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +extern void rocksdb_delete_file( + rocksdb_t* db, + const char* name); + +extern const rocksdb_livefiles_t* rocksdb_livefiles( + rocksdb_t* db); + +extern void rocksdb_flush( + rocksdb_t* db, + const rocksdb_flushoptions_t* options, + char** errptr); + +extern void rocksdb_disable_file_deletions( + rocksdb_t* db, + char** errptr); + +extern void rocksdb_enable_file_deletions( + rocksdb_t* db, + unsigned char force, + char** errptr); + +/* Management operations */ + +extern void rocksdb_destroy_db( + const rocksdb_options_t* options, + const char* name, + char** errptr); + +extern void rocksdb_repair_db( + const rocksdb_options_t* 
options, + const char* name, + char** errptr); + +/* Iterator */ + +extern void rocksdb_iter_destroy(rocksdb_iterator_t*); +extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*); +extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*); +extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen); +extern void rocksdb_iter_next(rocksdb_iterator_t*); +extern void rocksdb_iter_prev(rocksdb_iterator_t*); +extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen); +extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen); +extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr); + +/* Write batch */ + +extern rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*); +extern int rocksdb_writebatch_count(rocksdb_writebatch_t*); +extern void rocksdb_writebatch_put( + rocksdb_writebatch_t*, + const char* key, size_t klen, + const char* val, size_t vlen); +extern void rocksdb_writebatch_merge( + rocksdb_writebatch_t*, + const char* key, size_t klen, + const char* val, size_t vlen); +extern void rocksdb_writebatch_delete( + rocksdb_writebatch_t*, + const char* key, size_t klen); +extern void rocksdb_writebatch_iterate( + rocksdb_writebatch_t*, + void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen)); +extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size); + +/* Options */ + +extern rocksdb_options_t* rocksdb_options_create(); +extern void rocksdb_options_destroy(rocksdb_options_t*); +extern void rocksdb_options_set_comparator( + rocksdb_options_t*, + rocksdb_comparator_t*); +extern void rocksdb_options_set_merge_operator(rocksdb_options_t*, + rocksdb_mergeoperator_t*); +extern void 
rocksdb_options_set_compression_per_level( + rocksdb_options_t* opt, + int* level_values, + size_t num_levels); +extern void rocksdb_options_set_filter_policy( + rocksdb_options_t*, + rocksdb_filterpolicy_t*); +extern void rocksdb_options_set_create_if_missing( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_error_if_exists( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_paranoid_checks( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); +extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); +extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int); +extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); +extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); +extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*); +extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int); +extern void rocksdb_options_set_compression_options( + rocksdb_options_t*, int, int, int); +extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_prefix_extractor( + rocksdb_options_t*, rocksdb_slicetransform_t*); +extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t*, int); +extern void rocksdb_options_set_max_mem_compaction_level( + rocksdb_options_t*, int); +extern void rocksdb_options_set_target_file_size_base( + rocksdb_options_t*, uint64_t); +extern 
void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t*, int); +extern void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_max_bytes_for_level_multiplier( + rocksdb_options_t*, int); +extern void rocksdb_options_set_expanded_compaction_factor( + rocksdb_options_t*, int); +extern void rocksdb_options_set_max_grandparent_overlap_factor( + rocksdb_options_t*, int); +extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t*, int* level_values, size_t num_levels); +extern void rocksdb_options_enable_statistics(rocksdb_options_t*); + +extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int); +extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_log_file_size(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_keep_log_file_num(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_soft_rate_limit(rocksdb_options_t*, double); +extern void rocksdb_options_set_hard_rate_limit(rocksdb_options_t*, double); +extern void rocksdb_options_set_rate_limit_delay_max_milliseconds( + rocksdb_options_t*, unsigned int); +extern void rocksdb_options_set_max_manifest_file_size( + rocksdb_options_t*, size_t); +extern void rocksdb_options_set_no_block_cache( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_table_cache_numshardbits( + rocksdb_options_t*, int); +extern void rocksdb_options_set_table_cache_remove_scan_count_limit( + rocksdb_options_t*, int); +extern void rocksdb_options_set_arena_block_size( + rocksdb_options_t*, size_t); +extern void rocksdb_options_set_use_fsync( + 
rocksdb_options_t*, int); +extern void rocksdb_options_set_db_stats_log_interval( + rocksdb_options_t*, int); +extern void rocksdb_options_set_db_log_dir( + rocksdb_options_t*, const char*); +extern void rocksdb_options_set_wal_dir( + rocksdb_options_t*, const char*); +extern void rocksdb_options_set_WAL_ttl_seconds( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t*, size_t); +extern void rocksdb_options_set_purge_redundant_kvs_while_flush( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_allow_os_buffer( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_skip_log_error_on_recovery( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t*, unsigned int); +extern void rocksdb_options_set_block_size_deviation( + rocksdb_options_t*, int); +extern void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_access_hint_on_compaction_start( + rocksdb_options_t*, int); +extern void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_verify_checksums_in_compaction( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_filter_deletes( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_max_sequential_skip_in_iterations( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int); 
+extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int); +extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int); +extern void rocksdb_options_set_delete_obsolete_files_period_micros( + rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int); +extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*); +extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*); +extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t); +extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t); + +extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n); +extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec); + +extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level); + +extern void rocksdb_options_set_memtable_prefix_bloom_bits( + rocksdb_options_t*, uint32_t); +extern void rocksdb_options_set_memtable_prefix_bloom_probes( + rocksdb_options_t*, uint32_t); +extern void rocksdb_options_set_max_successive_merges( + rocksdb_options_t*, size_t); +extern void rocksdb_options_set_min_partial_merge_operands( + rocksdb_options_t*, uint32_t); +extern void rocksdb_options_set_bloom_locality( + rocksdb_options_t*, uint32_t); +extern void rocksdb_options_set_allow_thread_local( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_inplace_update_support( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t*, size_t); + +enum { + rocksdb_no_compression = 0, + rocksdb_snappy_compression = 1, + rocksdb_zlib_compression = 2, + rocksdb_bz2_compression = 3, + rocksdb_lz4_compression = 4, + rocksdb_lz4hc_compression 
= 5 +}; +extern void rocksdb_options_set_compression(rocksdb_options_t*, int); + +enum { + rocksdb_level_compaction = 0, + rocksdb_universal_compaction = 1 +}; +extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int); +extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*); +/* Comparator */ + +extern rocksdb_comparator_t* rocksdb_comparator_create( + void* state, + void (*destructor)(void*), + int (*compare)( + void*, + const char* a, size_t alen, + const char* b, size_t blen), + const char* (*name)(void*)); +extern void rocksdb_comparator_destroy(rocksdb_comparator_t*); + +/* Filter policy */ + +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( + void* state, + void (*destructor)(void*), + char* (*create_filter)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length), + unsigned char (*key_may_match)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length), + void (*delete_filter)( + void*, + const char* filter, size_t filter_length), + const char* (*name)(void*)); +extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*); + +extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom( + int bits_per_key); + +/* Merge Operator */ + +extern rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( + void* state, + void (*destructor)(void*), + char* (*full_merge)( + void*, + const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)( + void*, + const char* key, size_t key_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)( + void*, + const char* 
value, size_t value_length), + const char* (*name)(void*)); +extern void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t*); + +/* Read options */ + +extern rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*); +extern void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t*, + unsigned char); +extern void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t*, unsigned char); +extern void rocksdb_readoptions_set_prefix_seek( + rocksdb_readoptions_t*, unsigned char); +extern void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t*, + const rocksdb_snapshot_t*); +extern void rocksdb_readoptions_set_prefix( + rocksdb_readoptions_t*, const char* key, size_t keylen); +extern void rocksdb_readoptions_set_read_tier( + rocksdb_readoptions_t*, int); +extern void rocksdb_readoptions_set_tailing( + rocksdb_readoptions_t*, unsigned char); + +/* Write options */ + +extern rocksdb_writeoptions_t* rocksdb_writeoptions_create(); +extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*); +extern void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t*, unsigned char); +extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable); + +/* Flush options */ + +extern rocksdb_flushoptions_t* rocksdb_flushoptions_create(); +extern void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t*); +extern void rocksdb_flushoptions_set_wait( + rocksdb_flushoptions_t*, unsigned char); + +/* Cache */ + +extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity); +extern void rocksdb_cache_destroy(rocksdb_cache_t* cache); + +/* Env */ + +extern rocksdb_env_t* rocksdb_create_default_env(); +extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern void rocksdb_env_destroy(rocksdb_env_t*); + +/* SliceTransform */ + +extern 
rocksdb_slicetransform_t* rocksdb_slicetransform_create( + void* state, + void (*destructor)(void*), + char* (*transform)( + void*, + const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)( + void*, + const char* key, size_t length), + unsigned char (*in_range)( + void*, + const char* key, size_t length), + const char* (*name)(void*)); +extern rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t); +extern void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t*); + +/* Universal Compaction options */ + +enum { + rocksdb_similar_size_compaction_stop_style = 0, + rocksdb_total_size_compaction_stop_style = 1 +}; + +extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ; +extern void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t*, int); +extern void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t*); + +extern int rocksdb_livefiles_count( + const rocksdb_livefiles_t*); +extern const char* rocksdb_livefiles_name( + const rocksdb_livefiles_t*, + int index); +extern int rocksdb_livefiles_level( + const rocksdb_livefiles_t*, + int index); +extern size_t rocksdb_livefiles_size( + const rocksdb_livefiles_t*, + int index); +extern const char* rocksdb_livefiles_smallestkey( + const 
rocksdb_livefiles_t*, + int index, + size_t* size); +extern const char* rocksdb_livefiles_largestkey( + const rocksdb_livefiles_t*, + int index, + size_t* size); +extern void rocksdb_livefiles_destroy( + const rocksdb_livefiles_t*); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */ diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h new file mode 100644 index 00000000..b5821bac --- /dev/null +++ b/include/rocksdb/cache.h @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. +// +// A builtin cache implementation with a least-recently-used eviction +// policy is provided. Clients may use their own implementations if +// they want something more sophisticated (like scan-resistance, a +// custom eviction policy, variable cache sizing, etc.) + +#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_ +#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_ + +#include +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +using std::shared_ptr; + +class Cache; + +// Create a new cache with a fixed size capacity. 
The cache is sharded +// to 2^numShardBits shards, by hash of the key. The total capacity +// is divided and evenly assigned to each shard. Inside each shard, +// the eviction is done in two passes: first try to free spaces by +// evicting entries that are among the most least used removeScanCountLimit +// entries and do not have reference other than by the cache itself, in +// the least-used order. If not enough space is freed, further free the +// entries in least used order. +// +// The functions without parameter numShardBits and/or removeScanCountLimit +// use default values. removeScanCountLimit's default value is 0, which +// means a strict LRU order inside each shard. +extern shared_ptr NewLRUCache(size_t capacity); +extern shared_ptr NewLRUCache(size_t capacity, int numShardBits); +extern shared_ptr NewLRUCache(size_t capacity, int numShardBits, + int removeScanCountLimit); + +class Cache { + public: + Cache() { } + + // Destroys all existing entries by calling the "deleter" + // function that was passed to the constructor. + virtual ~Cache(); + + // Opaque handle to an entry stored in the cache. + struct Handle { }; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // + // Returns a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) = 0; + + // If the cache has no mapping for "key", returns nullptr. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + virtual Handle* Lookup(const Slice& key) = 0; + + // Release a mapping returned by a previous Lookup(). 
+ // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void Release(Handle* handle) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharing the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + // returns the maximum configured capacity of the cache + virtual size_t GetCapacity() const = 0; + + // returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; + + // Call this on shutdown if you want to speed it up. Cache will disown + // any underlying data and will not free it on delete. This call will leak + // memory - call this only if you're shutting down the process. + // Any attempts of using cache after this call will fail terribly. + // Always delete the DB object before calling this method! 
+ virtual void DisownData() { + // default implementation is noop + }; + + private: + void LRU_Remove(Handle* e); + void LRU_Append(Handle* e); + void Unref(Handle* e); + + struct Rep; + Rep* rep_; + + // No copying allowed + Cache(const Cache&); + void operator=(const Cache&); +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_UTIL_CACHE_H_ diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h new file mode 100644 index 00000000..59b05092 --- /dev/null +++ b/include/rocksdb/compaction_filter.h @@ -0,0 +1,198 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2013 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ +#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ + +#include +#include + +namespace rocksdb { + +class Slice; +class SliceTransform; + +// Context information of a compaction run +struct CompactionFilterContext { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; +}; + +// CompactionFilter allows an application to modify/delete a key-value at +// the time of compaction. 
+ +class CompactionFilter { + public: + // Context information of a compaction run + struct Context { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; + }; + + virtual ~CompactionFilter() {} + + // The compaction process invokes this + // method for kv that is being compacted. A return value + // of false indicates that the kv should be preserved in the + // output of this compaction run and a return value of true + // indicates that this key-value should be removed from the + // output of the compaction. The application can inspect + // the existing value of the key and make decision based on it. + // + // When the value is to be preserved, the application has the option + // to modify the existing_value and pass it back through new_value. + // value_changed needs to be set to true in this case. + // + // If multithreaded compaction is being used *and* a single CompactionFilter + // instance was supplied via Options::compaction_filter, this method may be + // called from different threads concurrently. The application must ensure + // that the call is thread-safe. + // + // If the CompactionFilter was created by a factory, then it will only ever + // be used by a single thread that is doing the compaction run, and this + // call does not need to be thread-safe. However, multiple filters may be + // in existence and operating concurrently. + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const = 0; + + // Returns a name that identifies this compaction filter. + // The name will be printed to LOG file on start up for diagnosis. 
+ virtual const char* Name() const = 0; +}; + +// CompactionFilterV2 that buffers kv pairs sharing the same prefix and let +// application layer to make individual decisions for all the kv pairs in the +// buffer. +class CompactionFilterV2 { + public: + virtual ~CompactionFilterV2() {} + + // The compaction process invokes this method for all the kv pairs + // sharing the same prefix. It is a "roll-up" version of CompactionFilter. + // + // Each entry in the return vector indicates if the corresponding kv should + // be preserved in the output of this compaction run. The application can + // inspect the exisitng values of the keys and make decision based on it. + // + // When a value is to be preserved, the application has the option + // to modify the entry in existing_values and pass it back through an entry + // in new_values. A corresponding values_changed entry needs to be set to + // true in this case. Note that the new_values vector contains only changed + // values, i.e. new_values.size() <= values_changed.size(). + // + typedef std::vector SliceVector; + virtual std::vector Filter(int level, + const SliceVector& keys, + const SliceVector& existing_values, + std::vector* new_values, + std::vector* values_changed) + const = 0; + + // Returns a name that identifies this compaction filter. + // The name will be printed to LOG file on start up for diagnosis. + virtual const char* Name() const = 0; +}; + +// Each compaction will create a new CompactionFilter allowing the +// application to know about different campactions +class CompactionFilterFactory { + public: + virtual ~CompactionFilterFactory() { } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) = 0; + + // Returns a name that identifies this compaction filter factory. 
+ virtual const char* Name() const = 0; +}; + +// Default implementaion of CompactionFilterFactory which does not +// return any filter +class DefaultCompactionFilterFactory : public CompactionFilterFactory { + public: + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr(nullptr); + } + + virtual const char* Name() const override { + return "DefaultCompactionFilterFactory"; + } +}; + +// Each compaction will create a new CompactionFilterV2 +// +// CompactionFilterFactoryV2 enables application to specify a prefix and use +// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all +// the kv-pairs sharing the same prefix. +// +// This is useful for applications that require grouping kv-pairs in +// compaction filter to make a purge/no-purge decision. For example, if the +// key prefix is user id and the rest of key represents the type of value. +// This batching filter will come in handy if the application's compaction +// filter requires knowledge of all types of values for any user id. +// +class CompactionFilterFactoryV2 { + public: + // NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor + explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor) + : prefix_extractor_(prefix_extractor) { } + + virtual ~CompactionFilterFactoryV2() { } + + virtual std::unique_ptr CreateCompactionFilterV2( + const CompactionFilterContext& context) = 0; + + // Returns a name that identifies this compaction filter factory. + virtual const char* Name() const = 0; + + const SliceTransform* GetPrefixExtractor() const { + return prefix_extractor_; + } + + void SetPrefixExtractor(const SliceTransform* prefix_extractor) { + prefix_extractor_ = prefix_extractor; + } + + private: + // Prefix extractor for compaction filter v2 + // Keys sharing the same prefix will be buffered internally. 
+ // Client can implement a Filter callback function to operate on the buffer + const SliceTransform* prefix_extractor_; +}; + +// Default implementaion of CompactionFilterFactoryV2 which does not +// return any filter +class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 { + public: + explicit DefaultCompactionFilterFactoryV2() + : CompactionFilterFactoryV2(nullptr) { } + + virtual std::unique_ptr + CreateCompactionFilterV2( + const CompactionFilterContext& context) override { + return std::unique_ptr(nullptr); + } + + virtual const char* Name() const override { + return "DefaultCompactionFilterFactoryV2"; + } +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h new file mode 100644 index 00000000..f3a8499a --- /dev/null +++ b/include/rocksdb/comparator.h @@ -0,0 +1,67 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ + +#include + +namespace rocksdb { + +class Slice; + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. A Comparator implementation +// must be thread-safe since rocksdb may invoke its methods concurrently +// from multiple threads. +class Comparator { + public: + virtual ~Comparator(); + + // Three-way comparison. 
Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + virtual int Compare(const Slice& a, const Slice& b) const = 0; + + // The name of the comparator. Used to check for comparator + // mismatches (i.e., a DB created with one comparator is + // accessed using a different comparator. + // + // The client of this package should switch to a new name whenever + // the comparator implementation changes in a way that will cause + // the relative ordering of any two keys to change. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. + virtual const char* Name() const = 0; + + // Advanced functions: these are used to reduce the space requirements + // for internal data structures like index blocks. + + // If *start < limit, changes *start to a short string in [start,limit). + // Simple comparator implementations may return with *start unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const = 0; + + // Changes *key to a short string >= *key. + // Simple comparator implementations may return with *key unchanged, + // i.e., an implementation of this method that does nothing is correct. + virtual void FindShortSuccessor(std::string* key) const = 0; +}; + +// Return a builtin comparator that uses lexicographic byte-wise +// ordering. The result remains the property of this module and +// must not be deleted. +extern const Comparator* BytewiseComparator(); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h new file mode 100644 index 00000000..1f8d2f37 --- /dev/null +++ b/include/rocksdb/db.h @@ -0,0 +1,342 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_ +#define STORAGE_ROCKSDB_INCLUDE_DB_H_ + +#include +#include +#include +#include +#include +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" + +namespace rocksdb { + +using std::unique_ptr; + +// Update Makefile if you change these +static const int kMajorVersion = 2; +static const int kMinorVersion = 0; + +struct Options; +struct ReadOptions; +struct WriteOptions; +struct FlushOptions; +struct TableProperties; +class WriteBatch; +class Env; + +// Metadata associated with each SST file. +struct LiveFileMetaData { + std::string name; // Name of the file + int level; // Level at which this file resides. + size_t size; // File size in bytes. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + SequenceNumber smallest_seqno; // smallest seqno in file + SequenceNumber largest_seqno; // largest seqno in file +}; + +// Abstract handle to particular state of a DB. +// A Snapshot is an immutable object and can therefore be safely +// accessed from multiple threads without any external synchronization. 
+class Snapshot { + protected: + virtual ~Snapshot(); +}; + +// A range of keys +struct Range { + Slice start; // Included in the range + Slice limit; // Not included in the range + + Range() { } + Range(const Slice& s, const Slice& l) : start(s), limit(l) { } +}; + +// A collections of table properties objects, where +// key: is the table's file name. +// value: the table properties object of the given table. +typedef std::unordered_map> + TablePropertiesCollection; + +// A DB is a persistent ordered map from keys to values. +// A DB is safe for concurrent access from multiple threads without +// any external synchronization. +class DB { + public: + // Open the database with the specified "name". + // Stores a pointer to a heap-allocated database in *dbptr and returns + // OK on success. + // Stores nullptr in *dbptr and returns a non-OK status on error. + // Caller should delete *dbptr when it is no longer needed. + static Status Open(const Options& options, + const std::string& name, + DB** dbptr); + + // Open the database for read only. All DB interfaces + // that modify data, like put/delete, will return error. + // If the db is opened in read only mode, then no compactions + // will happen. + static Status OpenForReadOnly(const Options& options, + const std::string& name, DB** dbptr, + bool error_if_log_file_exist = false); + + DB() { } + virtual ~DB(); + + // Set the database entry for "key" to "value". + // Returns OK on success, and a non-OK status on error. + // Note: consider setting options.sync = true. + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Remove the database entry (if any) for "key". Returns OK on + // success, and a non-OK status on error. It is not an error if "key" + // did not exist in the database. + // Note: consider setting options.sync = true. + virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; + + // Merge the database entry for "key" with "value". 
Returns OK on success, + // and a non-OK status on error. The semantics of this operation is + // determined by the user provided merge_operator when opening DB. + // Note: consider setting options.sync = true. + virtual Status Merge(const WriteOptions& options, + const Slice& key, + const Slice& value) = 0; + + // Apply the specified updates to the database. + // Returns OK on success, non-OK on failure. + // Note: consider setting options.sync = true. + virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. + // + // If there is no entry for "key" leave *value unchanged and return + // a status for which Status::IsNotFound() returns true. + // + // May return some other Status on an error. + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value) = 0; + + // If keys[i] does not exist in the database, then the i'th returned + // status will be one for which Status::IsNotFound() is true, and + // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, + // the i'th returned status will have Status::ok() true, and (*values)[i] + // will store the value associated with keys[i]. + // + // (*values) will always be resized to be the same size as (keys). + // Similarly, the number of returned statuses will be the number of keys. + // Note: keys will not be "de-duplicated". Duplicate keys will return + // duplicate values in order. + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) = 0; + + // If the key definitely does not exist in the database, then this method + // returns false, else true. If the caller wants to obtain value when the key + // is found in memory, a bool for 'value_found' must be passed. 'value_found' + // will be true on return if value has been set properly. 
+ // This check is potentially lighter-weight than invoking DB::Get(). One way + // to make this lighter weight is to avoid doing any IOs. + // Default implementation here returns true and sets 'value_found' to false + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) { + if (value_found != nullptr) { + *value_found = false; + } + return true; + } + + // Return a heap-allocated iterator over the contents of the database. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // Caller should delete the iterator when it is no longer needed. + // The returned iterator should be deleted before this db is deleted. + virtual Iterator* NewIterator(const ReadOptions& options) = 0; + + // Return a handle to the current DB state. Iterators created with + // this handle will all observe a stable snapshot of the current DB + // state. The caller must call ReleaseSnapshot(result) when the + // snapshot is no longer needed. + virtual const Snapshot* GetSnapshot() = 0; + + // Release a previously acquired snapshot. The caller must not + // use "snapshot" after this call. + virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; + + // DB implementations can export properties about their state + // via this method. If "property" is a valid property understood by this + // DB implementation, fills "*value" with its current value and returns + // true. Otherwise returns false. + // + // + // Valid property names include: + // + // "rocksdb.num-files-at-level" - return the number of files at level , + // where is an ASCII representation of a level number (e.g. "0"). + // "rocksdb.stats" - returns a multi-line string that describes statistics + // about the internal operation of the DB. + // "rocksdb.sstables" - returns a multi-line string that describes all + // of the sstables that make up the db contents. 
+ virtual bool GetProperty(const Slice& property, std::string* value) = 0; + + // For each i in [0,n-1], store in "sizes[i]", the approximate + // file system space used by keys in "[range[i].start .. range[i].limit)". + // + // Note that the returned sizes measure file system space usage, so + // if the user data compresses by a factor of ten, the returned + // sizes will be one-tenth the size of the corresponding user data size. + // + // The results may not include the sizes of recently written data. + virtual void GetApproximateSizes(const Range* range, int n, + uint64_t* sizes) = 0; + + // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. + // In particular, deleted and overwritten versions are discarded, + // and the data is rearranged to reduce the cost of operations + // needed to access the data. This operation should typically only + // be invoked by users who understand the underlying implementation. + // + // begin==nullptr is treated as a key before all keys in the database. + // end==nullptr is treated as a key after all keys in the database. + // Therefore the following call will compact the entire database: + // db->CompactRange(nullptr, nullptr); + // Note that after the entire database is compacted, all data are pushed + // down to the last level containing any data. If the total data size + // after compaction is reduced, that level might not be appropriate for + // hosting all the files. In this case, client could set reduce_level + // to true, to move the files back to the minimum level capable of holding + // the data set or a given level (specified by non-negative target_level). + virtual Status CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, + int target_level = -1) = 0; + + // Number of levels used for this DB. 
+ virtual int NumberLevels() = 0; + + // Maximum level to which a new compacted memtable is pushed if it + // does not create overlap. + virtual int MaxMemCompactionLevel() = 0; + + // Number of files in level-0 that would stop writes. + virtual int Level0StopWriteTrigger() = 0; + + // Get DB name -- the exact same name that was provided as an argument to + // DB::Open() + virtual const std::string& GetName() const = 0; + + // Get Env object from the DB + virtual Env* GetEnv() const = 0; + + // Get DB Options that we use + virtual const Options& GetOptions() const = 0; + + // Flush all mem-table data. + virtual Status Flush(const FlushOptions& options) = 0; + + // Prevent file deletions. Compactions will continue to occur, + // but no obsolete files will be deleted. Calling this multiple + // times have the same effect as calling it once. + virtual Status DisableFileDeletions() = 0; + + // Allow compactions to delete obselete files. + // If force == true, the call to EnableFileDeletions() will guarantee that + // file deletions are enabled after the call, even if DisableFileDeletions() + // was called multiple times before. + // If force == false, EnableFileDeletions will only enable file deletion + // after it's been called at least as many times as DisableFileDeletions(), + // enabling the two methods to be called by two threads concurrently without + // synchronization -- i.e., file deletions will be enabled only after both + // threads call EnableFileDeletions() + virtual Status EnableFileDeletions(bool force = true) = 0; + + // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup + + // THIS METHOD IS DEPRECATED. Use the GetLiveFilesMetaData to get more + // detailed information on the live files. + // Retrieve the list of all files in the database. The files are + // relative to the dbname and are not absolute paths. The valid size of the + // manifest file is returned in manifest_file_size. 
The manifest file is an + // ever growing file, but only the portion specified by manifest_file_size is + // valid for this snapshot. + // Setting flush_memtable to true does Flush before recording the live files. + // Setting flush_memtable to false is useful when we don't want to wait for + // flush which may have to wait for compaction to complete taking an + // indeterminate time. But this will have to use GetSortedWalFiles after + // GetLiveFiles to compensate for memtables missed in this snapshot due to the + // absence of Flush, by WAL files to recover the database consistently later + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) = 0; + + // Retrieve the sorted list of all wal files with earliest file first + virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; + + // The sequence number of the most recent transaction. + virtual SequenceNumber GetLatestSequenceNumber() const = 0; + + // Sets iter to an iterator that is positioned at a write-batch containing + // seq_number. If the sequence number is non existent, it returns an iterator + // at the first available seq_no after the requested seq_no + // Returns Status::OK if iterator is valid + // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to + // use this api, else the WAL files will get + // cleared aggressively and the iterator might keep getting invalid before + // an update is read. + virtual Status GetUpdatesSince( + SequenceNumber seq_number, unique_ptr* iter, + const TransactionLogIterator::ReadOptions& + read_options = TransactionLogIterator::ReadOptions()) = 0; + + // Delete the file name from the db directory and update the internal state to + // reflect that. Supports deletion of sst and log files only. 'name' must be + // path relative to the db directory. eg. 
000001.sst, /archive/000003.log + virtual Status DeleteFile(std::string name) = 0; + + // Returns a list of all table files with their level, start key + // and end key + virtual void GetLiveFilesMetaData(std::vector* metadata) {} + + // Sets the globally unique ID created at database creation time by invoking + // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could + // be set properly + virtual Status GetDbIdentity(std::string& identity) = 0; + + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) = 0; + + private: + // No copying allowed + DB(const DB&); + void operator=(const DB&); +}; + +// Destroy the contents of the specified database. +// Be very careful using this method. +Status DestroyDB(const std::string& name, const Options& options); + +// If a DB cannot be opened, you may attempt to call this method to +// resurrect as much of the contents of the database as possible. +// Some data may be lost, so be careful when calling this function +// on a database that contains important information. +Status RepairDB(const std::string& dbname, const Options& options); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_ diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h new file mode 100644 index 00000000..a6442517 --- /dev/null +++ b/include/rocksdb/env.h @@ -0,0 +1,765 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// An Env is an interface used by the rocksdb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_ +#define STORAGE_ROCKSDB_INCLUDE_ENV_H_ + +#include +#include +#include +#include +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class FileLock; +class Logger; +class RandomAccessFile; +class SequentialFile; +class Slice; +class WritableFile; +class RandomRWFile; +class Directory; +struct Options; + +using std::unique_ptr; +using std::shared_ptr; + + +// Options while opening a file to read/write +struct EnvOptions { + + // construct with default Options + EnvOptions(); + + // construct from Options + explicit EnvOptions(const Options& options); + + // If true, then allow caching of data in environment buffers + bool use_os_buffer = true; + + // If true, then use mmap to read data + bool use_mmap_reads = false; + + // If true, then use mmap to write data + bool use_mmap_writes = true; + + // If true, set the FD_CLOEXEC on open fd. + bool set_fd_cloexec = true; + + // Allows OS to incrementally sync files to disk while they are being + // written, in the background. Issue one request for every bytes_per_sync + // written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync = 0; + + // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which + // means that file size won't change as part of preallocation. + // If false, preallocation will also change the file size. This option will + // improve the performance in workloads where you sync the data on every + // write. 
By default, we set it to true for MANIFEST writes and false for + // WAL writes + bool fallocate_with_keep_size = true; +}; + +class Env { + public: + Env() { } + virtual ~Env(); + + // Return a default environment suitable for the current operating + // system. Sophisticated users may wish to provide their own Env + // implementation instead of relying on this default environment. + // + // The result of Default() belongs to rocksdb and must never be deleted. + static Env* Default(); + + // Create a brand new sequentially-readable file with the specified name. + // On success, stores a pointer to the new file in *result and returns OK. + // On failure stores nullptr in *result and returns non-OK. If the file does + // not exist, returns a non-OK status. + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) + = 0; + + // Create a brand new random access read-only file with the + // specified name. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. If the file does not exist, returns a non-OK + // status. + // + // The returned file may be concurrently accessed by multiple threads. + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) + = 0; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. 
+ virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) = 0; + + // Create an object that both reads and writes to a file on + // specified offsets (random access). If file already exists, + // does not overwrite it. On success, stores a pointer to the + // new file in *result and returns OK. On failure stores nullptr + // in *result and returns non-OK. + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) = 0; + + // Create an object that represents a directory. Will fail if directory + // doesn't exist. If the directory exists, it will open the directory + // and create a new Directory object. + // + // On success, stores a pointer to the new Directory in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + virtual Status NewDirectory(const std::string& name, + unique_ptr* result) = 0; + + // Returns true iff the named file exists. + virtual bool FileExists(const std::string& fname) = 0; + + // Store in *result the names of the children of the specified directory. + // The names are relative to "dir". + // Original contents of *results are dropped. + virtual Status GetChildren(const std::string& dir, + std::vector* result) = 0; + + // Delete the named file. + virtual Status DeleteFile(const std::string& fname) = 0; + + // Create the specified directory. Returns error if directory exists. + virtual Status CreateDir(const std::string& dirname) = 0; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + virtual Status CreateDirIfMissing(const std::string& dirname) = 0; + + // Delete the specified directory. + virtual Status DeleteDir(const std::string& dirname) = 0; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0; + + // Store the last modification time of fname in *file_mtime. 
+ virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) = 0; + // Rename file src to target. + virtual Status RenameFile(const std::string& src, + const std::string& target) = 0; + + // Lock the specified file. Used to prevent concurrent access to + // the same db by multiple processes. On failure, stores nullptr in + // *lock and returns non-OK. + // + // On success, stores a pointer to the object that represents the + // acquired lock in *lock and returns OK. The caller should call + // UnlockFile(*lock) to release the lock. If the process exits, + // the lock will be automatically released. + // + // If somebody else already holds the lock, finishes immediately + // with a failure. I.e., this call does not wait for existing locks + // to go away. + // + // May create the named file if it does not already exist. + virtual Status LockFile(const std::string& fname, FileLock** lock) = 0; + + // Release the lock acquired by a previous successful call to LockFile. + // REQUIRES: lock was returned by a successful LockFile() call + // REQUIRES: lock has not already been unlocked. + virtual Status UnlockFile(FileLock* lock) = 0; + + enum Priority { LOW, HIGH, TOTAL }; + + // Arrange to run "(*function)(arg)" once in a background thread, in + // the thread pool specified by pri. By default, jobs go to the 'LOW' + // priority thread pool. + + // "function" may run in an unspecified thread. Multiple functions + // added to the same Env may run concurrently in different threads. + // I.e., the caller may not assume that background work items are + // serialized. + virtual void Schedule( + void (*function)(void* arg), + void* arg, + Priority pri = LOW) = 0; + + // Start a new thread, invoking "function(arg)" within the new thread. + // When "function(arg)" returns, the thread will be destroyed. + virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + + // Wait for all threads started by StartThread to terminate. 
+ virtual void WaitForJoin() {} + + // Get thread pool queue length for specific thrad pool. + virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const { + return 0; + } + + // *path is set to a temporary directory that can be used for testing. It may + // or many not have just been created. The directory may or may not differ + // between runs of the same process, but subsequent calls will return the + // same directory. + virtual Status GetTestDirectory(std::string* path) = 0; + + // Create and return a log file for storing informational messages. + virtual Status NewLogger(const std::string& fname, + shared_ptr* result) = 0; + + // Returns the number of micro-seconds since some fixed point in time. Only + // useful for computing deltas of time. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros + virtual uint64_t NowNanos() { + return NowMicros() * 1000; + } + + // Sleep/delay the thread for the perscribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the current host name. + virtual Status GetHostName(char* name, uint64_t len) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Get full directory name for this db. + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) = 0; + + // The number of background worker threads of a specific thread pool + // for this environment. 'LOW' is the default pool. 
+ // default number: 1 + virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; + + // Generates a unique id that can be used to identify a db + virtual std::string GenerateUniqueId(); + + // OptimizeForLogWrite will create a new EnvOptions object that is a copy of + // the EnvOptions in the parameters, but is optimized for writing log files. + // Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const; + // OptimizeForManifestWrite will create a new EnvOptions object that is a copy + // of the EnvOptions in the parameters, but is optimized for writing manifest + // files. Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) + const; + + private: + // No copying allowed + Env(const Env&); + void operator=(const Env&); +}; + +// A file abstraction for reading sequentially through a file +class SequentialFile { + public: + SequentialFile() { } + virtual ~SequentialFile(); + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) = 0; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. 
+ // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) = 0; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t offset, size_t length) { + return Status::NotSupported("InvalidateCache not supported."); + } +}; + +// A file abstraction for randomly reading the contents of a file. +class RandomAccessFile { + public: + RandomAccessFile() { } + virtual ~RandomAccessFile(); + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to eachother by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. 
+ virtual size_t GetUniqueId(char* id, size_t max_size) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. + }; + + + enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; + + virtual void Hint(AccessPattern pattern) {} + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t offset, size_t length) { + return Status::NotSupported("InvalidateCache not supported."); + } +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class WritableFile { + public: + WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) { + } + virtual ~WritableFile(); + + virtual Status Append(const Slice& data) = 0; + virtual Status Close() = 0; + virtual Status Flush() = 0; + virtual Status Sync() = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual Status Fsync() { + return Sync(); + } + + /* + * Get the size of valid data in the file. + */ + virtual uint64_t GetFileSize() { + return 0; + } + + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. 
+ */ + void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + // For documentation, refer to RandomAccessFile::GetUniqueId() + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return 0; // Default implementation to prevent issues with backwards + } + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + virtual Status InvalidateCache(size_t offset, size_t length) { + return Status::NotSupported("InvalidateCache not supported."); + } + + protected: + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + void PrepareWrite(size_t offset, size_t len) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necesessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + /* + * Pre-allocate space for a file. 
+ */ + virtual Status Allocate(off_t offset, off_t len) { + return Status::OK(); + } + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual Status RangeSync(off_t offset, off_t nbytes) { + return Status::OK(); + } + + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; + // No copying allowed + WritableFile(const WritableFile&); + void operator=(const WritableFile&); +}; + +// A file abstraction for random reading and writing. +class RandomRWFile { + public: + RandomRWFile() {} + virtual ~RandomRWFile() {} + + // Write data from Slice data to file starting from offset + // Returns IOError on failure, but does not guarantee + // atomicity of a write. Returns OK status on success. + // + // Safe for concurrent use. + virtual Status Write(uint64_t offset, const Slice& data) = 0; + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const = 0; + virtual Status Close() = 0; // closes the file + virtual Status Sync() = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only data. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual Status Fsync() { + return Sync(); + } + + /* + * Pre-allocate space for a file. 
+ */ + virtual Status Allocate(off_t offset, off_t len) { + return Status::OK(); + } + + private: + // No copying allowed + RandomRWFile(const RandomRWFile&); + void operator=(const RandomRWFile&); +}; + +// Directory object represents collection of files and implements +// filesystem operations that can be executed on directories. +class Directory { + public: + virtual ~Directory() {} + // Fsync directory + virtual Status Fsync() = 0; +}; + +enum InfoLogLevel : unsigned char { + DEBUG = 0, + INFO, + WARN, + ERROR, + FATAL, + NUM_INFO_LOG_LEVELS, +}; + +// An interface for writing log messages. +class Logger { + public: + enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 }; + explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO) + : log_level_(log_level) {} + virtual ~Logger(); + + // Write an entry to the log file with the specified format. + virtual void Logv(const char* format, va_list ap) = 0; + + // Write an entry to the log file with the specified log level + // and format. Any log with level under the internal log level + // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be + // printed. + void Logv(const InfoLogLevel log_level, const char* format, va_list ap) { + static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", + "ERROR", "FATAL"}; + if (log_level < log_level_) { + return; + } + + if (log_level == INFO) { + // Doesn't print log level if it is INFO level. + // This is to avoid unexpected performance regression after we add + // the feature of log level. All the logs before we add the feature + // are INFO level. We don't want to add extra costs to those existing + // logging. 
+ Logv(format, ap); + } else { + char new_format[500]; + snprintf(new_format, sizeof(new_format) - 1, "[%s] %s", + kInfoLogLevelNames[log_level], format); + Logv(new_format, ap); + } + } + virtual size_t GetLogFileSize() const { + return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE; + } + // Flush to the OS buffers + virtual void Flush() {} + virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; } + virtual void SetInfoLogLevel(const InfoLogLevel log_level) { + log_level_ = log_level; + } + + private: + // No copying allowed + Logger(const Logger&); + void operator=(const Logger&); + InfoLogLevel log_level_; +}; + + +// Identifies a locked file. +class FileLock { + public: + FileLock() { } + virtual ~FileLock(); + private: + // No copying allowed + FileLock(const FileLock&); + void operator=(const FileLock&); +}; + +extern void LogFlush(const shared_ptr& info_log); + +extern void Log(const InfoLogLevel log_level, + const shared_ptr& info_log, const char* format, ...); + +// a set of log functions with different log levels. +extern void Debug(const shared_ptr& info_log, const char* format, ...); +extern void Info(const shared_ptr& info_log, const char* format, ...); +extern void Warn(const shared_ptr& info_log, const char* format, ...); +extern void Error(const shared_ptr& info_log, const char* format, ...); +extern void Fatal(const shared_ptr& info_log, const char* format, ...); + +// Log the specified data to *info_log if info_log is non-nullptr. +// The default info log level is InfoLogLevel::ERROR. +extern void Log(const shared_ptr& info_log, const char* format, ...) +# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 2, 3))) +# endif + ; + +extern void LogFlush(Logger *info_log); + +extern void Log(const InfoLogLevel log_level, Logger* info_log, + const char* format, ...); + +// The default info log level is InfoLogLevel::ERROR. +extern void Log(Logger* info_log, const char* format, ...) 
+# if defined(__GNUC__) || defined(__clang__) + __attribute__((__format__ (__printf__, 2, 3))) +# endif + ; + +// a set of log functions with different log levels. +extern void Debug(Logger* info_log, const char* format, ...); +extern void Info(Logger* info_log, const char* format, ...); +extern void Warn(Logger* info_log, const char* format, ...); +extern void Error(Logger* info_log, const char* format, ...); +extern void Fatal(Logger* info_log, const char* format, ...); + +// A utility routine: write "data" to the named file. +extern Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname); + +// A utility routine: read contents of named file into *data +extern Status ReadFileToString(Env* env, const std::string& fname, + std::string* data); + +// An implementation of Env that forwards all calls to another Env. +// May be useful to clients who wish to override just part of the +// functionality of another Env. +class EnvWrapper : public Env { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit EnvWrapper(Env* t) : target_(t) { } + virtual ~EnvWrapper(); + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) { + return target_->NewSequentialFile(f, r, options); + } + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) { + return target_->NewRandomAccessFile(f, r, options); + } + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) { + return target_->NewWritableFile(f, r, options); + } + Status NewRandomRWFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) { + return target_->NewRandomRWFile(f, r, options); + } + virtual Status NewDirectory(const std::string& name, + unique_ptr* result) { + 
return target_->NewDirectory(name, result); + } + bool FileExists(const std::string& f) { return target_->FileExists(f); } + Status GetChildren(const std::string& dir, std::vector* r) { + return target_->GetChildren(dir, r); + } + Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); } + Status CreateDir(const std::string& d) { return target_->CreateDir(d); } + Status CreateDirIfMissing(const std::string& d) { + return target_->CreateDirIfMissing(d); + } + Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); } + Status GetFileSize(const std::string& f, uint64_t* s) { + return target_->GetFileSize(f, s); + } + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + return target_->GetFileModificationTime(fname, file_mtime); + } + + Status RenameFile(const std::string& s, const std::string& t) { + return target_->RenameFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { + return target_->LockFile(f, l); + } + Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); } + void Schedule(void (*f)(void*), void* a, Priority pri) { + return target_->Schedule(f, a, pri); + } + void StartThread(void (*f)(void*), void* a) { + return target_->StartThread(f, a); + } + void WaitForJoin() { return target_->WaitForJoin(); } + virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const { + return target_->GetThreadPoolQueueLen(pri); + } + virtual Status GetTestDirectory(std::string* path) { + return target_->GetTestDirectory(path); + } + virtual Status NewLogger(const std::string& fname, + shared_ptr* result) { + return target_->NewLogger(fname, result); + } + uint64_t NowMicros() { + return target_->NowMicros(); + } + void SleepForMicroseconds(int micros) { + target_->SleepForMicroseconds(micros); + } + Status GetHostName(char* name, uint64_t len) { + return target_->GetHostName(name, len); + } + Status GetCurrentTime(int64_t* unix_time) { + return 
target_->GetCurrentTime(unix_time); + } + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + return target_->GetAbsolutePath(db_path, output_path); + } + void SetBackgroundThreads(int num, Priority pri) { + return target_->SetBackgroundThreads(num, pri); + } + std::string TimeToString(uint64_t time) { + return target_->TimeToString(time); + } + + private: + Env* target_; +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_ diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h new file mode 100644 index 00000000..fa44db45 --- /dev/null +++ b/include/rocksdb/filter_policy.h @@ -0,0 +1,74 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A database can be configured with a custom FilterPolicy object. +// This object is responsible for creating a small filter from a set +// of keys. These filters are stored in rocksdb and are consulted +// automatically by rocksdb to decide whether or not to read some +// information from disk. In many cases, a filter can cut down the +// number of disk seeks form a handful to a single disk seek per +// DB::Get() call. +// +// Most people will want to use the builtin bloom filter support (see +// NewBloomFilterPolicy() below). + +#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ +#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ + +#include + +namespace rocksdb { + +class Slice; + +class FilterPolicy { + public: + virtual ~FilterPolicy(); + + // Return the name of this policy. 
Note that if the filter encoding + // changes in an incompatible way, the name returned by this method + // must be changed. Otherwise, old incompatible filters may be + // passed to methods of this type. + virtual const char* Name() const = 0; + + // keys[0,n-1] contains a list of keys (potentially with duplicates) + // that are ordered according to the user supplied comparator. + // Append a filter that summarizes keys[0,n-1] to *dst. + // + // Warning: do not change the initial contents of *dst. Instead, + // append the newly constructed filter to *dst. + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) + const = 0; + + // "filter" contains the data appended by a preceding call to + // CreateFilter() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; +}; + +// Return a new filter policy that uses a bloom filter with approximately +// the specified number of bits per key. A good value for bits_per_key +// is 10, which yields a filter with ~ 1% false positive rate. +// +// Callers must delete the result after any database that is using the +// result has been closed. +// +// Note: if you are using a custom comparator that ignores some parts +// of the keys being compared, you must not use NewBloomFilterPolicy() +// and must provide your own FilterPolicy that also ignores the +// corresponding parts of the keys. For example, if the comparator +// ignores trailing spaces, it would be incorrect to use a +// FilterPolicy (like NewBloomFilterPolicy) that does not ignore +// trailing spaces in keys. 
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); + +} + +#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff --git a/include/rocksdb/flush_block_policy.h b/include/rocksdb/flush_block_policy.h new file mode 100644 index 00000000..8340ad61 --- /dev/null +++ b/include/rocksdb/flush_block_policy.h @@ -0,0 +1,58 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include + +namespace rocksdb { + +class Slice; +class BlockBuilder; +struct Options; + +// FlushBlockPolicy provides a configurable way to determine when to flush a +// block in the block based tables, +class FlushBlockPolicy { + public: + // Keep track of the key/value sequences and return the boolean value to + // determine if table builder should flush current data block. + virtual bool Update(const Slice& key, + const Slice& value) = 0; + + virtual ~FlushBlockPolicy() { } +}; + +class FlushBlockPolicyFactory { + public: + // Return the name of the flush block policy. + virtual const char* Name() const = 0; + + // Return a new block flush policy that flushes data blocks by data size. + // FlushBlockPolicy may need to access the metadata of the data block + // builder to determine when to flush the blocks. + // + // Callers must delete the result after any database that is using the + // result has been closed. 
+ virtual FlushBlockPolicy* NewFlushBlockPolicy( + const Options& options, const BlockBuilder& data_block_builder) const = 0; + + virtual ~FlushBlockPolicyFactory() { } +}; + +class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { + public: + FlushBlockBySizePolicyFactory() {} + + virtual const char* Name() const override { + return "FlushBlockBySizePolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const Options& options, + const BlockBuilder& data_block_builder) const override; +}; + +} // rocksdb diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h new file mode 100644 index 00000000..7538e9cf --- /dev/null +++ b/include/rocksdb/iterator.h @@ -0,0 +1,106 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An iterator yields a sequence of key/value pairs from a source. +// The following class defines the interface. Multiple implementations +// are provided by this library. In particular, iterators are provided +// to access the contents of a Table or a DB. +// +// Multiple threads can invoke const methods on an Iterator without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Iterator must use +// external synchronization. 
+ +#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class Iterator { + public: + Iterator(); + virtual ~Iterator(); + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + virtual void Seek(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: !AtEnd() && !AtStart() + virtual Slice value() const = 0; + + // If an error has occurred, return it. Else return an ok status. 
+ // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). + virtual Status status() const = 0; + + // Clients are allowed to register function/arg1/arg2 triples that + // will be invoked when this iterator is destroyed. + // + // Note that unlike all of the preceding methods, this method is + // not abstract and therefore clients should not override it. + typedef void (*CleanupFunction)(void* arg1, void* arg2); + void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); + + private: + struct Cleanup { + CleanupFunction function; + void* arg1; + void* arg2; + Cleanup* next; + }; + Cleanup cleanup_; + + // No copying allowed + Iterator(const Iterator&); + void operator=(const Iterator&); +}; + +// Return an empty iterator (yields nothing). +extern Iterator* NewEmptyIterator(); + +// Return an empty iterator with the specified status. +extern Iterator* NewErrorIterator(const Status& status); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h new file mode 100644 index 00000000..a46b6a75 --- /dev/null +++ b/include/rocksdb/ldb_tool.h @@ -0,0 +1,18 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H +#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H +#include "rocksdb/options.h" + +namespace rocksdb { + +class LDBTool { + public: + void Run(int argc, char** argv, Options = Options()); +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h new file mode 100644 index 00000000..05f1aebc --- /dev/null +++ b/include/rocksdb/memtablerep.h @@ -0,0 +1,230 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file contains the interface that must be implemented by any collection +// to be used as the backing store for a MemTable. Such a collection must +// satisfy the following properties: +// (1) It does not store duplicate items. +// (2) It uses MemTableRep::KeyComparator to compare items for iteration and +// equality. +// (3) It can be accessed concurrently by multiple readers and can support +// during reads. However, it needn't support multiple concurrent writes. +// (4) Items are never deleted. +// The liberal use of assertions is encouraged to enforce (1). +// +// The factory will be passed an Arena object when a new MemTableRep is +// requested. The API for this object is in rocksdb/arena.h. +// +// Users can implement their own memtable representations. We include three +// types built in: +// - SkipListRep: This is the default; it is backed by a skip list. +// - HashSkipListRep: The memtable rep that is best used for keys that are +// structured like "prefix:suffix" where iteration within a prefix is +// common and iteration across different prefixes is rare. It is backed by +// a hash map where each bucket is a skip list. 
+// - VectorRep: This is backed by an unordered std::vector. On iteration, the +// vector is sorted. It is intelligent about sorting; once the MarkReadOnly() +// has been called, the vector will only be sorted once. It is optimized for +// random-write-heavy workloads. +// +// The last four implementations are designed for situations in which +// iteration over the entire collection is rare since doing so requires all the +// keys to be copied into a sorted data structure. + +#pragma once + +#include +#include + +namespace rocksdb { + +class Arena; +class LookupKey; +class Slice; +class SliceTransform; + +typedef void* KeyHandle; + +class MemTableRep { + public: + // KeyComparator provides a means to compare keys, which are internal keys + // concatenated with values. + class KeyComparator { + public: + // Compare a and b. Return a negative value if a is less than b, 0 if they + // are equal, and a positive value if a is greater than b + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const = 0; + + virtual int operator()(const char* prefix_len_key, + const Slice& key) const = 0; + + virtual ~KeyComparator() { } + }; + + explicit MemTableRep(Arena* arena) : arena_(arena) {} + + // Allocate a buf of len size for storing key. The idea is that a specific + // memtable representation knows its underlying data structure better. By + // allowing it to allocate memory, it can possibly put correlated stuff + // in consecutive memory area to make processor prefetching more efficient. + virtual KeyHandle Allocate(const size_t len, char** buf); + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert). + // REQUIRES: nothing that compares equal to key is currently in the + // collection. + virtual void Insert(KeyHandle handle) = 0; + + // Returns true iff an entry that compares equal to key is in the collection. 
+ virtual bool Contains(const char* key) const = 0; + + // Notify this table rep that it will no longer be added to. By default, does + // nothing. + virtual void MarkReadOnly() { } + + // Look up key from the mem table, since the first key in the mem table whose + // user_key matches the one given k, call the function callback_func(), with + // callback_args directly forwarded as the first parameter, and the mem table + // key as the second parameter. If the return value is false, then terminates. + // Otherwise, go through the next key. + // It's safe for Get() to terminate after having finished all the potential + // key for the k.user_key(), or not. + // + // Default: + // Get() function with a default value of dynamically construct an iterator, + // seek and call the call back function. + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)); + + // Report an approximation of how much memory has been used other than memory + // that was allocated through the arena. + virtual size_t ApproximateMemoryUsage() = 0; + + virtual ~MemTableRep() { } + + // Iteration over the contents of a skip collection + class Iterator { + public: + // Initialize an iterator over the specified collection. + // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + virtual ~Iterator() {} + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const = 0; + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const = 0; + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; + + // Position at the first entry in collection. 
+ // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() = 0; + }; + + // Return an iterator over the keys in this representation. + virtual Iterator* GetIterator() = 0; + + // Return an iterator over at least the keys with the specified user key. The + // iterator may also allow access to other keys, but doesn't have to. Default: + // GetIterator(). + virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); } + + // Return an iterator over at least the keys with the specified prefix. The + // iterator may also allow access to other keys, but doesn't have to. Default: + // GetIterator(). + virtual Iterator* GetPrefixIterator(const Slice& prefix) { + return GetIterator(); + } + + // Return an iterator that has a special Seek semantics. The result of + // a Seek might only include keys with the same prefix as the target key. + virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); } + + protected: + // When *key is an internal key concatenated with the value, returns the + // user key. + virtual Slice UserKey(const char* key) const; + + Arena* arena_; +}; + +// This is the base class for all factories that are used by RocksDB to create +// new MemTableRep objects +class MemTableRepFactory { + public: + virtual ~MemTableRepFactory() {} + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, + Arena*, const SliceTransform*) = 0; + virtual const char* Name() const = 0; +}; + +// This creates MemTableReps that are backed by an std::vector. On iteration, +// the vector is sorted. This is useful for workloads where iteration is very +// rare and writes are generally not issued after reads begin. +// +// Parameters: +// count: Passed to the constructor of the underlying std::vector of each +// VectorRep. 
On initialization, the underlying array will be at least count +// bytes reserved for usage. +class VectorRepFactory : public MemTableRepFactory { + const size_t count_; + + public: + explicit VectorRepFactory(size_t count = 0) : count_(count) { } + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator&, Arena*, + const SliceTransform*) override; + virtual const char* Name() const override { + return "VectorRepFactory"; + } +}; + +// This uses a skip list to store keys. It is the default. +class SkipListFactory : public MemTableRepFactory { + public: + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator&, Arena*, + const SliceTransform*) override; + virtual const char* Name() const override { + return "SkipListFactory"; + } +}; + +// This class contains a fixed array of buckets, each +// pointing to a skiplist (null if the bucket is empty). +// bucket_count: number of fixed array buckets +// skiplist_height: the max height of the skiplist +// skiplist_branching_factor: probabilistic size ratio between adjacent +// link lists in the skiplist +extern MemTableRepFactory* NewHashSkipListRepFactory( + size_t bucket_count = 1000000, int32_t skiplist_height = 4, + int32_t skiplist_branching_factor = 4 +); + +// The factory is to create memtables with a hashed linked list: +// it contains a fixed array of buckets, each pointing to a sorted single +// linked list (null if the bucket is empty). +// bucket_count: number of fixed array buckets +extern MemTableRepFactory* NewHashLinkListRepFactory( + size_t bucket_count = 50000); + +} // namespace rocksdb diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h new file mode 100644 index 00000000..2ae64c1b --- /dev/null +++ b/include/rocksdb/merge_operator.h @@ -0,0 +1,182 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ + +#include +#include +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +class Slice; +class Logger; + +// The Merge Operator +// +// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only +// client knows. It could be numeric addition, list append, string +// concatenation, edit data structure, ... , anything. +// The library, on the other hand, is concerned with the exercise of this +// interface, at the right time (during get, iteration, compaction...) +// +// To use merge, the client needs to provide an object implementing one of +// the following interfaces: +// a) AssociativeMergeOperator - for most simple semantics (always take +// two values, and merge them into one value, which is then put back +// into rocksdb); numeric addition and string concatenation are examples; +// +// b) MergeOperator - the generic class for all the more abstract / complex +// operations; one method (FullMerge) to merge a Put/Delete value with a +// merge operand; and another method (PartialMerge) that merges multiple +// operands together. this is especially useful if your key values have +// complex structures but you would still like to support client-specific +// incremental updates. +// +// AssociativeMergeOperator is simpler to implement. MergeOperator is simply +// more powerful. +// +// Refer to rocksdb-merge wiki for more details and example implementations. +// +class MergeOperator { + public: + virtual ~MergeOperator() {} + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. 
+ // Client could multiplex the merge operator based on it + // if the key space is partitioned and different subspaces + // refer to different types of data which have different + // merge operation semantics + // existing: (IN) null indicates that the key does not exist before this op + // operand_list:(IN) the sequence of merge operations to apply, front() first. + // new_value:(OUT) Client is responsible for filling the merge result here + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. This will be treated as an error by the library. + // + // Also make use of the *logger for error messages. + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const = 0; + + // This function performs merge(left_op, right_op) + // when both the operands are themselves merge operation types + // that you would have passed to a DB::Merge() call in the same order + // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)). + // + // PartialMerge should combine them into a single merge operation that is + // saved into *new_value, and then it should return true. + // *new_value should be constructed such that a call to + // DB::Merge(key, *new_value) would yield the same result as a call + // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op). + // + // The default implementation of PartialMergeMulti will use this function + // as a helper, for backward compatibility. 
Any successor class of + // MergeOperator should either implement PartialMerge or PartialMergeMulti, + // although implementing PartialMergeMulti is suggested as it is in general + // more effective to merge multiple operands at a time instead of two + // operands at a time. + // + // If it is impossible or infeasible to combine the two operations, + // leave new_value unchanged and return false. The library will + // internally keep track of the operations, and apply them in the + // correct order once a base-value (a Put/Delete/End-of-Database) is seen. + // + // TODO: Presently there is no way to differentiate between error/corruption + // and simply "return false". For now, the client should simply return + // false in any case it cannot perform partial-merge, regardless of reason. + // If there is corruption in the data, handle it in the FullMerge() function, + // and return false there. The default implementation of PartialMerge will + // always return false. + virtual bool PartialMerge(const Slice& key, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* logger) const { + return false; + } + + // This function performs merge when all the operands are themselves merge + // operation types that you would have passed to a DB::Merge() call in the + // same order (front() first) + // (i.e. DB::Merge(key, operand_list[0]), followed by + // DB::Merge(key, operand_list[1]), ...) + // + // PartialMergeMulti should combine them into a single merge operation that is + // saved into *new_value, and then it should return true. *new_value should + // be constructed such that a call to DB::Merge(key, *new_value) would yield + // the same result as subquential individual calls to DB::Merge(key, operand) + // for each operand in operand_list from front() to back(). + // + // The PartialMergeMulti function will be called only when the list of + // operands are long enough. 
The minimum amount of operands that will be + // passed to the function are specified by the "min_partial_merge_operands" + // option. + // + // In the default implementation, PartialMergeMulti will invoke PartialMerge + // multiple times, where each time it only merges two operands. Developers + // should either implement PartialMergeMulti, or implement PartialMerge which + // is served as the helper function of the default PartialMergeMulti. + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const; + + // The name of the MergeOperator. Used to check for MergeOperator + // mismatches (i.e., a DB created with one MergeOperator is + // accessed using a different MergeOperator) + // TODO: the name is currently not stored persistently and thus + // no checking is enforced. Client is responsible for providing + // consistent MergeOperator between DB opens. + virtual const char* Name() const = 0; +}; + +// The simpler, associative merge operator. +class AssociativeMergeOperator : public MergeOperator { + public: + virtual ~AssociativeMergeOperator() {} + + // Gives the client a way to express the read -> modify -> write semantics + // key: (IN) The key that's associated with this merge operation. + // existing_value:(IN) null indicates the key does not exist before this op + // value: (IN) the value to update/merge the existing_value with + // new_value: (OUT) Client is responsible for filling the merge result here + // logger: (IN) Client could use this to log errors during merge. + // + // Return true on success. + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. The client should assume that this will be treated + // as an error by the library. 
+ virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const = 0; + + + private: + // Default implementations of the MergeOperator functions + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override; + + virtual bool PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const override; +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h new file mode 100644 index 00000000..54b4ef38 --- /dev/null +++ b/include/rocksdb/options.h @@ -0,0 +1,876 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ +#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ + +#include +#include +#include +#include +#include + +#include "rocksdb/universal_compaction.h" + +namespace rocksdb { + +class Cache; +class CompactionFilter; +class CompactionFilterFactory; +class CompactionFilterFactoryV2; +class Comparator; +class Env; +enum InfoLogLevel : unsigned char; +class FilterPolicy; +class Logger; +class MergeOperator; +class Snapshot; +class TableFactory; +class MemTableRepFactory; +class TablePropertiesCollector; +class Slice; +class SliceTransform; +class Statistics; +class InternalKeyComparator; + +using std::shared_ptr; + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +enum CompressionType : char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2, + kBZip2Compression = 0x3, kLZ4Compression = 0x4, kLZ4HCCompression = 0x5 +}; + +enum CompactionStyle : char { + kCompactionStyleLevel = 0x0, // level based compaction style + kCompactionStyleUniversal = 0x1 // Universal compaction style +}; + +// Compression options for different compression algorithms like Zlib +struct CompressionOptions { + int window_bits; + int level; + int strategy; + CompressionOptions() : window_bits(-14), level(-1), strategy(0) {} + CompressionOptions(int wbits, int lev, int strategy) + : window_bits(wbits), level(lev), strategy(strategy) {} +}; + +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. 
Merged value set +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // REQUIRES: The client must provide a merge operator if Merge operation + // needs to be accessed. Calling Merge on a DB without a merge operator + // would result in Status::NotSupported. The client must ensure that the + // merge operator supplied here has the same name and *exactly* the same + // semantics as the merge operator provided to previous open calls on + // the same DB. The only exception is reserved for upgrade, where a DB + // previously without a merge operator is introduced to Merge operation + // for the first time. It's necessary to specify a merge operator when + // openning the DB in this case. + // Default: nullptr + shared_ptr merge_operator; + + // A single CompactionFilter instance to call into during compaction. + // Allows an application to modify/delete a key-value during background + // compaction. + // + // If the client requires a new compaction filter to be used for different + // compaction runs, it can specify compaction_filter_factory instead of this + // option. The client should specify only one of the two. + // compaction_filter takes precedence over compaction_filter_factory if + // client specifies both. + // + // If multithreaded compaction is being used, the supplied CompactionFilter + // instance may be used from different threads concurrently and so should be + // thread-safe. 
+ // + // Default: nullptr + const CompactionFilter* compaction_filter; + + // This is a factory that provides compaction filter objects which allow + // an application to modify/delete a key-value during background compaction. + // + // A new filter will be created on each compaction run. If multithreaded + // compaction is being used, each created CompactionFilter will only be used + // from a single thread and so does not need to be thread-safe. + // + // Default: a factory that doesn't provide any object + std::shared_ptr compaction_filter_factory; + + // Version TWO of the compaction_filter_factory + // It supports rolling compaction + // + // Default: a factory that doesn't provide any object + std::shared_ptr compaction_filter_factory_v2; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // If any of the writes to the database fails (Put, Delete, Merge, Write), + // the database will switch to read-only mode and fail all other + // Write operations. + // Default: true + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-nullptr, or to a file stored + // in the same directory as the DB contents if info_log is nullptr. 
+ // Default: nullptr + shared_ptr info_log; + + InfoLogLevel info_log_level; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to max_write_buffer_number write buffers may be held in memory + // at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // The maximum number of write buffers that are built up in memory. + // The default is 2, so that when 1 write buffer is being flushed to + // storage, new writes can continue to the other write buffer. + // Default: 2 + int max_write_buffer_number; + + // The minimum number of write buffers that will be merged together + // before writing to storage. If set to 1, then + // all write buffers are fushed to L0 as individual files and this increases + // read amplification because a get request has to check in all of these + // files. Also, an in-memory merge may result in writing lesser + // data to storage if there are duplicate records in each of these + // individual write buffers. Default: 1 + int min_write_buffer_number_to_merge; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set. Value -1 means + // files opened are always kept open. You can estimate number of files based + // on target_file_size_base and target_file_size_multiplier for level-based + // compaction. For universal-style compaction, you can usually set it to -1. + // + // Default: 5000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). 
+ + // If non-NULL use the specified cache for blocks. + // If NULL, rocksdb will automatically create and use an 8MB internal cache. + // Default: nullptr + shared_ptr block_cache; + + // If non-NULL use the specified cache for compressed blocks. + // If NULL, rocksdb will not use a compressed block cache. + // Default: nullptr + shared_ptr block_cache_compressed; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // Different levels can have different compression policies. There + // are cases where most lower levels would like to quick compression + // algorithm while the higher levels (which have more data) use + // compression algorithms that have better compression but could + // be slower. This array, if non nullptr, should have an entry for + // each level of the database. 
This array, if non nullptr, overrides the + // value specified in the previous field 'compression'. The caller is + // responsible for allocating memory and initializing the values in it + // before invoking Open(). The caller is responsible for freeing this + // array and it could be freed anytime after the return from Open(). + // This could have been a std::vector but that makes the equivalent + // java/C api hard to construct. + std::vector compression_per_level; + + // different options for compression algorithms + CompressionOptions compression_opts; + + // If non-nullptr, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: nullptr + const FilterPolicy* filter_policy; + + // If non-nullptr, use the specified function to determine the + // prefixes for keys. These prefixes will be placed in the filter. + // Depending on the workload, this can reduce the number of read-IOP + // cost for scans when a prefix is passed via ReadOptions to + // db.NewIterator(). For prefix filtering to work properly, + // "prefix_extractor" and "comparator" must be such that the following + // properties hold: + // + // 1) key.starts_with(prefix(key)) + // 2) Compare(prefix(key), key) <= 0. + // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 + // 4) prefix(prefix(key)) == prefix(key) + // + // Default: nullptr + std::shared_ptr prefix_extractor; + + // If true, place whole keys in the filter (not just prefixes). + // This must generally be true for gets to be efficient. + // + // Default: true + bool whole_key_filtering; + + // Number of levels for this database + int num_levels; + + // Number of files to trigger level-0 compaction. A value <0 means that + // level-0 compaction will not be triggered by number of files at all. + int level0_file_num_compaction_trigger; + + // Soft limit on number of level-0 files. 
We start slowing down writes at this + // point. A value <0 means that no writing slow down will be triggered by + // number of files in level-0. + int level0_slowdown_writes_trigger; + + // Maximum number of level-0 files. We stop writes at this point. + int level0_stop_writes_trigger; + + // Maximum level to which a new compacted memtable is pushed if it + // does not create overlap. We try to push to level 2 to avoid the + // relatively expensive level 0=>1 compactions and to avoid some + // expensive manifest file operations. We do not push all the way to + // the largest level since that can generate a lot of wasted disk + // space if the same key space is being repeatedly overwritten. + int max_mem_compaction_level; + + // Target file size for compaction. + // target_file_size_base is per-file size for level-1. + // Target file size for level L can be calculated by + // target_file_size_base * (target_file_size_multiplier ^ (L-1)) + // For example, if target_file_size_base is 2MB and + // target_file_size_multiplier is 10, then each file on level-1 will + // be 2MB, and each file on level 2 will be 20MB, + // and each file on level-3 will be 200MB. + + // by default target_file_size_base is 2MB. + int target_file_size_base; + // by default target_file_size_multiplier is 1, which means + // by default files in different levels will have similar size. + int target_file_size_multiplier; + + // Control maximum total data size for a level. + // max_bytes_for_level_base is the max total for level-1. + // Maximum number of bytes for level L can be calculated as + // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) + // For example, if max_bytes_for_level_base is 20MB, and if + // max_bytes_for_level_multiplier is 10, total data size for level-1 + // will be 20MB, total file size for level-2 will be 200MB, + // and total file size for level-3 will be 2GB. + + // by default 'max_bytes_for_level_base' is 10MB. 
+ uint64_t max_bytes_for_level_base; + // by default 'max_bytes_for_level_multiplier' is 10. + int max_bytes_for_level_multiplier; + + // Different max-size multipliers for different levels. + // These are multiplied by max_bytes_for_level_multiplier to arrive + // at the max-size of each level. + // Default: 1 + std::vector max_bytes_for_level_multiplier_additional; + + // Maximum number of bytes in all compacted files. We avoid expanding + // the lower level file set of a compaction if it would make the + // total compaction cover more than + // (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + int expanded_compaction_factor; + + // Maximum number of bytes in all source files to be compacted in a + // single compaction run. We avoid picking too many files in the + // source level so that we do not exceed the total source bytes + // for compaction to exceed + // (source_compaction_factor * targetFileSizeLevel()) many bytes. + // Default:1, i.e. pick maxfilesize amount of data as the source of + // a compaction. + int source_compaction_factor; + + // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + // stop building a single file in a level->level+1 compaction. + int max_grandparent_overlap_factor; + + // If non-null, then we should collect metrics about database operations + // Statistics objects should not be shared between DB instances as + // it does not use any locks to prevent concurrent updates. + shared_ptr statistics; + + // If true, then the contents of data files are not synced + // to stable storage. Their contents remain in the OS buffers till the + // OS decides to flush them. This option is good for bulk-loading + // of data. Once the bulk-loading is complete, please issue a + // sync to the OS to flush all dirty buffers to stable storage. + // Default: false + bool disableDataSync; + + // If true, then every store to stable storage will issue a fsync. 
+ // If false, then every store to stable storage will issue a fdatasync. + // This parameter should be set to true while storing data to + // filesystem like ext3 that can lose files after a reboot. + // Default: false + bool use_fsync; + + // This number controls how often a new scribe log about + // db deploy stats is written out. + // -1 indicates no logging at all. + // Default value is 1800 (half an hour). + int db_stats_log_interval; + + // This specifies the info LOG dir. + // If it is empty, the log files will be in the same dir as data. + // If it is non empty, the log files will be in the specified dir, + // and the db data dir's absolute path will be used as the log file + // name's prefix. + std::string db_log_dir; + + // This specifies the absolute dir path for write-ahead logs (WAL). + // If it is empty, the log files will be in the same dir as data, + // dbname is used as the data dir by default + // If it is non empty, the log files will be in kept the specified dir. + // When destroying the db, + // all log files in wal_dir and the dir itself is deleted + std::string wal_dir; + + // Disable compaction triggered by seek. + // With bloomfilter and fast storage, a miss on one level + // is very cheap if the file handle is cached in table cache + // (which is true if max_open_files is large). + bool disable_seek_compaction; + + // The periodicity when obsolete files get deleted. The default + // value is 6 hours. The files that get out of scope by compaction + // process will still get automatically delete on every compaction, + // regardless of this setting + uint64_t delete_obsolete_files_period_micros; + + // Maximum number of concurrent background jobs, submitted to + // the default LOW priority thread pool + // Default: 1 + int max_background_compactions; + + // Maximum number of concurrent background memtable flush jobs, submitted to + // the HIGH priority thread pool. 
+ // By default, all background jobs (major compaction and memtable flush) go + // to the LOW priority pool. If this option is set to a positive number, + // memtable flush jobs will be submitted to the HIGH priority pool. + // It is important when the same Env is shared by multiple db instances. + // Without a separate pool, long running major compaction jobs could + // potentially block memtable flush jobs of other db instances, leading to + // unnecessary Put stalls. + // Default: 1 + int max_background_flushes; + + // Specify the maximal size of the info log file. If the log file + // is larger than `max_log_file_size`, a new info log file will + // be created. + // If max_log_file_size == 0, all logs will be written to one + // log file. + size_t max_log_file_size; + + // Time for the info log file to roll (in seconds). + // If specified with non-zero value, log file will be rolled + // if it has been active longer than `log_file_time_to_roll`. + // Default: 0 (disabled) + size_t log_file_time_to_roll; + + // Maximal info log files to be kept. + // Default: 1000 + size_t keep_log_file_num; + + // Puts are delayed 0-1 ms when any level has a compaction score that exceeds + // soft_rate_limit. This is ignored when == 0.0. + // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not + // hold, RocksDB will set soft_rate_limit = hard_rate_limit + // Default: 0 (disabled) + double soft_rate_limit; + + // Puts are delayed 1ms at a time when any level has a compaction score that + // exceeds hard_rate_limit. This is ignored when <= 1.0. + // Default: 0 (disabled) + double hard_rate_limit; + + // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then + // there is no limit. + // Default: 1000 + unsigned int rate_limit_delay_max_milliseconds; + + // manifest file is rolled over on reaching this limit. + // The older manifest file be deleted. + // The default value is MAX_INT so that roll-over does not take place. 
+ uint64_t max_manifest_file_size; + + // Disable block cache. If this is set to true, + // then no block cache should be used, and the block_cache should + // point to a nullptr object. + // Default: false + bool no_block_cache; + + // Number of shards used for table cache. + int table_cache_numshardbits; + + // During data eviction of table's LRU cache, it would be inefficient + // to strictly follow LRU because this piece of memory will not really + // be released unless its refcount falls to zero. Instead, make two + // passes: the first pass will release items with refcount = 1, + // and if not enough space releases after scanning the number of + // elements specified by this parameter, we will remove items in LRU + // order. + int table_cache_remove_scan_count_limit; + + // Size of one block in arena memory allocation. + // + // If <= 0, a proper value is automatically calculated (usually about 1/10 of + // writer_buffer_size). + // + // There are two additional restrictions of the specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be the multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // + // Default: 0 + size_t arena_block_size; + + // Create an Options object with default values for all fields. + Options(); + + void Dump(Logger* log) const; + + // Set appropriate parameters for bulk loading. + // The reason that this is a function that returns "this" instead of a + // constructor is to enable chaining of multiple similar calls in the future. + // + // All data will be in level 0 without any automatic compaction. + // It's recommended to manually call CompactRange(NULL, NULL) before reading + // from the database, because otherwise the read can be very slow. + Options* PrepareForBulkLoad(); + + // Disable automatic compactions. 
Manual compactions can still + // be issued on this database. + bool disable_auto_compactions; + + // The following two fields affect how archived logs will be deleted. + // 1. If both set to 0, logs will be deleted asap and will not get into + // the archive. + // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + // WAL files will be checked every 10 min and if total size is greater + // than WAL_size_limit_MB, they will be deleted starting with the + // earliest until size_limit is met. All empty files will be deleted. + // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + // WAL files will be checked every WAL_ttl_seconds / 2 and those that + // are older than WAL_ttl_seconds will be deleted. + // 4. If both are not 0, WAL files will be checked every 10 min and both + // checks will be performed with ttl being first. + uint64_t WAL_ttl_seconds; + uint64_t WAL_size_limit_MB; + + // Number of bytes to preallocate (via fallocate) the manifest + // files. Default is 4mb, which is reasonable to reduce random IO + // as well as prevent overallocation for mounts that preallocate + // large amounts of data (such as xfs's allocsize option). + size_t manifest_preallocation_size; + + // Purge duplicate/deleted keys when a memtable is flushed to storage. + // Default: true + bool purge_redundant_kvs_while_flush; + + // Data being read from file storage may be buffered in the OS + // Default: true + bool allow_os_buffer; + + // Allow the OS to mmap file for reading sst tables. Default: false + bool allow_mmap_reads; + + // Allow the OS to mmap file for writing. Default: false + bool allow_mmap_writes; + + // Disable child process inherit open files. 
Default: true + bool is_fd_close_on_exec; + + // Skip log corruption error on recovery (If client is ok with + // losing most recent changes) + // Default: false + bool skip_log_error_on_recovery; + + // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + // Default: 3600 (1 hour) + unsigned int stats_dump_period_sec; + + // This is used to close a block before it reaches the configured + // 'block_size'. If the percentage of free space in the current block is less + // than this specified number and adding a new record to the block will + // exceed the configured block size, then this block will be closed and the + // new record will be written to the next block. + // Default is 10. + int block_size_deviation; + + // If set true, will hint the underlying file system that the file + // access pattern is random, when a sst file is opened. + // Default: true + bool advise_random_on_open; + + // Specify the file access pattern once a compaction is started. + // It will be applied to all input files of a compaction. + // Default: NORMAL + enum { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + } access_hint_on_compaction_start; + + // Use adaptive mutex, which spins in the user space before resorting + // to kernel. This could reduce context switch when the mutex is not + // heavily contended. However, if the mutex is hot, we could end up + // wasting spin time. + // Default: false + bool use_adaptive_mutex; + + // Allows OS to incrementally sync files to disk while they are being + // written, asynchronously, in the background. + // Issue one request for every bytes_per_sync written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync; + + // The compaction style. 
Default: kCompactionStyleLevel + CompactionStyle compaction_style; + + // The options needed to support Universal Style compactions + CompactionOptionsUniversal compaction_options_universal; + + // If true, compaction will verify checksum on every read that happens + // as part of compaction + // Default: true + bool verify_checksums_in_compaction; + + // Use KeyMayExist API to filter deletes when this is true. + // If KeyMayExist returns false, i.e. the key definitely does not exist, then + // the delete is a noop. KeyMayExist only incurs in-memory look up. + // This optimization avoids writing the delete to storage when appropriate. + // Default: false + bool filter_deletes; + + // An iteration->Next() sequentially skips over keys with the same + // user-key unless this option is set. This number specifies the number + // of keys (with the same userkey) that will be sequentially + // skipped before a reseek is issued. + // Default: 8 + uint64_t max_sequential_skip_in_iterations; + + // This is a factory that provides MemTableRep objects. + // Default: a factory that provides a skip-list-based implementation of + // MemTableRep. + std::shared_ptr memtable_factory; + + // This is a factory that provides TableFactory objects. + // Default: a factory that provides a default implementation of + // Table and TableBuilder. + std::shared_ptr table_factory; + + // This option allows user to collect their own interested statistics of + // the tables. + // Default: empty vector -- no user-defined statistics collection will be + // performed. + typedef std::vector> + TablePropertiesCollectors; + TablePropertiesCollectors table_properties_collectors; + + // Allows thread-safe inplace updates. + // If inplace_callback function is not set, + // Put(key, new_value) will update inplace the existing_value iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(existing_value) + // * existing_value for that key is a put i.e. 
kTypeValue + // If inplace_callback function is set, check doc for inplace_callback. + // Default: false. + bool inplace_update_support; + + // Number of locks used for inplace update + // Default: 10000, if inplace_update_support = true, else 0. + size_t inplace_update_num_locks; + + // existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // existing_value_size - pointer to size of existing_value). + // nullptr if key doesn't exist + // delta_value - Delta value to be merged with the existing_value. + // Stored in transaction logs. + // merged_value - Set when delta is applied on the previous value. + + // Applicable only when inplace_update_support is true, + // this callback function is called at the time of updating the memtable + // as part of a Put operation, lets say Put(key, delta_value). It allows the + // 'delta_value' specified as part of the Put operation to be merged with + // an 'existing_value' of the key in the database. + + // If the merged value is smaller in size that the 'existing_value', + // then this function can update the 'existing_value' buffer inplace and + // the corresponding 'existing_value'_size pointer, if it wishes to. + // The callback should return UpdateStatus::UPDATED_INPLACE. + // In this case. (In this case, the snapshot-semantics of the rocksdb + // Iterator is not atomic anymore). + + // If the merged value is larger in size than the 'existing_value' or the + // application does not wish to modify the 'existing_value' buffer inplace, + // then the merged value should be returned via *merge_value. It is set by + // merging the 'existing_value' and the Put 'delta_value'. The callback should + // return UpdateStatus::UPDATED in this case. This merged value will be added + // to the memtable. + + // If merging fails or the application does not wish to take any action, + // then the callback should return UpdateStatus::UPDATE_FAILED. 
+ + // Please remember that the original call from the application is Put(key, + // delta_value). So the transaction log (if enabled) will still contain (key, + // delta_value). The 'merged_value' is not stored in the transaction log. + // Hence the inplace_callback function should be consistent across db reopens. + + // Default: nullptr + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + + // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom + // for memtable + uint32_t memtable_prefix_bloom_bits; + + // number of hash probes per key + uint32_t memtable_prefix_bloom_probes; + + // Control locality of bloom filter probes to improve cache miss rate. + // This option only applies to memtable prefix bloom and plaintable + // prefix bloom. It essentially limits the max number of cache lines each + // bloom filter check can touch. + // This optimization is turned off when set to 0. The number should never + // be greater than number of probes. This option can boost performance + // for in-memory workload but should use with care since it can cause + // higher false positive rate. + // Default: 0 + uint32_t bloom_locality; + + // Maximum number of successive merge operations on a key in the memtable. + // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation. This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. + // + // Default: 0 (disabled) + size_t max_successive_merges; + + // The number of partial merge operands to accumulate before partial + // merge will be performed. Partial merge will not be called + // if the list of values to merge is less than min_partial_merge_operands. 
+ // + // If min_partial_merge_operands < 2, then it will be treated as 2. + // + // Default: 2 + uint32_t min_partial_merge_operands; + + // Allow RocksDB to use thread local storage to optimize performance. + // Default: true + bool allow_thread_local; +}; + +// +// An application can issue a read request (via Get/Iterators) and specify +// if that read should process data that ALREADY resides on a specified cache +// level. For example, if an application specifies kBlockCacheTier then the +// Get call will process data that is already processed in the memtable or +// the block cache. It will not page in data from the OS cache or data that +// resides in storage. +enum ReadTier { + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kBlockCacheTier = 0x1 // data in memtable or block cache +}; + +// Options that control read operations +struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums; + + // Should the "data block"/"index block"/"filter block" read for this + // iteration be cached in memory? + // Callers may wish to set this field to false for bulk scans. + // Default: true + bool fill_cache; + + // If this option is set and memtable implementation allows, Seek + // might only return keys with the same prefix as the seek-key + bool prefix_seek; + + // If "snapshot" is non-nullptr, read as of the supplied snapshot + // (which must belong to the DB that is being read and which must + // not have been released). If "snapshot" is nullptr, use an impliicit + // snapshot of the state at the beginning of this read operation. + // Default: nullptr + const Snapshot* snapshot; + + // If "prefix" is non-nullptr, and ReadOptions is being passed to + // db.NewIterator, only return results when the key begins with this + // prefix. This field is ignored by other calls (e.g., Get). 
+ // Options.prefix_extractor must also be set, and + // prefix_extractor.InRange(prefix) must be true. The iterator + // returned by NewIterator when this option is set will behave just + // as if the underlying store did not contain any non-matching keys, + // with two exceptions. Seek() only accepts keys starting with the + // prefix, and SeekToLast() is not supported. prefix filter with this + // option will sometimes reduce the number of read IOPs. + // Default: nullptr + const Slice* prefix; + + // Specify if this read request should process data that ALREADY + // resides on a particular cache. If the required data is not + // found at the specified cache, then Status::Incomplete is returned. + // Default: kReadAllTier + ReadTier read_tier; + + // Specify to create a tailing iterator -- a special iterator that has a + // view of the complete database (i.e. it can also be used to read newly + // added data) and is optimized for sequential reads. It will return records + // that were inserted into the database after the creation of the iterator. + // Default: false + bool tailing; + + ReadOptions() + : verify_checksums(true), + fill_cache(true), + prefix_seek(false), + snapshot(nullptr), + prefix(nullptr), + read_tier(kReadAllTier), + tailing(false) {} + ReadOptions(bool cksum, bool cache) + : verify_checksums(cksum), + fill_cache(cache), + prefix_seek(false), + snapshot(nullptr), + prefix(nullptr), + read_tier(kReadAllTier), + tailing(false) {} +}; + +// Options that control write operations +struct WriteOptions { + // If true, the write will be flushed from the operating system + // buffer cache (by calling WritableFile::Sync()) before the write + // is considered complete. If this flag is true, writes will be + // slower. + // + // If this flag is false, and the machine crashes, some recent + // writes may be lost. Note that if it is just the process that + // crashes (i.e., the machine does not reboot), no writes will be + // lost even if sync==false. 
+ // + // In other words, a DB write with sync==false has similar + // crash semantics as the "write()" system call. A DB write + // with sync==true has similar crash semantics to a "write()" + // system call followed by "fdatasync()". + // + // Default: false + bool sync; + + // If true, writes will not first go to the write ahead log, + // and the write may got lost after a crash. + bool disableWAL; + + WriteOptions() : sync(false), disableWAL(false) {} +}; + +// Options that control flush operations +struct FlushOptions { + // If true, the flush will wait until the flush is done. + // Default: true + bool wait; + + FlushOptions() : wait(true) {} +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h new file mode 100644 index 00000000..63eddb61 --- /dev/null +++ b/include/rocksdb/perf_context.h @@ -0,0 +1,75 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H +#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H + +#include +#include + +namespace rocksdb { + +enum PerfLevel { + kDisable = 0, // disable perf stats + kEnableCount = 1, // enable only count stats + kEnableTime = 2 // enable time stats too +}; + +// set the perf stats level +void SetPerfLevel(PerfLevel level); + +// A thread local context for gathering performance counter efficiently +// and transparently. 
+ +struct PerfContext { + + void Reset(); // reset all performance counters to zero + + std::string ToString() const; + + uint64_t user_key_comparison_count; // total number of user key comparisons + uint64_t block_cache_hit_count; // total number of block cache hits + uint64_t block_read_count; // total number of block reads (with IO) + uint64_t block_read_byte; // total number of bytes from block reads + uint64_t block_read_time; // total time spent on block reads + uint64_t block_checksum_time; // total time spent on block checksum + uint64_t block_decompress_time; // total time spent on block decompression + // total number of internal keys skipped over during iteration (overwritten or + // deleted, to be more specific, hidden by a put or delete of the same key) + uint64_t internal_key_skipped_count; + // total number of deletes skipped over during iteration + uint64_t internal_delete_skipped_count; + + uint64_t get_snapshot_time; // total time spent on getting snapshot + uint64_t get_from_memtable_time; // total time spent on querying memtables + uint64_t get_from_memtable_count; // number of mem tables queried + // total time spent after Get() finds a key + uint64_t get_post_process_time; + uint64_t get_from_output_files_time; // total time reading from output files + // total time spent on seeking child iters + uint64_t seek_child_seek_time; + // number of seek issued in child iterators + uint64_t seek_child_seek_count; + uint64_t seek_min_heap_time; // total time spent on the merge heap + // total time spent on seeking the internal entries + uint64_t seek_internal_seek_time; + // total time spent on iterating internal entries to find the next user entry + uint64_t find_next_user_entry_time; + // total time spent on pre or post processing when writing a record + uint64_t write_pre_and_post_process_time; + uint64_t write_wal_time; // total time spent on writing to WAL + // total time spent on writing to mem tables + uint64_t write_memtable_time; +}; + +#if 
defined(IOS_CROSS_COMPILE) +extern PerfContext perf_context; +#else +extern __thread PerfContext perf_context; +#endif + +} + +#endif diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h new file mode 100644 index 00000000..e6cca213 --- /dev/null +++ b/include/rocksdb/slice.h @@ -0,0 +1,136 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. +// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_ +#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_ + +#include +#include +#include +#include + +namespace rocksdb { + +class Slice { + public: + // Create an empty slice. + Slice() : data_(""), size_(0) { } + + // Create a slice that refers to d[0,n-1]. 
+ Slice(const char* d, size_t n) : data_(d), size_(n) { } + + // Create a slice that refers to the contents of "s" + /* implicit */ + Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } + + // Create a slice that refers to s[0,strlen(s)-1] + /* implicit */ + Slice(const char* s) : data_(s), size_(strlen(s)) { } + + // Return a pointer to the beginning of the referenced data + const char* data() const { return data_; } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + char operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { data_ = ""; size_ = 0; } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + // Return a string that contains the copy of the referenced data. + std::string ToString(bool hex = false) const { + if (hex) { + std::string result; + char buf[10]; + for (size_t i = 0; i < size_; i++) { + snprintf(buf, 10, "%02X", (unsigned char)data_[i]); + result += buf; + } + return result; + } else { + return std::string(data_, size_); + } + } + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && + (memcmp(data_, x.data_, x.size_) == 0)); + } + + // private: make these public for rocksdbjni access + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +// A set of Slices that are virtually concatenated together. 'parts' points +// to an array of Slices. 
The number of elements in the array is 'num_parts'. +struct SliceParts { + SliceParts(const Slice* parts, int num_parts) : + parts(parts), num_parts(num_parts) { } + + const Slice* parts; + int num_parts; +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (memcmp(x.data(), y.data(), x.size()) == 0)); +} + +inline bool operator!=(const Slice& x, const Slice& y) { + return !(x == y); +} + +inline int Slice::compare(const Slice& b) const { + const int min_len = (size_ < b.size_) ? size_ : b.size_; + int r = memcmp(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) r = -1; + else if (size_ > b.size_) r = +1; + } + return r; +} + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_H_ diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h new file mode 100644 index 00000000..a7845500 --- /dev/null +++ b/include/rocksdb/slice_transform.h @@ -0,0 +1,47 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Class for specifying user-defined functions which perform a +// transformation on a slice. It is not required that every slice +// belong to the domain and/or range of a function. Subclasses should +// define InDomain and InRange to determine which slices are in either +// of these sets respectively. 
+ +#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ +#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ + +#include + +namespace rocksdb { + +class Slice; + +class SliceTransform { + public: + virtual ~SliceTransform() {}; + + // Return the name of this transformation. + virtual const char* Name() const = 0; + + // transform a src in domain to a dst in the range + virtual Slice Transform(const Slice& src) const = 0; + + // determine whether this is a valid src upon the function applies + virtual bool InDomain(const Slice& src) const = 0; + + // determine whether dst=Transform(src) for some src + virtual bool InRange(const Slice& dst) const = 0; +}; + +extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); + +extern const SliceTransform* NewNoopTransform(); + +} + +#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h new file mode 100644 index 00000000..dcd82f66 --- /dev/null +++ b/include/rocksdb/statistics.h @@ -0,0 +1,268 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ +#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +/** + * Keep adding ticker's here. + * 1. Any ticker should be added before TICKER_ENUM_MAX. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. 
+ */ +enum Tickers { + // total block cache misses + // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + + // BLOCK_CACHE_FILTER_MISS + + // BLOCK_CACHE_DATA_MISS; + BLOCK_CACHE_MISS, + // total block cache hit + // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + + // BLOCK_CACHE_FILTER_HIT + + // BLOCK_CACHE_DATA_HIT; + BLOCK_CACHE_HIT, + // # of blocks added to block cache. + BLOCK_CACHE_ADD, + // # of times cache miss when accessing index block from block cache. + BLOCK_CACHE_INDEX_MISS, + // # of times cache hit when accessing index block from block cache. + BLOCK_CACHE_INDEX_HIT, + // # of times cache miss when accessing filter block from block cache. + BLOCK_CACHE_FILTER_MISS, + // # of times cache hit when accessing filter block from block cache. + BLOCK_CACHE_FILTER_HIT, + // # of times cache miss when accessing data block from block cache. + BLOCK_CACHE_DATA_MISS, + // # of times cache hit when accessing data block from block cache. + BLOCK_CACHE_DATA_HIT, + // # of times bloom filter has avoided file reads. + BLOOM_FILTER_USEFUL, + + // # of memtable hits. + MEMTABLE_HIT, + // # of memtable misses. + MEMTABLE_MISS, + + /** + * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction + * There are 3 reasons currently. + */ + COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. + COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. + COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. + + // Number of keys written to the database via the Put and Write call's + NUMBER_KEYS_WRITTEN, + // Number of Keys read, + NUMBER_KEYS_READ, + // Number keys updated, if inplace update is enabled + NUMBER_KEYS_UPDATED, + // Bytes written / read + BYTES_WRITTEN, + BYTES_READ, + NO_FILE_CLOSES, + NO_FILE_OPENS, + NO_FILE_ERRORS, + // Time system had to wait to do LO-L1 compactions + STALL_L0_SLOWDOWN_MICROS, + // Time system had to wait to move memtable to L1. 
+ STALL_MEMTABLE_COMPACTION_MICROS, + // write throttle because of too many files in L0 + STALL_L0_NUM_FILES_MICROS, + RATE_LIMIT_DELAY_MILLIS, + NO_ITERATORS, // number of iterators currently open + + // Number of MultiGet calls, keys read, and bytes read + NUMBER_MULTIGET_CALLS, + NUMBER_MULTIGET_KEYS_READ, + NUMBER_MULTIGET_BYTES_READ, + + // Number of deletes records that were not required to be + // written to storage because key does not exist + NUMBER_FILTERED_DELETES, + NUMBER_MERGE_FAILURES, + SEQUENCE_NUMBER, + + // number of times bloom was checked before creating iterator on a + // file, and the number of times the check was useful in avoiding + // iterator creation (and thus likely IOPs). + BLOOM_FILTER_PREFIX_CHECKED, + BLOOM_FILTER_PREFIX_USEFUL, + + // Number of times we had to reseek inside an iteration to skip + // over large number of keys with same userkey. + NUMBER_OF_RESEEKS_IN_ITERATION, + + // Record the number of calls to GetUpadtesSince. Useful to keep track of + // transaction log iterator refreshes + GET_UPDATES_SINCE_CALLS, + BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache + BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue. + WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, + WRITE_WITH_WAL, // Number of Write calls that request WAL + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + + // Number of table's properties loaded directly from file, without creating + // table reader object. 
+ NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + NUMBER_SUPERVERSION_ACQUIRES, + NUMBER_SUPERVERSION_RELEASES, + NUMBER_SUPERVERSION_CLEANUPS, + TICKER_ENUM_MAX +}; + +// The order of items listed in Tickers should be the same as +// the order listed in TickersNameMap +const std::vector> TickersNameMap = { + {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, + {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, + {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, + {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, + {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, + {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, + {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, + {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, + {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, + {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, + {MEMTABLE_HIT, "rocksdb.memtable.hit"}, + {MEMTABLE_MISS, "rocksdb.memtable.miss"}, + {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, + {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, + {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, + {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, + {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, + {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, + {BYTES_WRITTEN, "rocksdb.bytes.written"}, + {BYTES_READ, "rocksdb.bytes.read"}, + {NO_FILE_CLOSES, "rocksdb.no.file.closes"}, + {NO_FILE_OPENS, "rocksdb.no.file.opens"}, + {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, + {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, + {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, + {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, + {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, + {NO_ITERATORS, "rocksdb.num.iterators"}, + {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, + {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, + 
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, + {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"}, + {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, + {SEQUENCE_NUMBER, "rocksdb.sequence.number"}, + {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, + {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, + {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, + {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, + {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, + {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, + {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, + {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, + {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, + {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, + {WRITE_WITH_WAL, "rocksdb.write.wal"}, + {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, + {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, + {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + "rocksdb.number.direct.load.table.properties"}, + {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, + {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, + {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, +}; + +/** + * Keep adding histogram's here. 
+ * Any histogram whould have value less than HISTOGRAM_ENUM_MAX + * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX + * Add a string representation in HistogramsNameMap below + * And increment HISTOGRAM_ENUM_MAX + */ +enum Histograms { + DB_GET, + DB_WRITE, + COMPACTION_TIME, + TABLE_SYNC_MICROS, + COMPACTION_OUTFILE_SYNC_MICROS, + WAL_FILE_SYNC_MICROS, + MANIFEST_FILE_SYNC_MICROS, + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS, + DB_MULTIGET, + READ_BLOCK_COMPACTION_MICROS, + READ_BLOCK_GET_MICROS, + WRITE_RAW_BLOCK_MICROS, + + STALL_L0_SLOWDOWN_COUNT, + STALL_MEMTABLE_COMPACTION_COUNT, + STALL_L0_NUM_FILES_COUNT, + HARD_RATE_LIMIT_DELAY_COUNT, + SOFT_RATE_LIMIT_DELAY_COUNT, + NUM_FILES_IN_SINGLE_COMPACTION, + HISTOGRAM_ENUM_MAX, +}; + +const std::vector> HistogramsNameMap = { + { DB_GET, "rocksdb.db.get.micros" }, + { DB_WRITE, "rocksdb.db.write.micros" }, + { COMPACTION_TIME, "rocksdb.compaction.times.micros" }, + { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" }, + { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" }, + { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" }, + { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" }, + { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" }, + { DB_MULTIGET, "rocksdb.db.multiget.micros" }, + { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" }, + { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" }, + { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" }, + { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"}, + { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"}, + { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"}, + { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"}, + { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"}, + { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" }, +}; + +struct HistogramData 
{ + double median; + double percentile95; + double percentile99; + double average; + double standard_deviation; +}; + +// Analyze the performance of a db +class Statistics { + public: + virtual ~Statistics() {} + + virtual long getTickerCount(Tickers tickerType) = 0; + virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0; + virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0; + virtual void measureTime(Histograms histogramType, uint64_t time) = 0; + + virtual void histogramData(Histograms type, HistogramData* const data) = 0; + // String representation of the statistic object. + std::string ToString(); +}; + +// Create a concrete DBStatistics object +std::shared_ptr CreateDBStatistics(); + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h new file mode 100644 index 00000000..dbd41fc9 --- /dev/null +++ b/include/rocksdb/status.h @@ -0,0 +1,145 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. 
+ +#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_ +#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_ + +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +class Status { + public: + // Create a success status. + Status() : code_(kOk), state_(nullptr) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. + static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotFound, msg, msg2); + } + // Fast path for not found without malloc; + static Status NotFound() { + return Status(kNotFound); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kCorruption, msg, msg2); + } + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kNotSupported, msg, msg2); + } + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, msg, msg2); + } + static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, msg, msg2); + } + static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kMergeInProgress, msg, msg2); + } + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIncomplete, msg, msg2); + } + static Status ShutdownInProgress(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kShutdownInProgress, msg, msg2); + } + + // Returns true iff the status indicates success. + bool ok() const { return code() == kOk; } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Returns true iff the status indicates a Corruption error. 
+ bool IsCorruption() const { return code() == kCorruption; } + + // Returns true iff the status indicates a NotSupported error. + bool IsNotSupported() const { return code() == kNotSupported; } + + // Returns true iff the status indicates an InvalidArgument error. + bool IsInvalidArgument() const { return code() == kInvalidArgument; } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { return code() == kIOError; } + + // Returns true iff the status indicates an MergeInProgress. + bool IsMergeInProgress() const { return code() == kMergeInProgress; } + + // Returns true iff the status indicates Incomplete + bool IsIncomplete() const { return code() == kIncomplete; } + + // Returns true iff the status indicates Incomplete + bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + private: + enum Code { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + kMergeInProgress = 6, + kIncomplete = 7, + kShutdownInProgress = 8 + }; + + // A nullptr state_ (which is always the case for OK) means the message + // is empty. + // of the following form: + // state_[0..3] == length of message + // state_[4..] == message + Code code_; + const char* state_; + + Code code() const { + return code_; + } + explicit Status(Code code) : code_(code), state_(nullptr) { } + Status(Code code, const Slice& msg, const Slice& msg2); + static const char* CopyState(const char* s); +}; + +inline Status::Status(const Status& s) { + code_ = s.code_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); +} +inline void Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. 
+ code_ = s.code_; + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); + } +} + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_ diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h new file mode 100644 index 00000000..1016bcf1 --- /dev/null +++ b/include/rocksdb/table.h @@ -0,0 +1,181 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Currently we support two types of tables: plain table and block-based table. +// 1. Block-based table: this is the default table type that we inherited from +// LevelDB, which was designed for storing data in hard disk or flash +// device. +// 2. Plain table: it is one of RocksDB's SST file format optimized +// for low query latency on pure-memory or really low-latency media. +// +// A tutorial of rocksdb table formats is available here: +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats +// +// Example code is also available +// https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples + +#pragma once +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +// -- Block-based Table +class FlushBlockPolicyFactory; +class RandomAccessFile; +class TableBuilder; +class TableReader; +class WritableFile; +struct EnvOptions; +struct Options; + +using std::unique_ptr; + +// For advanced user only +struct BlockBasedTableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. 
If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr flush_block_policy_factory; + + // TODO(kailiu) Temporarily disable this feature by making the default value + // to be false. + // + // Indicating if we'd put index/filter blocks to the block cache. + // If not specified, each "table reader" object will pre-load index/filter + // block during table initialization. + bool cache_index_and_filter_blocks = false; + + // The index type that will be used for this table. + enum IndexType : char { + // A space efficient index block that is optimized for + // binary-search-based index. + kBinarySearch, + + // The hash index, if enabled, will do the hash lookup when + // `ReadOption.prefix_seek == true`. User should also specify + // `Options.prefix_extractor` to allow the index block to correctly + // extract the prefix of the given key and perform hash table lookup. + kHashSearch, + }; + + IndexType index_type = kBinarySearch; +}; + +// Table Properties that are specific to block-based table properties. +struct BlockBasedTablePropertyNames { + // value of this propertis is a fixed int32 number. + static const std::string kIndexType; +}; + +// Create default block based table factory. +extern TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + +// -- Plain Table with prefix-only seek +// For this factory, you need to set Options.prefix_extrator properly to make it +// work. Look-up will starts with prefix hash lookup for key prefix. Inside the +// hash bucket found, a binary search is executed for hash conflicts. Finally, +// a linear search is used. +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. 
+// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You +// may disable it by passing a zero. +// @hash_table_ratio: the desired utilization of the hash table used for prefix +// hashing. hash_table_ratio = number of prefixes / #buckets +// in the hash table +// @index_sparseness: inside each prefix, need to build one index record for how +// many keys for binary search inside each hash bucket. +const uint32_t kPlainTableVariableLength = 0; +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len = + kPlainTableVariableLength, + int bloom_bits_per_prefix = 10, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16); + +// -- Plain Table +// This factory of plain table ignores Options.prefix_extractor and assumes no +// hashable prefix available to the key structure. Lookup will be based on +// binary search index only. Total order seek() can be issued. +// @user_key_len: plain table has optimization for fix-sized keys, which can be +// specified via user_key_len. Alternatively, you can pass +// `kPlainTableVariableLength` if your keys have variable +// lengths. +// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may +// disable it by passing a zero. +// @index_sparseness: need to build one index record for how many keys for +// binary search. +extern TableFactory* NewTotalOrderPlainTableFactory( + uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 0, size_t index_sparseness = 16); + +// A base class for table factories. +class TableFactory { + public: + virtual ~TableFactory() {} + + // The type of the table. + // + // The client of this package should switch to a new name whenever + // the table format implementation changes. + // + // Names starting with "rocksdb." are reserved and should not be used + // by any clients of this package. 
+ virtual const char* Name() const = 0; + + // Returns a Table object table that can fetch data from file specified + // in parameter file. It's the caller's responsibility to make sure + // file is in the correct format. + // + // NewTableReader() is called in two places: + // (1) TableCache::FindTable() calls the function when table cache miss + // and cache the table object returned. + // (1) SstFileReader (for SST Dump) opens the table and dump the table + // contents using the interator of the table. + // options and soptions are options. options is the general options. + // Multiple configured can be accessed from there, including and not + // limited to block cache and key comparators. + // file is a file handler to handle the file for the table + // file_size is the physical file size of the file + // table_reader is the output table reader + virtual Status NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const = 0; + + // Return a table builder to write to a file for this table type. + // + // It is called in several places: + // (1) When flushing memtable to a level-0 output file, it creates a table + // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) + // (2) During compaction, it gets the builder for writing compaction output + // files in DBImpl::OpenCompactionOutputFile(). + // (3) When recovering from transaction logs, it creates a table builder to + // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, + // by calling BuildTable()) + // (4) When running Repairer, it creates a table builder to convert logs to + // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) + // + // options is the general options. Multiple configured can be acceseed from + // there, including and not limited to compression options. + // file is a handle of a writable file. 
It is the caller's responsibility to + // keep the file open and close the file after closing the table builder. + // compression_type is the compression type to use in this table. + virtual TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const = 0; +}; + +} // namespace rocksdb diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h new file mode 100644 index 00000000..55b83f44 --- /dev/null +++ b/include/rocksdb/table_properties.h @@ -0,0 +1,112 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +// -- Table Properties +// Other than basic table properties, each table may also have the user +// collected properties. +// The value of the user-collected properties are encoded as raw bytes -- +// users have to interprete these values by themselves. +// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do +// something similar to: +// +// UserCollectedProperties props = ...; +// for (auto pos = props.lower_bound(prefix); +// pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0; +// ++pos) { +// ... +// } +typedef std::map UserCollectedProperties; + +// TableProperties contains a bunch of read-only properties of its associated +// table. +struct TableProperties { + public: + // the total size of all data blocks. + uint64_t data_size = 0; + // the size of index block. + uint64_t index_size = 0; + // the size of filter block. 
+ uint64_t filter_size = 0; + // total raw key size + uint64_t raw_key_size = 0; + // total raw value size + uint64_t raw_value_size = 0; + // the number of blocks in this table + uint64_t num_data_blocks = 0; + // the number of entries in this table + uint64_t num_entries = 0; + // format version, reserved for backward compatibility + uint64_t format_version = 0; + // If 0, key is variable length. Otherwise number of bytes for each key. + uint64_t fixed_key_len = 0; + + // The name of the filter policy used in this table. + // If no filter policy is used, `filter_policy_name` will be an empty string. + std::string filter_policy_name; + + // user collected properties + UserCollectedProperties user_collected_properties; + + // convert this object to a human readable form + // @prop_delim: delimiter for each property. + std::string ToString(const std::string& prop_delim = "; ", + const std::string& kv_delim = "=") const; +}; + +// table properties' human-readable names in the property block. +struct TablePropertiesNames { + static const std::string kDataSize; + static const std::string kIndexSize; + static const std::string kFilterSize; + static const std::string kRawKeySize; + static const std::string kRawValueSize; + static const std::string kNumDataBlocks; + static const std::string kNumEntries; + static const std::string kFormatVersion; + static const std::string kFixedKeyLen; + static const std::string kFilterPolicy; +}; + +extern const std::string kPropertiesBlock; + +// `TablePropertiesCollector` provides the mechanism for users to collect +// their own interested properties. This class is essentially a collection +// of callback functions that will be invoked during table building. +class TablePropertiesCollector { + public: + virtual ~TablePropertiesCollector() {} + + // Add() will be called when a new key/value pair is inserted into the table. + // @params key the original key that is inserted into the table. 
+ // @params value the original value that is inserted into the table. + virtual Status Add(const Slice& key, const Slice& value) = 0; + + // Finish() will be called when a table has already been built and is ready + // for writing the properties block. + // @params properties User will add their collected statistics to + // `properties`. + virtual Status Finish(UserCollectedProperties* properties) = 0; + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const = 0; + + // Return the human-readable properties, where the key is property name and + // the value is the human-readable form of value. + virtual UserCollectedProperties GetReadableProperties() const = 0; +}; + +// Extra properties +// Below is a list of non-basic properties that are collected by database +// itself. Especially some properties regarding to the internal keys (which +// is unknown to `table`). +extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); + +} // namespace rocksdb diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h new file mode 100644 index 00000000..30443bba --- /dev/null +++ b/include/rocksdb/transaction_log.h @@ -0,0 +1,104 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ +#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ + +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" +#include +#include + +namespace rocksdb { + +class LogFile; +typedef std::vector> VectorLogPtr; + +enum WalFileType { + /* Indicates that WAL file is in archive directory. 
WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile = 0, + + /* Indicates that WAL file is live and resides in the main db directory */ + kAliveLogFile = 1 +} ; + +class LogFile { + public: + LogFile() {} + virtual ~LogFile() {} + + // Returns log file's pathname relative to the main db dir + // Eg. For a live-log-file = /000003.log + // For an archived-log-file = /archive/000003.log + virtual std::string PathName() const = 0; + + + // Primary identifier for log file. + // This is directly proportional to creation time of the log file + virtual uint64_t LogNumber() const = 0; + + // Log file can be either alive or archived + virtual WalFileType Type() const = 0; + + // Starting sequence number of writebatch written in this log file + virtual SequenceNumber StartSequence() const = 0; + + // Size of log file on disk in Bytes + virtual uint64_t SizeFileBytes() const = 0; +}; + +struct BatchResult { + SequenceNumber sequence = 0; + std::unique_ptr writeBatchPtr; +}; + +// A TransactionLogIterator is used to iterate over the transactions in a db. +// One run of the iterator is continuous, i.e. the iterator will stop at the +// beginning of any gap in sequences +class TransactionLogIterator { + public: + TransactionLogIterator() {} + virtual ~TransactionLogIterator() {} + + // An iterator is either positioned at a WriteBatch or not valid. + // This method returns true if the iterator is valid. + // Can read data from a valid iterator. + virtual bool Valid() = 0; + + // Moves the iterator to the next WriteBatch. + // REQUIRES: Valid() to be true. + virtual void Next() = 0; + + // Returns ok if the iterator is valid. + // Returns the Error when something has gone wrong. 
+ virtual Status status() = 0; + + // If valid return's the current write_batch and the sequence number of the + // earliest transaction contained in the batch. + // ONLY use if Valid() is true and status() is OK. + virtual BatchResult GetBatch() = 0; + + // The read options for TransactionLogIterator. + struct ReadOptions { + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + // Default: true + bool verify_checksums_; + + ReadOptions() : verify_checksums_(true) {} + + explicit ReadOptions(bool verify_checksums) + : verify_checksums_(verify_checksums) {} + }; +}; +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h new file mode 100644 index 00000000..f20bf827 --- /dev/null +++ b/include/rocksdb/types.h @@ -0,0 +1,20 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_ +#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_ + +#include + +namespace rocksdb { + +// Define all public custom types here. + +// Represents a sequence number in a WAL file. +typedef uint64_t SequenceNumber; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_ diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h new file mode 100644 index 00000000..eaf47e5c --- /dev/null +++ b/include/rocksdb/universal_compaction.h @@ -0,0 +1,83 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H +#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H + +#include +#include + +namespace rocksdb { + +// +// Algorithm used to make a compaction request stop picking new files +// into a single compaction run +// +enum CompactionStopStyle { + kCompactionStopStyleSimilarSize, // pick files of similar size + kCompactionStopStyleTotalSize // total size of picked files > next file +}; + +class CompactionOptionsUniversal { + public: + + // Percentage flexibilty while comparing file size. If the candidate file(s) + // size is 1% smaller than the next file's size, then include next file into + // this candidate set. // Default: 1 + unsigned int size_ratio; + + // The minimum number of files in a single compaction run. Default: 2 + unsigned int min_merge_width; + + // The maximum number of files in a single compaction run. Default: UINT_MAX + unsigned int max_merge_width; + + // The size amplification is defined as the amount (in percentage) of + // additional storage needed to store a single byte of data in the database. + // For example, a size amplification of 2% means that a database that + // contains 100 bytes of user-data may occupy upto 102 bytes of + // physical storage. By this definition, a fully compacted database has + // a size amplification of 0%. Rocksdb uses the following heuristic + // to calculate size amplification: it assumes that all files excluding + // the earliest file contribute to the size amplification. + // Default: 200, which means that a 100 byte database could require upto + // 300 bytes of storage. + unsigned int max_size_amplification_percent; + + // If this option is set to be -1 (the default value), all the output files + // will follow compression type specified. + // + // If this option is not negative, we will try to make sure compressed + // size is just above this value. 
In normal cases, at least this percentage + // of data will be compressed. + // When we are compacting to a new file, here is the criteria whether + // it needs to be compressed: assuming here are the list of files sorted + // by generation time: + // A1...An B1...Bm C1...Ct + // where A1 is the newest and Ct is the oldest, and we are going to compact + // B1...Bm, we calculate the total size of all the files as total_size, as + // well as the total size of C1...Ct as total_C, the compaction output file + // will be compressed iff + // total_C / total_size < this percentage + int compression_size_percent; + + // The algorithm used to stop picking files into a single compaction run + // Default: kCompactionStopStyleTotalSize + CompactionStopStyle stop_style; + + // Default set of parameters + CompactionOptionsUniversal() : + size_ratio(1), + min_merge_width(2), + max_merge_width(UINT_MAX), + max_size_amplification_percent(200), + compression_size_percent(-1), + stop_style(kCompactionStopStyleTotalSize) { + } +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h new file mode 100644 index 00000000..2cfb731f --- /dev/null +++ b/include/rocksdb/write_batch.h @@ -0,0 +1,112 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBatch holds a collection of updates to apply atomically to a DB. +// +// The updates are applied in the order in which they are added +// to the WriteBatch. 
For example, the value of "key" will be "v3" +// after the following batch is written: +// +// batch.Put("key", "v1"); +// batch.Delete("key"); +// batch.Put("key", "v2"); +// batch.Put("key", "v3"); +// +// Multiple threads can invoke const methods on a WriteBatch without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same WriteBatch must use +// external synchronization. + +#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ +#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ + +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class Slice; +struct SliceParts; + +class WriteBatch { + public: + explicit WriteBatch(size_t reserved_bytes = 0); + ~WriteBatch(); + + // Store the mapping "key->value" in the database. + void Put(const Slice& key, const Slice& value); + + // Variant of Put() that gathers output like writev(2). The key and value + // that will be written to the database are concatentations of arrays of + // slices. + void Put(const SliceParts& key, const SliceParts& value); + + // Merge "value" with the existing value of "key" in the database. + // "key->merge(existing, value)" + void Merge(const Slice& key, const Slice& value); + + // If the database contains a mapping for "key", erase it. Else do nothing. + void Delete(const Slice& key); + + // Append a blob of arbitrary size to the records in this batch. The blob will + // be stored in the transaction log but not in any other file. In particular, + // it will not be persisted to the SST files. When iterating over this + // WriteBatch, WriteBatch::Handler::LogData will be called with the contents + // of the blob as it is encountered. Blobs, puts, deletes, and merges will be + // encountered in the same order in thich they were inserted. The blob will + // NOT consume sequence number(s) and will NOT increase the count of the batch + // + // Example application: add timestamps to the transaction log for use in + // replication. 
+ void PutLogData(const Slice& blob); + + // Clear all updates buffered in this batch. + void Clear(); + + // Support for iterating over the contents of a batch. + class Handler { + public: + virtual ~Handler(); + virtual void Put(const Slice& key, const Slice& value) = 0; + // Merge and LogData are not pure virtual. Otherwise, we would break + // existing clients of Handler on a source code level. The default + // implementation of Merge simply throws a runtime exception. + virtual void Merge(const Slice& key, const Slice& value); + // The default implementation of LogData does nothing. + virtual void LogData(const Slice& blob); + virtual void Delete(const Slice& key) = 0; + // Continue is called by WriteBatch::Iterate. If it returns false, + // iteration is halted. Otherwise, it continues iterating. The default + // implementation always returns true. + virtual bool Continue(); + }; + Status Iterate(Handler* handler) const; + + // Retrieve the serialized version of this batch. + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. + size_t GetDataSize() const { return rep_.size(); } + + // Returns the number of updates in the batch + int Count() const; + + // Constructor with a serialized string object + explicit WriteBatch(std::string rep): rep_(rep) {} + + private: + friend class WriteBatchInternal; + + std::string rep_; // See comment in write_batch.cc for the format of rep_ + + // Intentionally copyable +}; + +} // namespace rocksdb + +#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ diff --git a/include/utilities/backupable_db.h b/include/utilities/backupable_db.h new file mode 100644 index 00000000..8f85e061 --- /dev/null +++ b/include/utilities/backupable_db.h @@ -0,0 +1,212 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "utilities/stackable_db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +#include +#include +#include + +namespace rocksdb { + +struct BackupableDBOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // If share_table_files == true, backup will assume that table files with + // same name have the same contents. This enables incremental backups and + // avoids unnecessary data copies. + // If share_table_files == false, each backup will be on its own and will + // not share any data with other backups. + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. 
This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // Default: 0 + uint64_t backup_rate_limit; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // Default: 0 + uint64_t restore_rate_limit; + + void Dump(Logger* logger) const; + + explicit BackupableDBOptions(const std::string& _backup_dir, + Env* _backup_env = nullptr, + bool _share_table_files = true, + Logger* _info_log = nullptr, bool _sync = true, + bool _destroy_old_data = false, + bool _backup_log_files = true, + uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit) {} +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupableDBOptions::backup_log_files = false for + // persisting in-memory databases. 
+ // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id; + int64_t timestamp; + uint64_t size; + + BackupInfo() {} + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) + : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} +}; + +// Please see the documentation in BackupableDB and RestoreBackupableDB +class BackupEngine { + public: + virtual ~BackupEngine() {} + + static BackupEngine* NewBackupEngine(Env* db_env, + const BackupableDBOptions& options); + + virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0; + virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + virtual Status DeleteBackup(BackupID backup_id) = 0; + virtual void StopBackup() = 0; + + virtual void GetBackupInfo(std::vector* backup_info) = 0; + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options = RestoreOptions()) = 0; +}; + +// Stack your DB with BackupableDB to be able to backup the DB +class BackupableDB : public StackableDB { + public: + // BackupableDBOptions have to be the same as the ones used in a previous + // incarnation of the DB + // + // BackupableDB ownes the pointer `DB* db` now. 
You should not delete it or + // use it after the invocation of BackupableDB + BackupableDB(DB* db, const BackupableDBOptions& options); + virtual ~BackupableDB(); + + // Captures the state of the database in the latest backup + // NOT a thread safe call + Status CreateNewBackup(bool flush_before_backup = false); + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediatelly, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up + // next time you create BackupableDB or RestoreBackupableDB. + void StopBackup(); + + private: + BackupEngine* backup_engine_; +}; + +// Use this class to access information about backups and restore from them +class RestoreBackupableDB { + public: + RestoreBackupableDB(Env* db_env, const BackupableDBOptions& options); + ~RestoreBackupableDB(); + + // Returns info about backups in backup_info + void GetBackupInfo(std::vector* backup_info); + + // restore from backup with backup_id + // IMPORTANT -- if options_.share_table_files == true and you restore DB + // from some backup that is not the latest, and you start creating new + // backups from the new DB, they will probably fail + // + // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. + // If you add new data to the DB and try creating a new backup now, the + // database will diverge from backups 4 and 5 and the new backup will fail. + // If you want to create new backup, you will first have to delete backups 4 + // and 5. 
+ Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir, + const RestoreOptions& restore_options = + RestoreOptions()); + + // restore from the latest backup + Status RestoreDBFromLatestBackup(const std::string& db_dir, + const std::string& wal_dir, + const RestoreOptions& restore_options = + RestoreOptions()); + // deletes old backups, keeping latest num_backups_to_keep alive + Status PurgeOldBackups(uint32_t num_backups_to_keep); + // deletes a specific backup + Status DeleteBackup(BackupID backup_id); + + private: + BackupEngine* backup_engine_; +}; + +} // rocksdb namespace diff --git a/include/utilities/geo_db.h b/include/utilities/geo_db.h new file mode 100644 index 00000000..8b3e44b0 --- /dev/null +++ b/include/utilities/geo_db.h @@ -0,0 +1,103 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include +#include + +#include "utilities/stackable_db.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +// +// Configurable options needed for setting up a Geo database +// +struct GeoDBOptions { + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { } +}; + +// +// A position in the earth's geoid +// +class GeoPosition { + public: + double latitude; + double longitude; + + explicit GeoPosition(double la = 0, double lo = 0) : + latitude(la), longitude(lo) { + } +}; + +// +// Description of an object on the Geoid. It is located by a GPS location, +// and is identified by the id. The value associated with this object is +// an opaque string 'value'. 
Different objects identified by unique id's +// can have the same gps-location associated with them. +// +class GeoObject { + public: + GeoPosition position; + std::string id; + std::string value; + + GeoObject() {} + + GeoObject(const GeoPosition& pos, const std::string& i, + const std::string& val) : + position(pos), id(i), value(val) { + } +}; + +// +// Stack your DB with GeoDB to be able to get geo-spatial support +// +class GeoDB : public StackableDB { + public: + // GeoDBOptions have to be the same as the ones used in a previous + // incarnation of the DB + // + // GeoDB owns the pointer `DB* db` now. You should not delete it or + // use it after the invocation of GeoDB + // GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} + GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} + virtual ~GeoDB() {} + + // Insert a new object into the location database. The object is + // uniquely identified by the id. If an object with the same id already + // exists in the db, then the old one is overwritten by the new + // object being inserted here. + virtual Status Insert(const GeoObject& object) = 0; + + // Retrieve the value of the object located at the specified GPS + // location and is identified by the 'id'. + virtual Status GetByPosition(const GeoPosition& pos, + const Slice& id, std::string* value) = 0; + + // Retrieve the value of the object identified by the 'id'. This method + // could be potentially slower than GetByPosition + virtual Status GetById(const Slice& id, GeoObject* object) = 0; + + // Delete the specified object + virtual Status Remove(const Slice& id) = 0; + + // Returns a list of all items within a circular radius from the + // specified gps location. If 'number_of_values' is specified, + // then this call returns at most that many number of objects. + // The radius is specified in 'meters'. 
+ virtual Status SearchRadial(const GeoPosition& pos, + double radius, + std::vector* values, + int number_of_values = INT_MAX) = 0; +}; + +} // namespace rocksdb diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h new file mode 100644 index 00000000..370c920a --- /dev/null +++ b/include/utilities/stackable_db.h @@ -0,0 +1,165 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/db.h" + +namespace rocksdb { + +// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d +class StackableDB : public DB { + public: + // StackableDB is the owner of db now! + explicit StackableDB(DB* db) : db_(db) {} + + ~StackableDB() { + delete db_; + } + + virtual DB* GetBaseDB() { + return db_; + } + + virtual Status Put(const WriteOptions& options, + const Slice& key, + const Slice& val) override { + return db_->Put(options, key, val); + } + + virtual Status Get(const ReadOptions& options, + const Slice& key, + std::string* value) override { + return db_->Get(options, key, value); + } + + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) + override { + return db_->MultiGet(options, keys, values); + } + + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) override { + return db_->KeyMayExist(options, key, value, value_found); + } + + virtual Status Delete(const WriteOptions& wopts, const Slice& key) override { + return db_->Delete(wopts, key); + } + + virtual Status Merge(const WriteOptions& options, + const Slice& key, + const Slice& value) override { + return db_->Merge(options, key, value); + } + + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) + override { + return 
db_->Write(opts, updates); + } + + virtual Iterator* NewIterator(const ReadOptions& opts) override { + return db_->NewIterator(opts); + } + + virtual const Snapshot* GetSnapshot() override { + return db_->GetSnapshot(); + } + + virtual void ReleaseSnapshot(const Snapshot* snapshot) override { + return db_->ReleaseSnapshot(snapshot); + } + + virtual bool GetProperty(const Slice& property, std::string* value) + override { + return db_->GetProperty(property, value); + } + + virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) + override { + return db_->GetApproximateSizes(r, n, sizes); + } + + virtual Status CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, + int target_level = -1) override { + return db_->CompactRange(begin, end, reduce_level, target_level); + } + + virtual int NumberLevels() override { + return db_->NumberLevels(); + } + + virtual int MaxMemCompactionLevel() override { + return db_->MaxMemCompactionLevel(); + } + + virtual int Level0StopWriteTrigger() override { + return db_->Level0StopWriteTrigger(); + } + + virtual const std::string& GetName() const override { + return db_->GetName(); + } + + virtual Env* GetEnv() const override { + return db_->GetEnv(); + } + + virtual const Options& GetOptions() const override { + return db_->GetOptions(); + } + + virtual Status Flush(const FlushOptions& fopts) override { + return db_->Flush(fopts); + } + + virtual Status DisableFileDeletions() override { + return db_->DisableFileDeletions(); + } + + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); + } + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + return db_->GetLiveFiles(vec, mfs, flush_memtable); + } + + virtual SequenceNumber GetLatestSequenceNumber() const override { + return db_->GetLatestSequenceNumber(); + } + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + return 
db_->GetSortedWalFiles(files); + } + + virtual Status DeleteFile(std::string name) override { + return db_->DeleteFile(name); + } + + virtual Status GetDbIdentity(std::string& identity) { + return db_->GetDbIdentity(identity); + } + + virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { + return db_->GetPropertiesOfAllTables(props); + } + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) override { + return db_->GetUpdatesSince(seq_number, iter, read_options); + } + + protected: + DB* db_; +}; + +} // namespace rocksdb diff --git a/include/utilities/utility_db.h b/include/utilities/utility_db.h new file mode 100644 index 00000000..1a7a269d --- /dev/null +++ b/include/utilities/utility_db.h @@ -0,0 +1,50 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "stackable_db.h" + +namespace rocksdb { + +// This class contains APIs to open rocksdb with specific support eg. TTL +class UtilityDB { + + public: + // Open the database with TTL support. + // + // USE-CASES: + // This API should be used to open the db when key-values inserted are + // meant to be removed from the db in a non-strict 'ttl' amount of time + // Therefore, this guarantees that key-values inserted will remain in the + // db for >= ttl amount of time and the db will make efforts to remove the + // key-values as soon as possible after ttl seconds of their insertion. + // + // BEHAVIOUR: + // TTL is accepted in seconds + // (int32_t)Timestamp(creation) is suffixed to values in Put internally + // Expired TTL values deleted in compaction only:(Timestamp+ttl=5 + // read_only=true opens in the usual read-only mode. 
Compactions will not be + // triggered(neither manual nor automatic), so no expired entries removed + // + // CONSTRAINTS: + // Not specifying/passing or non-positive TTL behaves like TTL = infinity + // + // !!!WARNING!!!: + // Calling DB::Open directly to re-open a db created by this API will get + // corrupt values(timestamp suffixed) and no ttl effect will be there + // during the second Open, so use this API consistently to open the db + // Be careful when passing ttl with a small positive value because the + // whole database may be deleted in a small amount of time + static Status OpenTtlDB(const Options& options, + const std::string& name, + StackableDB** dbptr, + int32_t ttl = 0, + bool read_only = false); +}; + +} // namespace rocksdb diff --git a/java/Makefile b/java/Makefile new file mode 100644 index 00000000..10dd4f11 --- /dev/null +++ b/java/Makefile @@ -0,0 +1,24 @@ +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions +NATIVE_INCLUDE = ./include +ROCKSDB_JAR = rocksdbjni.jar + +clean: + -find . -name "*.class" -exec rm {} \; + -find . -name "hs*.log" -exec rm {} \; + rm -f $(ROCKSDB_JAR) + +java: + javac org/rocksdb/*.java + jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class + javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) + +sample: + javac -cp $(ROCKSDB_JAR) RocksDBSample.java + @rm -rf /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni_not_found + java -ea -Djava.library.path=.:../ -cp ".:./*" RocksDBSample /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni_not_found + +test: + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java new file mode 100644 index 00000000..e6421778 --- /dev/null +++ b/java/RocksDBSample.java @@ -0,0 +1,118 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +import java.util.*; +import java.lang.*; +import org.rocksdb.*; +import java.io.IOException; + +public class RocksDBSample { + static { + System.loadLibrary("rocksdbjni"); + } + + public static void main(String[] args) { + if (args.length < 1) { + System.out.println("usage: RocksDBSample db_path"); + return; + } + String db_path = args[0]; + String db_path_not_found = db_path + "_not_found"; + + System.out.println("RocksDBSample"); + RocksDB db = null; + Options options = new Options(); + try { + db = RocksDB.open(options, db_path_not_found); + assert(false); + } catch (RocksDBException e) { + System.out.format("caught the expceted exception -- %s\n", e); + assert(db == null); + } + + options.setCreateIfMissing(true); + try { + db = RocksDB.open(options, db_path_not_found); + db.put("hello".getBytes(), "world".getBytes()); + byte[] value = db.get("hello".getBytes()); + assert("world".equals(new String(value))); + } catch (RocksDBException e) { + System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); + assert(db == null); + assert(false); + } + + // be sure to release the c++ pointer + options.dispose(); + db.close(); + + try { + db = RocksDB.open(db_path); + db.put("hello".getBytes(), "world".getBytes()); + byte[] value = db.get("hello".getBytes()); + System.out.format("Get('hello') = %s\n", + new String(value)); + + for (int i = 1; i <= 9; ++i) { + for (int j = 1; j <= 9; ++j) { + db.put(String.format("%dx%d", i, j).getBytes(), + String.format("%d", i * j).getBytes()); + } + } + + for (int i = 1; i <= 9; ++i) { + for (int j = 1; j <= 9; ++j) { + System.out.format("%s ", new String(db.get( + String.format("%dx%d", i, j).getBytes()))); + } + System.out.println(""); + } + + value = db.get("1x1".getBytes()); + 
assert(value != null); + value = db.get("world".getBytes()); + assert(value == null); + + byte[] testKey = "asdf".getBytes(); + byte[] testValue = + "asdfghjkl;'?> insufficientArray.length); + len = db.get("asdfjkl;".getBytes(), enoughArray); + assert(len == RocksDB.NOT_FOUND); + len = db.get(testKey, enoughArray); + assert(len == testValue.length); + + db.remove(testKey); + len = db.get(testKey, enoughArray); + assert(len == RocksDB.NOT_FOUND); + + // repeat the test with WriteOptions + WriteOptions writeOpts = new WriteOptions(); + writeOpts.setSync(true); + writeOpts.setDisableWAL(true); + db.put(writeOpts, testKey, testValue); + len = db.get(testKey, enoughArray); + assert(len == testValue.length); + assert(new String(testValue).equals( + new String(enoughArray, 0, len))); + writeOpts.dispose(); + } catch (RocksDBException e) { + System.err.println(e); + } + if (db != null) { + db.close(); + } + } +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java new file mode 100644 index 00000000..af759ae8 --- /dev/null +++ b/java/org/rocksdb/Options.java @@ -0,0 +1,69 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Options to control the behavior of a database. It will be used + * during the creation of a RocksDB (i.e., RocksDB::Open()). + * + * Note that dispose() must be called before an Options instance + * become out-of-scope to release the allocated memory in c++. + */ +public class Options { + /** + * Construct options for opening a RocksDB. + * + * This constructor will create (by allocating a block of memory) + * an rocksdb::Options in the c++ side. 
+ */ + public Options() { + nativeHandle_ = 0; + newOptions(); + } + + /** + * If this value is set to true, then the database will be created + * if it is missing during RocksDB::Open(). + * Default: false + * + * @param flag a flag indicating whether to create a database the + * specified database in RocksDB::Open() operation is missing. + * @see RocksDB::Open() + */ + public void setCreateIfMissing(boolean flag) { + assert(nativeHandle_ != 0); + setCreateIfMissing(nativeHandle_, flag); + } + + /** + * Return true if the create_if_missing flag is set to true. + * If true, the database will be created if it is missing. + * + * @return return true if the create_if_missing flag is set to true. + * @see setCreateIfMissing() + */ + public boolean craeteIfMissing() { + assert(nativeHandle_ != 0); + return createIfMissing(nativeHandle_); + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. + */ + public synchronized void dispose() { + if (nativeHandle_ != 0) { + dispose0(); + } + } + + private native void newOptions(); + private native void dispose0(); + private native void setCreateIfMissing(long handle, boolean flag); + private native boolean createIfMissing(long handle); + + long nativeHandle_; +} diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java new file mode 100644 index 00000000..bdab8be1 --- /dev/null +++ b/java/org/rocksdb/RocksDB.java @@ -0,0 +1,163 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.lang.*; +import java.util.*; +import java.io.Closeable; +import java.io.IOException; + +/** + * A RocksDB is a persistent ordered map from keys to values. 
It is safe for + * concurrent access from multiple threads without any external synchronization. + * All methods of this class could potentially throw RocksDBException, which + * indicates sth wrong at the rocksdb library side and the call failed. + */ +public class RocksDB { + public static final int NOT_FOUND = -1; + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the default options w/ createIfMissing + * set to true. + * + * @param path the path to the rocksdb. + * @param status an out value indicating the status of the Open(). + * @return a rocksdb instance on success, null if the specified rocksdb can + * not be opened. + * + * @see Options.setCreateIfMissing() + * @see Options.createIfMissing() + */ + public static RocksDB open(String path) throws RocksDBException { + RocksDB db = new RocksDB(); + db.open0(path); + return db; + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the specified options and db path. + */ + public static RocksDB open(Options options, String path) + throws RocksDBException { + RocksDB db = new RocksDB(); + db.open(options.nativeHandle_, path); + return db; + } + + public synchronized void close() { + if (nativeHandle_ != 0) { + close0(); + } + } + + /** + * Set the database entry for "key" to "value". + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + */ + public void put(byte[] key, byte[] value) throws RocksDBException { + put(nativeHandle_, key, key.length, value, value.length); + } + + /** + * Set the database entry for "key" to "value". + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. 
+ */ + public void put(WriteOptions writeOpts, byte[] key, byte[] value) + throws RocksDBException { + put(nativeHandle_, writeOpts.nativeHandle_, key, key.length, value, value.length); + } + + /** + * Get the value associated with the specified key. + * + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + */ + public int get(byte[] key, byte[] value) throws RocksDBException { + return get(nativeHandle_, key, key.length, value, value.length); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key retrieve the value. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @see RocksDBException + */ + public byte[] get(byte[] key) throws RocksDBException { + return get(nativeHandle_, key, key.length); + } + + /** + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + */ + public void remove(byte[] key) throws RocksDBException { + remove(nativeHandle_, key, key.length); + } + + /** + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. 
+ */ + public void remove(WriteOptions writeOpt, byte[] key) + throws RocksDBException { + remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length); + } + + @Override protected void finalize() { + close(); + } + + /** + * Private constructor. + */ + private RocksDB() { + nativeHandle_ = 0; + } + + // native methods + private native void open0(String path) throws RocksDBException; + private native void open( + long optionsHandle, String path) throws RocksDBException; + private native void put( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + private native void put( + long handle, long writeOptHandle, + byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + private native int get( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + private native byte[] get( + long handle, byte[] key, int keyLen) throws RocksDBException; + private native void remove( + long handle, byte[] key, int keyLen) throws RocksDBException; + private native void remove( + long handle, long writeOptHandle, + byte[] key, int keyLen) throws RocksDBException; + private native void close0(); + + private long nativeHandle_; +} diff --git a/java/org/rocksdb/RocksDBException.java b/java/org/rocksdb/RocksDBException.java new file mode 100644 index 00000000..e426e03e --- /dev/null +++ b/java/org/rocksdb/RocksDBException.java @@ -0,0 +1,24 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.lang.*; +import java.util.*; + +/** + * A RocksDBException encapsulates the error of an operation. This exception + * type is used to describe an internal error from the c++ rocksdb library. 
+ */ +public class RocksDBException extends Exception { + /** + * The private construct used by a set of public static factory method. + * + * @param msg the specified error message. + */ + public RocksDBException(String msg) { + super(msg); + } +} diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java new file mode 100644 index 00000000..acacee3f --- /dev/null +++ b/java/org/rocksdb/WriteBatch.java @@ -0,0 +1,121 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.lang.*; +import java.util.*; + +/** + * WriteBatch holds a collection of updates to apply atomically to a DB. + * + * The updates are applied in the order in which they are added + * to the WriteBatch. For example, the value of "key" will be "v3" + * after the following batch is written: + * + * batch.put("key", "v1"); + * batch.remove("key"); + * batch.put("key", "v2"); + * batch.put("key", "v3"); + * + * Multiple threads can invoke const methods on a WriteBatch without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same WriteBatch must use + * external synchronization. + */ +public class WriteBatch { + public WriteBatch() { + nativeHandle_ = 0; + newWriteBatch(0); + } + + public WriteBatch(int reserved_bytes) { + nativeHandle_ = 0; + newWriteBatch(reserved_bytes); + } + + /** + * Returns the number of updates in the batch. + */ + public native int count(); + + /** + * Store the mapping "key->value" in the database. + */ + public void put(byte[] key, byte[] value) { + put(key, key.length, value, value.length); + } + + /** + * Merge "value" with the existing value of "key" in the database. 
+ * "key->merge(existing, value)" + */ + public void merge(byte[] key, byte[] value) { + merge(key, key.length, value, value.length); + } + + /** + * If the database contains a mapping for "key", erase it. Else do nothing. + */ + public void remove(byte[] key) { + remove(key, key.length); + } + + /** + * Append a blob of arbitrary size to the records in this batch. The blob will + * be stored in the transaction log but not in any other file. In particular, + * it will not be persisted to the SST files. When iterating over this + * WriteBatch, WriteBatch::Handler::LogData will be called with the contents + * of the blob as it is encountered. Blobs, puts, deletes, and merges will be + * encountered in the same order in thich they were inserted. The blob will + * NOT consume sequence number(s) and will NOT increase the count of the batch + * + * Example application: add timestamps to the transaction log for use in + * replication. + */ + public void putLogData(byte[] blob) { + putLogData(blob, blob.length); + } + + /** + * Clear all updates buffered in this batch + */ + public native void clear(); + + /** + * Delete the c++ side pointer. + */ + public synchronized void dispose() { + if (nativeHandle_ != 0) { + dispose0(); + } + } + + @Override protected void finalize() { + dispose(); + } + + private native void newWriteBatch(int reserved_bytes); + private native void put(byte[] key, int keyLen, + byte[] value, int valueLen); + private native void merge(byte[] key, int keyLen, + byte[] value, int valueLen); + private native void remove(byte[] key, int keyLen); + private native void putLogData(byte[] blob, int blobLen); + private native void dispose0(); + + private long nativeHandle_; +} + +/** + * Package-private class which provides java api to access + * c++ WriteBatchInternal. 
+ */ +class WriteBatchInternal { + static native void setSequence(WriteBatch batch, long sn); + static native long sequence(WriteBatch batch); + static native void append(WriteBatch b1, WriteBatch b2); +} + diff --git a/java/org/rocksdb/WriteBatchTest.java b/java/org/rocksdb/WriteBatchTest.java new file mode 100644 index 00000000..283caca6 --- /dev/null +++ b/java/org/rocksdb/WriteBatchTest.java @@ -0,0 +1,125 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +package org.rocksdb; + +import java.util.*; +import java.lang.*; +import java.io.UnsupportedEncodingException; + +/** + * This class mimics the db/write_batch_test.cc in the c++ rocksdb library. + */ +public class WriteBatchTest { + static { + System.loadLibrary("rocksdbjni"); + } + + public static void main(String args[]) { + System.out.println("Testing WriteBatchTest.Empty ==="); + Empty(); + + System.out.println("Testing WriteBatchTest.Multiple ==="); + Multiple(); + + System.out.println("Testing WriteBatchTest.Append ==="); + Append(); + + System.out.println("Testing WriteBatchTest.Blob ==="); + Blob(); + + // The following tests have not yet ported. 
+ // Continue(); + // PutGatherSlices(); + + System.out.println("Passed all WriteBatchTest!"); + } + + static void Empty() { + WriteBatch batch = new WriteBatch(); + assert(batch.count() == 0); + } + + static void Multiple() { + try { + WriteBatch batch = new WriteBatch(); + batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + batch.remove("box".getBytes("US-ASCII")); + batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); + WriteBatchInternal.setSequence(batch, 100); + assert(100 == WriteBatchInternal.sequence(batch)); + assert(3 == batch.count()); + assert(new String("Put(baz, boo)@102" + + "Delete(box)@101" + + "Put(foo, bar)@100") + .equals(new String(getContents(batch), "US-ASCII"))); + } catch (UnsupportedEncodingException e) { + System.err.println(e); + assert(false); + } + } + + static void Append() { + WriteBatch b1 = new WriteBatch(); + WriteBatch b2 = new WriteBatch(); + WriteBatchInternal.setSequence(b1, 200); + WriteBatchInternal.setSequence(b2, 300); + WriteBatchInternal.append(b1, b2); + assert(getContents(b1).length == 0); + assert(b1.count() == 0); + try { + b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); + WriteBatchInternal.append(b1, b2); + assert("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII"))); + assert(1 == b1.count()); + b2.clear(); + b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); + WriteBatchInternal.append(b1, b2); + assert(new String("Put(a, va)@200" + + "Put(b, vb)@201") + .equals(new String(getContents(b1), "US-ASCII"))); + assert(2 == b1.count()); + b2.remove("foo".getBytes("US-ASCII")); + WriteBatchInternal.append(b1, b2); + assert(new String("Put(a, va)@200" + + "Put(b, vb)@202" + + "Put(b, vb)@201" + + "Delete(foo)@203") + .equals(new String(getContents(b1), "US-ASCII"))); + assert(4 == b1.count()); + } catch (UnsupportedEncodingException e) { + System.err.println(e); + assert(false); + } + } + + static void Blob() { + WriteBatch batch = new WriteBatch(); + try 
{ + batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); + batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); + batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); + batch.putLogData("blob1".getBytes("US-ASCII")); + batch.remove("k2".getBytes("US-ASCII")); + batch.putLogData("blob2".getBytes("US-ASCII")); + batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + assert(5 == batch.count()); + assert(new String("Merge(foo, bar)@4" + + "Put(k1, v1)@0" + + "Delete(k2)@3" + + "Put(k2, v2)@1" + + "Put(k3, v3)@2") + .equals(new String(getContents(batch), "US-ASCII"))); + } catch (UnsupportedEncodingException e) { + System.err.println(e); + assert(false); + } + } + + static native byte[] getContents(WriteBatch batch); +} diff --git a/java/org/rocksdb/WriteOptions.java b/java/org/rocksdb/WriteOptions.java new file mode 100644 index 00000000..26f0e2b7 --- /dev/null +++ b/java/org/rocksdb/WriteOptions.java @@ -0,0 +1,96 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Options that control write operations. + * + * Note that developers should call WriteOptions.dispose() to release the + * c++ side memory before a WriteOptions instance runs out of scope. + */ +public class WriteOptions { + public WriteOptions() { + nativeHandle_ = 0; + newWriteOptions(); + } + + public synchronized void dispose() { + if (nativeHandle_ != 0) { + dispose0(nativeHandle_); + } + } + + /** + * If true, the write will be flushed from the operating system + * buffer cache (by calling WritableFile::Sync()) before the write + * is considered complete. If this flag is true, writes will be + * slower. 
+ * + * If this flag is false, and the machine crashes, some recent + * writes may be lost. Note that if it is just the process that + * crashes (i.e., the machine does not reboot), no writes will be + * lost even if sync==false. + * + * In other words, a DB write with sync==false has similar + * crash semantics as the "write()" system call. A DB write + * with sync==true has similar crash semantics to a "write()" + * system call followed by "fdatasync()". + * + * Default: false + */ + public void setSync(boolean flag) { + setSync(nativeHandle_, flag); + } + + /** + * If true, the write will be flushed from the operating system + * buffer cache (by calling WritableFile::Sync()) before the write + * is considered complete. If this flag is true, writes will be + * slower. + * + * If this flag is false, and the machine crashes, some recent + * writes may be lost. Note that if it is just the process that + * crashes (i.e., the machine does not reboot), no writes will be + * lost even if sync==false. + * + * In other words, a DB write with sync==false has similar + * crash semantics as the "write()" system call. A DB write + * with sync==true has similar crash semantics to a "write()" + * system call followed by "fdatasync()". + */ + public boolean sync() { + return sync(nativeHandle_); + } + + /** + * If true, writes will not first go to the write ahead log, + * and the write may got lost after a crash. + */ + public void setDisableWAL(boolean flag) { + setDisableWAL(nativeHandle_, flag); + } + + /** + * If true, writes will not first go to the write ahead log, + * and the write may got lost after a crash. 
+ */ + public boolean disableWAL() { + return disableWAL(nativeHandle_); + } + + @Override protected void finalize() { + dispose(); + } + + private native void newWriteOptions(); + private native void setSync(long handle, boolean flag); + private native boolean sync(long handle); + private native void setDisableWAL(long handle, boolean flag); + private native boolean disableWAL(long handle); + private native void dispose0(long handle); + + protected long nativeHandle_; +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc new file mode 100644 index 00000000..69224f6d --- /dev/null +++ b/java/rocksjni/options.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for rocksdb::Options. 
+ +#include +#include +#include +#include + +#include "include/org_rocksdb_Options.h" +#include "include/org_rocksdb_WriteOptions.h" +#include "rocksjni/portal.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" + +/* + * Class: org_rocksdb_Options + * Method: newOptions + * Signature: ()V + */ +void Java_org_rocksdb_Options_newOptions(JNIEnv* env, jobject jobj) { + rocksdb::Options* op = new rocksdb::Options(); + rocksdb::OptionsJni::setHandle(env, jobj, op); +} + +/* + * Class: org_rocksdb_Options + * Method: dispose0 + * Signature: ()V + */ +void Java_org_rocksdb_Options_dispose0(JNIEnv* env, jobject jobj) { + rocksdb::Options* op = rocksdb::OptionsJni::getHandle(env, jobj); + delete op; + + rocksdb::OptionsJni::setHandle(env, jobj, op); +} + +/* + * Class: org_rocksdb_Options + * Method: setCreateIfMissing + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setCreateIfMissing( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + reinterpret_cast(jhandle)->create_if_missing = flag; +} + +/* + * Class: org_rocksdb_Options + * Method: createIfMissing + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_createIfMissing( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->create_if_missing; +} + +////////////////////////////////////////////////////////////////////////////// +// WriteOptions + +/* + * Class: org_rocksdb_WriteOptions + * Method: newWriteOptions + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_newWriteOptions( + JNIEnv* env, jobject jwrite_options) { + rocksdb::WriteOptions* op = new rocksdb::WriteOptions(); + rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: dispose0 + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_dispose0( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + auto write_options = reinterpret_cast(jhandle); + delete write_options; + + rocksdb::WriteOptionsJni::setHandle(env, 
jwrite_options, nullptr); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: setSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_WriteOptions_setSync( + JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + reinterpret_cast(jhandle)->sync = jflag; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: sync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_sync( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + return reinterpret_cast(jhandle)->sync; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: setDisableWAL + * Signature: (JZ)V + */ +void Java_org_rocksdb_WriteOptions_setDisableWAL( + JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + reinterpret_cast(jhandle)->disableWAL = jflag; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: disableWAL + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_disableWAL( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + return reinterpret_cast(jhandle)->disableWAL; +} + + diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h new file mode 100644 index 00000000..5b0524ae --- /dev/null +++ b/java/rocksjni/portal.h @@ -0,0 +1,174 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +// This file is designed for caching those frequently used IDs and provide +// efficient portal (i.e, a set of static functions) to access java code +// from c++. + +#ifndef JAVA_ROCKSJNI_PORTAL_H_ +#define JAVA_ROCKSJNI_PORTAL_H_ + +#include +#include "rocksdb/db.h" + +namespace rocksdb { + +// The portal class for org.rocksdb.RocksDB +class RocksDBJni { + public: + // Get the java class id of org.rocksdb.RocksDB. 
+ static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/RocksDB"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.RocksDB + // that stores the pointer to rocksdb::DB. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::DB of the specified org.rocksdb.RocksDB. + static rocksdb::DB* getHandle(JNIEnv* env, jobject jdb) { + return reinterpret_cast( + env->GetLongField(jdb, getHandleFieldID(env))); + } + + // Pass the rocksdb::DB pointer to the java side. + static void setHandle(JNIEnv* env, jobject jdb, rocksdb::DB* db) { + env->SetLongField( + jdb, getHandleFieldID(env), + reinterpret_cast(db)); + } +}; + +// The portal class for org.rocksdb.RocksDBException +class RocksDBExceptionJni { + public: + // Get the jclass of org.rocksdb.RocksDBException + static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/RocksDBException"); + assert(jclazz != nullptr); + return jclazz; + } + + // Create and throw a java exception by converting the input + // Status to an RocksDBException. + // + // In case s.ok() is true, then this function will not throw any + // exception. + static void ThrowNew(JNIEnv* env, Status s) { + if (s.ok()) { + return; + } + jstring msg = env->NewStringUTF(s.ToString().c_str()); + // get the constructor id of org.rocksdb.RocksDBException + static jmethodID mid = env->GetMethodID( + getJClass(env), "", "(Ljava/lang/String;)V"); + assert(mid != nullptr); + + env->Throw((jthrowable)env->NewObject(getJClass(env), mid, msg)); + } +}; + +class OptionsJni { + public: + // Get the java class id of org.rocksdb.Options. 
+ static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/Options"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.Options + // that stores the pointer to rocksdb::Options + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::Options + static rocksdb::Options* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::Options pointer to the java side. + static void setHandle(JNIEnv* env, jobject jobj, rocksdb::Options* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class WriteOptionsJni { + public: + // Get the java class id of org.rocksdb.WriteOptions. + static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/WriteOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.WriteOptions + // that stores the pointer to rocksdb::WriteOptions + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::WriteOptions + static rocksdb::WriteOptions* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::WriteOptions pointer to the java side. 
+ static void setHandle(JNIEnv* env, jobject jobj, rocksdb::WriteOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class WriteBatchJni { + public: + static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/WriteBatch"); + assert(jclazz != nullptr); + return jclazz; + } + + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::WriteBatch of the specified + // org.rocksdb.WriteBatch. + static rocksdb::WriteBatch* getHandle(JNIEnv* env, jobject jwb) { + return reinterpret_cast( + env->GetLongField(jwb, getHandleFieldID(env))); + } + + // Pass the rocksdb::WriteBatch pointer to the java side. + static void setHandle(JNIEnv* env, jobject jwb, rocksdb::WriteBatch* wb) { + env->SetLongField( + jwb, getHandleFieldID(env), + reinterpret_cast(wb)); + } +}; +} // namespace rocksdb +#endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc new file mode 100644 index 00000000..ccd87105 --- /dev/null +++ b/java/rocksjni/rocksjni.cc @@ -0,0 +1,285 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::DB methods from Java side. 
+ +#include +#include +#include +#include + +#include "include/org_rocksdb_RocksDB.h" +#include "rocksjni/portal.h" +#include "rocksdb/db.h" + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Open + +void rocksdb_open_helper( + JNIEnv* env, jobject java_db, jstring jdb_path, const rocksdb::Options& opt) { + rocksdb::DB* db; + + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + rocksdb::Status s = rocksdb::DB::Open(opt, db_path, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, java_db, db); + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: open0 + * Signature: (Ljava/lang/String;)V + */ +void Java_org_rocksdb_RocksDB_open0( + JNIEnv* env, jobject jdb, jstring jdb_path) { + rocksdb::Options options; + options.create_if_missing = true; + + rocksdb_open_helper(env, jdb, jdb_path, options); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: open + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_RocksDB_open( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { + auto options = reinterpret_cast(jopt_handle); + rocksdb_open_helper(env, jdb, jdb_path, *options); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Put + +void rocksdb_put_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + + jbyte* key = env->GetByteArrayElements(jkey, 0); + jbyte* value = env->GetByteArrayElements(jvalue, 0); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + + rocksdb::Status s = db->Put(write_options, key_slice, value_slice); + + // trigger java unref on key and value. 
+ // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_put__J_3BI_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + + rocksdb_put_helper(env, db, default_write_options, + jkey, jkey_len, + jvalue, jvalue_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (JJ[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + + rocksdb_put_helper(env, db, *write_options, + jkey, jkey_len, + jvalue, jvalue_len); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Get + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BI)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len) { + auto db = reinterpret_cast(jdb_handle); + + jboolean isCopy; + jbyte* key = env->GetByteArrayElements(jkey, &isCopy); + rocksdb::Slice key_slice( + reinterpret_cast(key), jkey_len); + + std::string value; + rocksdb::Status s = db->Get( + rocksdb::ReadOptions(), + key_slice, &value); + + // trigger java unref on key. 
+ // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (s.IsNotFound()) { + return nullptr; + } + + if (s.ok()) { + jbyteArray jvalue = env->NewByteArray(value.size()); + env->SetByteArrayRegion( + jvalue, 0, value.size(), + reinterpret_cast(value.c_str())); + return jvalue; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + + return nullptr; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BI[BI)I + */ +jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + static const int kNotFound = -1; + static const int kStatusError = -2; + auto db = reinterpret_cast(jdb_handle); + + jbyte* key = env->GetByteArrayElements(jkey, 0); + rocksdb::Slice key_slice( + reinterpret_cast(key), jkey_len); + + // TODO(yhchiang): we might save one memory allocation here by adding + // a DB::Get() function which takes preallocated jbyte* as input. + std::string cvalue; + rocksdb::Status s = db->Get( + rocksdb::ReadOptions(), key_slice, &cvalue); + + // trigger java unref on key. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (s.IsNotFound()) { + return kNotFound; + } else if (!s.ok()) { + // Here since we are throwing a Java exception from c++ side. + // As a result, c++ does not know calling this function will in fact + // throwing an exception. As a result, the execution flow will + // not stop here, and codes after this throw will still be + // executed. 
+ rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + + // Return a dummy const value to avoid compilation error, although + // java side might not have a chance to get the return value :) + return kStatusError; + } + + int cvalue_len = static_cast(cvalue.size()); + int length = std::min(jvalue_len, cvalue_len); + + env->SetByteArrayRegion( + jvalue, 0, length, + reinterpret_cast(cvalue.c_str())); + return cvalue_len; +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Delete() +void rocksdb_remove_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, + jbyteArray jkey, jint jkey_len) { + jbyte* key = env->GetByteArrayElements(jkey, 0); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + + rocksdb::Status s = db->Delete(write_options, key_slice); + + // trigger java unref on key and value. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. 
+ env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } + return; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: remove + * Signature: (J[BI)V + */ +void Java_org_rocksdb_RocksDB_remove__J_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + + rocksdb_remove_helper(env, db, default_write_options, jkey, jkey_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: remove + * Signature: (JJ[BI)V + */ +void Java_org_rocksdb_RocksDB_remove__JJ_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jwrite_options, jbyteArray jkey, jint jkey_len) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast(jwrite_options); + + rocksdb_remove_helper(env, db, *write_options, jkey, jkey_len); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::~DB() + +/* + * Class: org_rocksdb_RocksDB + * Method: close0 + * Signature: ()V + */ +void Java_org_rocksdb_RocksDB_close0( + JNIEnv* env, jobject java_db) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, java_db); + assert(db != nullptr); + delete db; + + rocksdb::RocksDBJni::setHandle(env, java_db, nullptr); +} diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc new file mode 100644 index 00000000..f72c3ba6 --- /dev/null +++ b/java/rocksjni/write_batch.cc @@ -0,0 +1,263 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::WriteBatch methods from Java side. +#include + +#include "include/org_rocksdb_WriteBatch.h" +#include "include/org_rocksdb_WriteBatchInternal.h" +#include "include/org_rocksdb_WriteBatchTest.h" +#include "rocksjni/portal.h" +#include "rocksdb/db.h" +#include "db/memtable.h" +#include "rocksdb/write_batch.h" +#include "db/write_batch_internal.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "util/logging.h" +#include "util/testharness.h" + +/* + * Class: org_rocksdb_WriteBatch + * Method: newWriteBatch + * Signature: (I)V + */ +void Java_org_rocksdb_WriteBatch_newWriteBatch( + JNIEnv* env, jobject jobj, jint jreserved_bytes) { + rocksdb::WriteBatch* wb = new rocksdb::WriteBatch( + static_cast(jreserved_bytes)); + + rocksdb::WriteBatchJni::setHandle(env, jobj, wb); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: count + * Signature: ()I + */ +jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + return static_cast(wb->Count()); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: clear + * Signature: ()V + */ +void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + wb->Clear(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: put + * Signature: ([BI[BI)V + */ +void Java_org_rocksdb_WriteBatch_put( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + jbyte* value = env->GetByteArrayElements(jvalue, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice 
value_slice(reinterpret_cast(value), jvalue_len); + wb->Put(key_slice, value_slice); + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: merge + * Signature: ([BI[BI)V + */ +JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + jbyte* value = env->GetByteArrayElements(jvalue, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + wb->Merge(key_slice, value_slice); + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: remove + * Signature: ([BI)V + */ +JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_remove( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + wb->Delete(key_slice); + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: putLogData + * Signature: ([BI)V + */ +void Java_org_rocksdb_WriteBatch_putLogData( + JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + jbyte* blob = env->GetByteArrayElements(jblob, nullptr); + rocksdb::Slice blob_slice(reinterpret_cast(blob), jblob_len); + wb->PutLogData(blob_slice); + env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT); +} + +/* + * Class: 
org_rocksdb_WriteBatch + * Method: dispose0 + * Signature: ()V + */ +void Java_org_rocksdb_WriteBatch_dispose0(JNIEnv* env, jobject jobj) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + delete wb; + + rocksdb::WriteBatchJni::setHandle(env, jobj, nullptr); +} + +/* + * Class: org_rocksdb_WriteBatchInternal + * Method: setSequence + * Signature: (Lorg/rocksdb/WriteBatch;J)V + */ +void Java_org_rocksdb_WriteBatchInternal_setSequence( + JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + rocksdb::WriteBatchInternal::SetSequence( + wb, static_cast(jsn)); +} + +/* + * Class: org_rocksdb_WriteBatchInternal + * Method: sequence + * Signature: (Lorg/rocksdb/WriteBatch;)J + */ +jlong Java_org_rocksdb_WriteBatchInternal_sequence( + JNIEnv* env, jclass jclazz, jobject jobj) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + return static_cast(rocksdb::WriteBatchInternal::Sequence(wb)); +} + +/* + * Class: org_rocksdb_WriteBatchInternal + * Method: append + * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V + */ +void Java_org_rocksdb_WriteBatchInternal_append( + JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) { + rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1); + assert(wb1 != nullptr); + rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2); + assert(wb2 != nullptr); + + rocksdb::WriteBatchInternal::Append(wb1, wb2); +} + +/* + * Class: org_rocksdb_WriteBatchTest + * Method: getContents + * Signature: (Lorg/rocksdb/WriteBatch;)[B + */ +jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( + JNIEnv* env, jclass jclazz, jobject jobj) { + rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(b != nullptr); + + // todo: Currently the following code is directly copied from + // 
db/write_bench_test.cc. It could be implemented in java once + // all the necessary components can be accessed via jni api. + + rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator()); + auto factory = std::make_shared(); + rocksdb::Options options; + options.memtable_factory = factory; + rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); + mem->Ref(); + std::string state; + rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, mem, &options); + int count = 0; + rocksdb::Iterator* iter = mem->NewIterator(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + rocksdb::ParsedInternalKey ikey; + memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); + ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case rocksdb::kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeMerge: + state.append("Merge("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + default: + assert(false); + break; + } + state.append("@"); + state.append(rocksdb::NumberToString(ikey.sequence)); + } + delete iter; + if (!s.ok()) { + state.append(s.ToString()); + } else if (count != rocksdb::WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + delete mem->Unref(); + + jbyteArray jstate = env->NewByteArray(state.size()); + env->SetByteArrayRegion( + jstate, 0, state.size(), + reinterpret_cast(state.c_str())); + + return jstate; +} + diff --git a/linters/__phutil_library_init__.php b/linters/__phutil_library_init__.php new file mode 100644 index 00000000..4b8d3d13 --- /dev/null +++ b/linters/__phutil_library_init__.php @@ -0,0 +1,3 @@ 
+ 2, + 'class' => + array( + 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', + 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', + 'PfffCppLinter' => 'cpp_linter/PfffCppLinter.php', + 'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php', + ), + 'function' => + array( + ), + 'xmap' => + array( + 'FacebookFbcodeLintEngine' => 'ArcanistLintEngine', + 'FbcodeCppLinter' => 'ArcanistLinter', + 'PfffCppLinter' => 'ArcanistLinter', + ), +)); diff --git a/linters/cpp_linter/ArcanistCpplintLinter.php b/linters/cpp_linter/ArcanistCpplintLinter.php new file mode 100644 index 00000000..b9c41375 --- /dev/null +++ b/linters/cpp_linter/ArcanistCpplintLinter.php @@ -0,0 +1,88 @@ +linterDir(), $bin); + if (!$err) { + return $this->linterDir().'/'.$bin; + } + + // Look for globally installed cpplint.py + list($err) = exec_manual('which %s', $bin); + if ($err) { + throw new ArcanistUsageException( + "cpplint.py does not appear to be installed on this system. Install ". + "it (e.g., with 'wget \"http://google-styleguide.googlecode.com/". + "svn/trunk/cpplint/cpplint.py\"') ". + "in your .arcconfig to point to the directory where it resides. ". 
+ "Also don't forget to chmod a+x cpplint.py!"); + } + + return $bin; + } + + public function lintPath($path) { + $bin = $this->getLintPath(); + $path = $this->rocksdbDir().'/'.$path; + + $f = new ExecFuture("%C $path", $bin); + + list($err, $stdout, $stderr) = $f->resolve(); + + if ($err === 2) { + throw new Exception("cpplint failed to run correctly:\n".$stderr); + } + + $lines = explode("\n", $stderr); + $messages = array(); + foreach ($lines as $line) { + $line = trim($line); + $matches = null; + $regex = '/^[^:]+:(\d+):\s*(.*)\s*\[(.*)\] \[(\d+)\]$/'; + if (!preg_match($regex, $line, $matches)) { + continue; + } + foreach ($matches as $key => $match) { + $matches[$key] = trim($match); + } + $message = new ArcanistLintMessage(); + $message->setPath($path); + $message->setLine($matches[1]); + $message->setCode($matches[3]); + $message->setName($matches[3]); + $message->setDescription($matches[2]); + $message->setSeverity(ArcanistLintSeverity::SEVERITY_WARNING); + $this->addLintMessage($message); + } + } + + // The path of this linter + private function linterDir() { + return dirname(__FILE__); + } + + // TODO(kaili) a quick and dirty way to figure out rocksdb's root dir. 
+ private function rocksdbDir() { + return $this->linterDir()."/../.."; + } +} diff --git a/linters/cpp_linter/FbcodeCppLinter.php b/linters/cpp_linter/FbcodeCppLinter.php new file mode 100644 index 00000000..e62d3bbe --- /dev/null +++ b/linters/cpp_linter/FbcodeCppLinter.php @@ -0,0 +1,99 @@ +getEngine()->getFilePathOnDisk($p); + $lpath_file = file($lpath); + if (preg_match('/\.(c)$/', $lpath) || + preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) || + preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0]) + ) { + $futures[$p] = new ExecFuture("%s %s %s 2>&1", + $CPP_LINT, self::C_FLAG, + $this->getEngine()->getFilePathOnDisk($p)); + } else { + $futures[$p] = new ExecFuture("%s %s 2>&1", + self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p)); + } + } + + foreach (Futures($futures)->limit(8) as $p => $f) { + $this->rawLintOutput[$p] = $f->resolvex(); + } + } + return; + } + + public function getLinterName() { + return "FBCPP"; + } + + public function lintPath($path) { + $msgs = $this->getCppLintOutput($path); + foreach ($msgs as $m) { + $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']); + } + } + + public function getLintSeverityMap() { + return array( + self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING, + self::LINT_ERROR => ArcanistLintSeverity::SEVERITY_ERROR + ); + } + + public function getLintNameMap() { + return array( + self::LINT_WARNING => "CppLint Warning", + self::LINT_ERROR => "CppLint Error" + ); + } + + private function getCppLintOutput($path) { + list($output) = $this->rawLintOutput[$path]; + + $msgs = array(); + $current = null; + foreach (explode("\n", $output) as $line) { + if (preg_match('/[^:]*\((\d+)\):(.*)$/', $line, $matches)) { + if ($current) { + $msgs[] = $current; + } + $line = $matches[1]; + $text = $matches[2]; + $sev = preg_match('/.*Warning.*/', $text) + ? 
self::LINT_WARNING + : self::LINT_ERROR; + $current = array('line' => $line, + 'msg' => $text, + 'severity' => $sev); + } else if ($current) { + $current['msg'] .= ' ' . $line; + } + } + if ($current) { + $msgs[] = $current; + } + + return $msgs; + } +} + diff --git a/linters/cpp_linter/PfffCppLinter.php b/linters/cpp_linter/PfffCppLinter.php new file mode 100644 index 00000000..67366143 --- /dev/null +++ b/linters/cpp_linter/PfffCppLinter.php @@ -0,0 +1,68 @@ +&1", + $program, $this->getEngine()->getFilePathOnDisk($p)); + } + foreach (Futures($futures)->limit(8) as $p => $f) { + + list($stdout, $stderr) = $f->resolvex(); + $raw = json_decode($stdout, true); + if (!is_array($raw)) { + throw new Exception( + "checkCpp returned invalid JSON!". + "Stdout: {$stdout} Stderr: {$stderr}" + ); + } + foreach($raw as $err) { + $this->addLintMessage( + ArcanistLintMessage::newFromDictionary( + array( + 'path' => $err['file'], + 'line' => $err['line'], + 'char' => 0, + 'name' => $err['name'], + 'description' => $err['info'], + 'code' => $this->getLinterName(), + 'severity' => ArcanistLintSeverity::SEVERITY_WARNING, + ) + ) + ); + } + } + } + return; + } + + public function lintPath($path) { + return; + } +} diff --git a/linters/cpp_linter/cpplint.py b/linters/cpp_linter/cpplint.py new file mode 100755 index 00000000..d264b00d --- /dev/null +++ b/linters/cpp_linter/cpplint.py @@ -0,0 +1,4767 @@ +#!/usr/bin/python +# Copyright (c) 2013, Facebook, Inc. All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. +# Copyright (c) 2011 The LevelDB Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. See the AUTHORS file for names of contributors. +# +# Copyright (c) 2009 Google Inc. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Does google-lint on c++ files. + +The goal of this script is to identify places in the code that *may* +be in non-compliance with google style. It does not attempt to fix +up these problems -- the point is to educate. It does also not +attempt to find all problems, or to ensure that everything it does +find is legitimately a problem. + +In particular, we can get very confused by /* and // inside strings! 
+We do a small hack, which is to ignore //'s with "'s after them on the +same line, but it is far from perfect (in either direction). +""" + +import codecs +import copy +import getopt +import math # for log +import os +import re +import sre_compile +import string +import sys +import unicodedata + + +_USAGE = """ +Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] + [--counting=total|toplevel|detailed] [--root=subdir] + [--linelength=digits] + [file] ... + + The style guidelines this tries to follow are those in + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + + Every problem is given a confidence score from 1-5, with 5 meaning we are + certain of the problem, and 1 meaning it could be a legitimate construct. + This will miss some errors, and is not a substitute for a code review. + + To suppress false-positive errors of a certain category, add a + 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*) + suppresses errors of all categories on that line. + + The files passed in will be linted; at least one file must be provided. + Default linted extensions are .cc, .cpp, .cu, .cuh and .h. Change the + extensions with the --extensions flag. + + Flags: + + output=vs7 + By default, the output is formatted to ease emacs parsing. Visual Studio + compatible output (vs7) may also be used. Other formats are unsupported. + + verbose=# + Specify a number 0-5 to restrict errors to certain verbosity levels. + + filter=-x,+y,... + Specify a comma-separated list of category-filters to apply: only + error messages whose category names pass the filters will be printed. + (Category names are printed with the message and look like + "[whitespace/indent]".) Filters are evaluated left to right. + "-FOO" and "FOO" means "do not print categories that start with FOO". + "+FOO" means "do print categories that start with FOO". 
+
+      Examples: --filter=-whitespace,+whitespace/braces
+                --filter=whitespace,runtime/printf,+runtime/printf_format
+                --filter=-,+build/include_what_you_use
+
+      To see a list of all the categories used in cpplint, pass no arg:
+         --filter=
+
+    counting=total|toplevel|detailed
+      The total number of errors found is always printed. If
+      'toplevel' is provided, then the count of errors in each of
+      the top-level categories like 'build' and 'whitespace' will
+      also be printed. If 'detailed' is provided, then a count
+      is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
+
+    linelength=digits
+      This is the allowed line length for the project. The default value is
+      80 characters.
+
+      Examples:
+        --linelength=120
+
+    extensions=extension,extension,...
+      The allowed file extensions that cpplint will check
+
+      Examples:
+        --extensions=hpp,cpp
+"""
+
+# We categorize each error message we print.  Here are the categories.
+# We want an explicit list so we can list them all in cpplint --filter=.
+# If you add a new error message with a new category, add it to the list
+# here!  cpplint_unittest.py should tell you if you forget to do this.
+_ERROR_CATEGORIES = [
+  'build/class',
+  'build/deprecated',
+  'build/endif_comment',
+  'build/explicit_make_pair',
+  'build/forward_decl',
+  'build/header_guard',
+  'build/include',
+  'build/include_alpha',
+  'build/include_order',
+  'build/include_what_you_use',
+  'build/namespaces',
+  'build/printf_format',
+  'build/storage_class',
+  'legal/copyright',
+  'readability/alt_tokens',
+  'readability/braces',
+  'readability/casting',
+  'readability/check',
+  'readability/constructors',
+  'readability/fn_size',
+  'readability/function',
+  'readability/multiline_comment',
+  'readability/multiline_string',
+  'readability/namespace',
+  'readability/nolint',
+  'readability/nul',
+  'readability/streams',
+  'readability/todo',
+  'readability/utf8',
+  'runtime/arrays',
+  'runtime/casting',
+  'runtime/explicit',
+  'runtime/int',
+  'runtime/init',
+  'runtime/invalid_increment',
+  'runtime/member_string_references',
+  'runtime/memset',
+  'runtime/operator',
+  'runtime/printf',
+  'runtime/printf_format',
+  'runtime/references',
+  'runtime/string',
+  'runtime/threadsafe_fn',
+  'runtime/vlog',
+  'whitespace/blank_line',
+  'whitespace/braces',
+  'whitespace/comma',
+  'whitespace/comments',
+  'whitespace/empty_conditional_body',
+  'whitespace/empty_loop_body',
+  'whitespace/end_of_line',
+  'whitespace/ending_newline',
+  'whitespace/forcolon',
+  'whitespace/indent',
+  'whitespace/line_length',
+  'whitespace/newline',
+  'whitespace/operators',
+  'whitespace/parens',
+  'whitespace/semicolon',
+  'whitespace/tab',
+  'whitespace/todo'
+  ]
+
+# The default state of the category filter. This is overridden by the --filter=
+# flag. By default all errors are on, so only add here categories that should be
+# off by default (i.e., categories that must be enabled by the --filter= flags).
+# All entries here should start with a '-' or '+', as in the --filter= flag.
+_DEFAULT_FILTERS = ['-build/include_alpha'] + +# We used to check for high-bit characters, but after much discussion we +# decided those were OK, as long as they were in UTF-8 and didn't represent +# hard-coded international strings, which belong in a separate i18n file. + + +# C++ headers +_CPP_HEADERS = frozenset([ + # Legacy + 'algobase.h', + 'algo.h', + 'alloc.h', + 'builtinbuf.h', + 'bvector.h', + 'complex.h', + 'defalloc.h', + 'deque.h', + 'editbuf.h', + 'fstream.h', + 'function.h', + 'hash_map', + 'hash_map.h', + 'hash_set', + 'hash_set.h', + 'hashtable.h', + 'heap.h', + 'indstream.h', + 'iomanip.h', + 'iostream.h', + 'istream.h', + 'iterator.h', + 'list.h', + 'map.h', + 'multimap.h', + 'multiset.h', + 'ostream.h', + 'pair.h', + 'parsestream.h', + 'pfstream.h', + 'procbuf.h', + 'pthread_alloc', + 'pthread_alloc.h', + 'rope', + 'rope.h', + 'ropeimpl.h', + 'set.h', + 'slist', + 'slist.h', + 'stack.h', + 'stdiostream.h', + 'stl_alloc.h', + 'stl_relops.h', + 'streambuf.h', + 'stream.h', + 'strfile.h', + 'strstream.h', + 'tempbuf.h', + 'tree.h', + 'type_traits.h', + 'vector.h', + # 17.6.1.2 C++ library headers + 'algorithm', + 'array', + 'atomic', + 'bitset', + 'chrono', + 'codecvt', + 'complex', + 'condition_variable', + 'deque', + 'exception', + 'forward_list', + 'fstream', + 'functional', + 'future', + 'initializer_list', + 'iomanip', + 'ios', + 'iosfwd', + 'iostream', + 'istream', + 'iterator', + 'limits', + 'list', + 'locale', + 'map', + 'memory', + 'mutex', + 'new', + 'numeric', + 'ostream', + 'queue', + 'random', + 'ratio', + 'regex', + 'set', + 'sstream', + 'stack', + 'stdexcept', + 'streambuf', + 'string', + 'strstream', + 'system_error', + 'thread', + 'tuple', + 'typeindex', + 'typeinfo', + 'type_traits', + 'unordered_map', + 'unordered_set', + 'utility', + 'valarray', + 'vector', + # 17.6.1.2 C++ headers for C library facilities + 'cassert', + 'ccomplex', + 'cctype', + 'cerrno', + 'cfenv', + 'cfloat', + 'cinttypes', + 'ciso646', + 'climits', + 
'clocale', + 'cmath', + 'csetjmp', + 'csignal', + 'cstdalign', + 'cstdarg', + 'cstdbool', + 'cstddef', + 'cstdint', + 'cstdio', + 'cstdlib', + 'cstring', + 'ctgmath', + 'ctime', + 'cuchar', + 'cwchar', + 'cwctype', + ]) + +# Assertion macros. These are defined in base/logging.h and +# testing/base/gunit.h. Note that the _M versions need to come first +# for substring matching to work. +_CHECK_MACROS = [ + 'DCHECK', 'CHECK', + 'EXPECT_TRUE_M', 'EXPECT_TRUE', + 'ASSERT_TRUE_M', 'ASSERT_TRUE', + 'EXPECT_FALSE_M', 'EXPECT_FALSE', + 'ASSERT_FALSE_M', 'ASSERT_FALSE', + ] + +# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE +_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) + +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), + ('>=', 'GE'), ('>', 'GT'), + ('<=', 'LE'), ('<', 'LT')]: + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), + ('>=', 'LT'), ('>', 'LE'), + ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + +# Alternative tokens and their replacements. For full list, see section 2.5 +# Alternative tokens [lex.digraph] in the C++ standard. +# +# Digraphs (such as '%:') are not included here since it's a mess to +# match those on a word boundary. 
+_ALT_TOKEN_REPLACEMENT = { + 'and': '&&', + 'bitor': '|', + 'or': '||', + 'xor': '^', + 'compl': '~', + 'bitand': '&', + 'and_eq': '&=', + 'or_eq': '|=', + 'xor_eq': '^=', + 'not': '!', + 'not_eq': '!=' + } + +# Compile regular expression that matches all the above keywords. The "[ =()]" +# bit is meant to avoid matching these keywords outside of boolean expressions. +# +# False positives include C-style multi-line comments and multi-line strings +# but those have always been troublesome for cpplint. +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( + r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') + + +# These constants define types of headers for use with +# _IncludeState.CheckNextIncludeOrder(). +_C_SYS_HEADER = 1 +_CPP_SYS_HEADER = 2 +_LIKELY_MY_HEADER = 3 +_POSSIBLE_MY_HEADER = 4 +_OTHER_HEADER = 5 + +# These constants define the current inline assembly state +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block + +# Match start of assembly blocks +_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' + r'(?:\s+(volatile|__volatile__))?' + r'\s*[{(]') + + +_regexp_compile_cache = {} + +# Finds occurrences of NOLINT or NOLINT(...). +_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') + +# {str, set(int)}: a map from error categories to sets of linenumbers +# on which those errors are expected and should be suppressed. +_error_suppressions = {} + +# The root directory used for deriving header guard CPP variable. +# This is set by --root flag. +_root = None + +# The allowed line length of files. +# This is set by --linelength flag. +_line_length = 80 + +# The allowed extensions for file names +# This is set by --extensions flag. 
+_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) + +def ParseNolintSuppressions(filename, raw_line, linenum, error): + """Updates the global list of error-suppressions. + + Parses any NOLINT comments on the current line, updating the global + error_suppressions store. Reports an error if the NOLINT comment + was malformed. + + Args: + filename: str, the name of the input file. + raw_line: str, the line of input text, with comments. + linenum: int, the number of the current line. + error: function, an error handler. + """ + # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). + matched = _RE_SUPPRESSION.search(raw_line) + if matched: + category = matched.group(1) + if category in (None, '(*)'): # => "suppress all" + _error_suppressions.setdefault(None, set()).add(linenum) + else: + if category.startswith('(') and category.endswith(')'): + category = category[1:-1] + if category in _ERROR_CATEGORIES: + _error_suppressions.setdefault(category, set()).add(linenum) + else: + error(filename, linenum, 'readability/nolint', 5, + 'Unknown NOLINT error category: %s' % category) + + +def ResetNolintSuppressions(): + "Resets the set of NOLINT suppressions to empty." + _error_suppressions.clear() + + +def IsErrorSuppressedByNolint(category, linenum): + """Returns true if the specified error category is suppressed on this line. + + Consults the global error_suppressions map populated by + ParseNolintSuppressions/ResetNolintSuppressions. + + Args: + category: str, the category of the error. + linenum: int, the current line number. + Returns: + bool, True iff the error should be suppressed due to a NOLINT comment. 
+ """ + return (linenum in _error_suppressions.get(category, set()) or + linenum in _error_suppressions.get(None, set())) + +def Match(pattern, s): + """Matches the string with the pattern, caching the compiled regexp.""" + # The regexp compilation caching is inlined in both Match and Search for + # performance reasons; factoring it out into a separate function turns out + # to be noticeably expensive. + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].match(s) + + +def ReplaceAll(pattern, rep, s): + """Replaces instances of pattern in a string with a replacement. + + The compiled regex is kept in a cache shared by Match and Search. + + Args: + pattern: regex pattern + rep: replacement text + s: search string + + Returns: + string with replacements made (or original string if no replacements) + """ + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].sub(rep, s) + + +def Search(pattern, s): + """Searches the string for the pattern, caching the compiled regexp.""" + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].search(s) + + +class _IncludeState(dict): + """Tracks line numbers for includes, and the order in which includes appear. + + As a dict, an _IncludeState object serves as a mapping between include + filename and line number on which that file was included. + + Call CheckNextIncludeOrder() once for each header in the file, passing + in the type constants defined above. Calls in an illegal order will + raise an _IncludeError with an appropriate error message. + + """ + # self._section will move monotonically through this set. If it ever + # needs to move backwards, CheckNextIncludeOrder will raise an error. 
+ _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + dict.__init__(self) + self.ResetSection() + + def ResetSection(self): + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' + + def SetLastHeader(self, header_path): + self._last_header = header_path + + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. + + - replaces "-" with "_" so they both cmp the same. + - removes '-inl' since we don't require them to be after the main header. + - lowercase everything, just in case. + + Args: + header_path: Path to be canonicalized. + + Returns: + Canonicalized path. + """ + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + + def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): + """Check if a header is in alphabetical order with the previous header. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + header_path: Canonicalized header to be checked. + + Returns: + Returns true if the header is in alphabetical order. + """ + # If previous section is different from current section, _last_header will + # be reset to empty string, so it's always less than current header. + # + # If previous line was a blank line, assume that the headers are + # intentionally sorted the way they are. 
+ if (self._last_header > header_path and + not Match(r'^\s*$', clean_lines.elided[linenum - 1])): + return False + return True + + def CheckNextIncludeOrder(self, header_type): + """Returns a non-empty error message if the next header is out of order. + + This function also updates the internal state to be ready to check + the next include. + + Args: + header_type: One of the _XXX_HEADER constants defined above. + + Returns: + The empty string if the header is in the right order, or an + error message describing what's wrong. + + """ + error_message = ('Found %s after %s' % + (self._TYPE_NAMES[header_type], + self._SECTION_NAMES[self._section])) + + last_section = self._section + + if header_type == _C_SYS_HEADER: + if self._section <= self._C_SECTION: + self._section = self._C_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _CPP_SYS_HEADER: + if self._section <= self._CPP_SECTION: + self._section = self._CPP_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _LIKELY_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + self._section = self._OTHER_H_SECTION + elif header_type == _POSSIBLE_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + # This will always be the fallback because we're not sure + # enough that the header is associated with this file. + self._section = self._OTHER_H_SECTION + else: + assert header_type == _OTHER_HEADER + self._section = self._OTHER_H_SECTION + + if last_section != self._section: + self._last_header = '' + + return '' + + +class _CppLintState(object): + """Maintains module-wide state..""" + + def __init__(self): + self.verbose_level = 1 # global setting. + self.error_count = 0 # global count of reported errors + # filters to apply when emitting error messages + self.filters = _DEFAULT_FILTERS[:] + self.counting = 'total' # In what way are we counting errors? 
+ self.errors_by_category = {} # string to int dict storing error counts + + # output format: + # "emacs" - format that emacs can parse (default) + # "vs7" - format that Microsoft Visual Studio 7 can parse + self.output_format = 'emacs' + + def SetOutputFormat(self, output_format): + """Sets the output format for errors.""" + self.output_format = output_format + + def SetVerboseLevel(self, level): + """Sets the module's verbosity, and returns the previous setting.""" + last_verbose_level = self.verbose_level + self.verbose_level = level + return last_verbose_level + + def SetCountingStyle(self, counting_style): + """Sets the module's counting options.""" + self.counting = counting_style + + def SetFilters(self, filters): + """Sets the error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "+whitespace/indent"). + Each filter should start with + or -; else we die. + + Raises: + ValueError: The comma-separated filters did not all start with '+' or '-'. + E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" + """ + # Default filters always have less priority than the flag ones. 
+ self.filters = _DEFAULT_FILTERS[:] + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError('Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stderr.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stderr.write('Total errors found: %d\n' % self.error_count) + +_cpplint_state = _CppLintState() + + +def _OutputFormat(): + """Gets the module's output format.""" + return _cpplint_state.output_format + + +def _SetOutputFormat(output_format): + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) + + +def _VerboseLevel(): + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level + + +def _SetVerboseLevel(level): + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) + + +def _SetCountingStyle(level): + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) + + +def _Filters(): + """Returns the module's list of output filters, as a list.""" + return _cpplint_state.filters + + +def _SetFilters(filters): + """Sets the module's error-message filters. 
+ + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.SetFilters(filters) + + +class _FunctionState(object): + """Tracks current function name and the number of lines in its body.""" + + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' + + def Begin(self, function_name): + """Start analyzing function body. + + Args: + function_name: The name of the function being tracked. + """ + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name + + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 + + def Check(self, error, filename, linenum): + """Report if too many lines in function body. + + Args: + error: The function to call with any errors found. + filename: The name of the current file. + linenum: The number of the line to check. + """ + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int(math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' 
% ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False + + +class _IncludeError(Exception): + """Indicates a problem with the include order in a file.""" + pass + + +class FileInfo: + """Provides utility functions for filenames. + + FileInfo provides easy access to the components of a file's path + relative to the project root. + """ + + def __init__(self, filename): + self._filename = filename + + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') + + def RepositoryName(self): + """FullName after removing the local path to the repository. + + If we have a real absolute path name here we can try to do something smart: + detecting the root of the checkout and truncating /path/to/checkout from + the name so that we get header guards that don't include things like + "C:\Documents and Settings\..." or "/home/username/..." in them and thus + people on different computers who have checked the source out to different + locations won't see bogus errors. + """ + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. 
+ root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... + return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. + + For 'chrome/browser/browser.cc', Split() would + return ('chrome/browser', 'browser', '.cc') + + Returns: + A tuple of (directory, basename, extension). + """ + + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project,) + os.path.splitext(rest) + + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] + + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] + + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) + + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + + +def _ShouldPrintError(category, confidence, linenum): + """If confidence >= verbose, category passes filter and is not suppressed.""" + + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. 
+ if IsErrorSuppressedByNolint(category, linenum): + return False + if confidence < _cpplint_state.verbose_level: + return False + + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False + + return True + + +def Error(filename, linenum, category, confidence, message): + """Logs the fact we've found a lint error. + + We log where the error was found, and also our confidence in the error, + that is, how certain we are this is a legitimate style regression, and + not a misidentification or a use that's sometimes justified. + + False positives can be suppressed by the use of + "cpplint(category)" comments on the offending line. These are + parsed into _error_suppressions. + + Args: + filename: The name of the file containing the error. + linenum: The number of the line containing the error. + category: A string used to describe the "category" this bug + falls under: "whitespace", say, or "runtime". Categories + may have a hierarchy separated by slashes: "whitespace/indent". + confidence: A number from 1-5 representing a confidence score for + the error, with 5 meaning that we are certain of the problem, + and 1 meaning that it could be a legitimate construct. + message: The error message. 
+ """ + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + elif _cpplint_state.output_format == 'eclipse': + sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + + +# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. +_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( + r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') +# Matches strings. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') +# Matches characters. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") +# Matches multi-line C++ comments. +# This RE is a little bit more complicated than one might expect, because we +# have to take care of space removals tools so we can handle comments inside +# statements better. +# The current rule is: We only clear spaces from both sides when we're at the +# end of the line. Otherwise, we try to remove spaces from the right side, +# if this doesn't work we try on left side but only if there's a non-character +# on the right. +_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( + r"""(\s*/\*.*\*/\s*$| + /\*.*\*/\s+| + \s+/\*.*\*/(?=\W)| + /\*.*\*/)""", re.VERBOSE) + + +def IsCppString(line): + """Does line terminate so, that the next symbol is in string constant. + + This function does not consider single-line nor multi-line comments. + + Args: + line: is a partial line of code starting from the 0..n. + + Returns: + True, if next character appended to 'line' is inside a + string constant. 
+ """ + + line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" + return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 + + +def CleanseRawStrings(raw_lines): + """Removes C++11 raw strings from lines. + + Before: + static const char kData[] = R"( + multi-line string + )"; + + After: + static const char kData[] = "" + (replaced by blank line) + ""; + + Args: + raw_lines: list of raw lines. + + Returns: + list of lines with C++11 raw strings replaced by empty strings. + """ + + delimiter = None + lines_without_raw_strings = [] + for line in raw_lines: + if delimiter: + # Inside a raw string, look for the end + end = line.find(delimiter) + if end >= 0: + # Found the end of the string, match leading space for this + # line and resume copying the original lines, and also insert + # a "" on the last line. + leading_space = Match(r'^(\s*)\S', line) + line = leading_space.group(1) + '""' + line[end + len(delimiter):] + delimiter = None + else: + # Haven't found the end yet, append a blank line. + line = '' + + else: + # Look for beginning of a raw string. + # See 2.14.15 [lex.string] for syntax. + matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) + if matched: + delimiter = ')' + matched.group(2) + '"' + + end = matched.group(3).find(delimiter) + if end >= 0: + # Raw string ended on same line + line = (matched.group(1) + '""' + + matched.group(3)[end + len(delimiter):]) + delimiter = None + else: + # Start of a multi-line raw string + line = matched.group(1) + '""' + + lines_without_raw_strings.append(line) + + # TODO(unknown): if delimiter is not None here, we might want to + # emit a warning for unterminated string. 
+ return lines_without_raw_strings + + +def FindNextMultiLineCommentStart(lines, lineix): + """Find the beginning marker for a multiline comment.""" + while lineix < len(lines): + if lines[lineix].strip().startswith('/*'): + # Only return this marker if the comment goes beyond this line + if lines[lineix].strip().find('*/', 2) < 0: + return lineix + lineix += 1 + return len(lines) + + +def FindNextMultiLineCommentEnd(lines, lineix): + """We are inside a comment, find the end marker.""" + while lineix < len(lines): + if lines[lineix].strip().endswith('*/'): + return lineix + lineix += 1 + return len(lines) + + +def RemoveMultiLineCommentsFromRange(lines, begin, end): + """Clears a range of lines for multi-line comments.""" + # Having // dummy comments makes the lines non-empty, so we will not get + # unnecessary blank line warnings later in the code. + for i in range(begin, end): + lines[i] = '// dummy' + + +def RemoveMultiLineComments(filename, lines, error): + """Removes multiline (c-style) comments from lines.""" + lineix = 0 + while lineix < len(lines): + lineix_begin = FindNextMultiLineCommentStart(lines, lineix) + if lineix_begin >= len(lines): + return + lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) + if lineix_end >= len(lines): + error(filename, lineix_begin + 1, 'readability/multiline_comment', 5, + 'Could not find end of multi-line comment') + return + RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) + lineix = lineix_end + 1 + + +def CleanseComments(line): + """Removes //-comments and single-line C-style /* */ comments. + + Args: + line: A line of C++ source. + + Returns: + The line with single-line comments removed. + """ + commentpos = line.find('//') + if commentpos != -1 and not IsCppString(line[:commentpos]): + line = line[:commentpos].rstrip() + # get rid of /* ... 
*/ + return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) + + +class CleansedLines(object): + """Holds 3 copies of all lines with different preprocessing applied to them. + + 1) elided member contains lines without strings and comments, + 2) lines member contains lines without comments, and + 3) raw_lines member contains all the lines without processing. + All these three members are of , and of the same length. + """ + + def __init__(self, lines): + self.elided = [] + self.lines = [] + self.raw_lines = lines + self.num_lines = len(lines) + self.lines_without_raw_strings = CleanseRawStrings(lines) + for linenum in range(len(self.lines_without_raw_strings)): + self.lines.append(CleanseComments( + self.lines_without_raw_strings[linenum])) + elided = self._CollapseStrings(self.lines_without_raw_strings[linenum]) + self.elided.append(CleanseComments(elided)) + + def NumLines(self): + """Returns the number of lines represented.""" + return self.num_lines + + @staticmethod + def _CollapseStrings(elided): + """Collapses strings and chars on a line to simple "" or '' blocks. + + We nix strings first so we're not fooled by text like '"http://"' + + Args: + elided: The line being processed. + + Returns: + The line with collapsed strings. + """ + if not _RE_PATTERN_INCLUDE.match(elided): + # Remove escaped characters first to make quote/single quote collapsing + # basic. Things that look like escaped characters shouldn't occur + # outside of strings and chars. + elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) + elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided) + elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided) + return elided + + +def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar): + """Find the position just after the matching endchar. + + Args: + line: a CleansedLines line. + startpos: start searching at this position. + depth: nesting level at startpos. + startchar: expression opening character. 
+ endchar: expression closing character. + + Returns: + On finding matching endchar: (index just after matching endchar, 0) + Otherwise: (-1, new depth at end of this line) + """ + for i in xrange(startpos, len(line)): + if line[i] == startchar: + depth += 1 + elif line[i] == endchar: + depth -= 1 + if depth == 0: + return (i + 1, 0) + return (-1, depth) + + +def CloseExpression(clean_lines, linenum, pos): + """If input points to ( or { or [ or <, finds the position that closes it. + + If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the + linenum/pos that correspond to the closing of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *past* the closing brace, or + (line, len(lines), -1) if we never find a close. Note we ignore + strings and comments when matching; and the line we return is the + 'cleansed' line at linenum. + """ + + line = clean_lines.elided[linenum] + startchar = line[pos] + if startchar not in '({[<': + return (line, clean_lines.NumLines(), -1) + if startchar == '(': endchar = ')' + if startchar == '[': endchar = ']' + if startchar == '{': endchar = '}' + if startchar == '<': endchar = '>' + + # Check first line + (end_pos, num_open) = FindEndOfExpressionInLine( + line, pos, 0, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Continue scanning forward + while linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + (end_pos, num_open) = FindEndOfExpressionInLine( + line, 0, num_open, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Did not find endchar before end of file, give up + return (line, clean_lines.NumLines(), -1) + + +def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): + """Find position at the matching startchar. 
+ + This is almost the reverse of FindEndOfExpressionInLine, but note + that the input position and returned position differs by 1. + + Args: + line: a CleansedLines line. + endpos: start searching at this position. + depth: nesting level at endpos. + startchar: expression opening character. + endchar: expression closing character. + + Returns: + On finding matching startchar: (index at matching startchar, 0) + Otherwise: (-1, new depth at beginning of this line) + """ + for i in xrange(endpos, -1, -1): + if line[i] == endchar: + depth += 1 + elif line[i] == startchar: + depth -= 1 + if depth == 0: + return (i, 0) + return (-1, depth) + + +def ReverseCloseExpression(clean_lines, linenum, pos): + """If input points to ) or } or ] or >, finds the position that opens it. + + If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the + linenum/pos that correspond to the opening of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *at* the opening brace, or + (line, 0, -1) if we never find the matching opening brace. Note + we ignore strings and comments when matching; and the line we + return is the 'cleansed' line at linenum. 
+ """ + line = clean_lines.elided[linenum] + endchar = line[pos] + if endchar not in ')}]>': + return (line, 0, -1) + if endchar == ')': startchar = '(' + if endchar == ']': startchar = '[' + if endchar == '}': startchar = '{' + if endchar == '>': startchar = '<' + + # Check last line + (start_pos, num_open) = FindStartOfExpressionInLine( + line, pos, 0, startchar, endchar) + if start_pos > -1: + return (line, linenum, start_pos) + + # Continue scanning backward + while linenum > 0: + linenum -= 1 + line = clean_lines.elided[linenum] + (start_pos, num_open) = FindStartOfExpressionInLine( + line, len(line) - 1, num_open, startchar, endchar) + if start_pos > -1: + return (line, linenum, start_pos) + + # Did not find startchar before beginning of file, give up + return (line, 0, -1) + + +def CheckForCopyright(filename, lines, error): + """Logs an error if no Copyright message appears at the top of the file.""" + + # We'll say it should occur by line 10. Don't forget there's a + # dummy line at the front. + for line in xrange(1, min(len(lines), 11)): + if re.search(r'Copyright', lines[line], re.I): break + else: # means no copyright line was found + error(filename, 0, 'legal/copyright', 5, + 'No copyright message found. ' + 'You should have a line: "Copyright [year] "') + + +def GetHeaderGuardCPPVariable(filename): + """Returns the CPP variable that should be used as a header guard. + + Args: + filename: The name of a C++ header file. + + Returns: + The CPP variable that should be used as a header guard in the + named file. + + """ + + # Restores original filename in case that cpplint is invoked from Emacs's + # flymake. 
+ filename = re.sub(r'_flymake\.h$', '.h', filename) + filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename) + + fileinfo = FileInfo(filename) + file_path_from_root = fileinfo.RepositoryName() + if _root: + file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) + return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' + + +def CheckForHeaderGuard(filename, lines, error): + """Checks that the file contains a header guard. + + Logs an error if no #ifndef header guard is present. For other + headers, checks that the full pathname is used. + + Args: + filename: The name of the C++ header file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + cppvar = GetHeaderGuardCPPVariable(filename) + + ifndef = None + ifndef_linenum = 0 + define = None + endif = None + endif_linenum = 0 + for linenum, line in enumerate(lines): + # Already been well guarded, no need for further checking. + if line.strip() == "#pragma once": + return + linesplit = line.split() + if len(linesplit) >= 2: + # find the first occurrence of #ifndef and #define, save arg + if not ifndef and linesplit[0] == '#ifndef': + # set ifndef to the header guard presented on the #ifndef line. + ifndef = linesplit[1] + ifndef_linenum = linenum + if not define and linesplit[0] == '#define': + define = linesplit[1] + # find the last occurrence of #endif, save entire line + if line.startswith('#endif'): + endif = line + endif_linenum = linenum + + if not ifndef: + error(filename, 0, 'build/header_guard', 5, + 'No #ifndef header guard found, suggested CPP variable is: %s' % + cppvar) + return + + if not define: + error(filename, 0, 'build/header_guard', 5, + 'No #define header guard found, suggested CPP variable is: %s' % + cppvar) + return + + # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ + # for backward compatibility. 
+ if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + if define != ifndef: + error(filename, 0, 'build/header_guard', 5, + '#ifndef and #define don\'t match, suggested CPP variable is: %s' % + cppvar) + return + + if endif != ('#endif // %s' % cppvar): + error_level = 0 + if endif != ('#endif // %s' % (cppvar + '_')): + error_level = 5 + + ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, + error) + error(filename, endif_linenum, 'build/header_guard', error_level, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckForBadCharacters(filename, lines, error): + """Logs an error for each line containing bad characters. + + Two kinds of bad characters: + + 1. Unicode replacement characters: These indicate that either the file + contained invalid UTF-8 (likely) or Unicode replacement characters (which + it shouldn't). Note that it's possible for this to throw off line + numbering if the invalid UTF-8 occurred adjacent to a newline. + + 2. NUL bytes. These are problematic for some tools. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error(filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).') + if '\0' in line: + error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') + + +def CheckForNewlineAtEOF(filename, lines, error): + """Logs an error if there is no newline char at the end of the file. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. 
+ error: The function to call with any errors found. + """ + + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') + + +def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): + """Logs an error if we see /* ... */ or "..." that extend past one line. + + /* ... */ comments are legit inside macros, for one line. + Otherwise, we prefer // comments, so it's ok to warn about the + other. Likewise, it's ok for strings to extend across multiple + lines, as long as a line continuation character (backslash) + terminates each line. Although not currently prohibited by the C++ + style guide, it's ugly and unnecessary. We don't do well with either + in this lint program, so we warn about both. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Remove all \\ (escaped backslashes) from the line. They are OK, and the + # second (escaped) slash may trigger later \" detection erroneously. + line = line.replace('\\\\', '') + + if line.count('/*') > line.count('*/'): + error(filename, linenum, 'readability/multiline_comment', 5, + 'Complex multi-line /*...*/-style comment found. ' + 'Lint may give bogus warnings. ' + 'Consider replacing these with //-style comments, ' + 'with #if 0...#endif, ' + 'or with more clearly structured multi-line comments.') + + if (line.count('"') - line.count('\\"')) % 2: + error(filename, linenum, 'readability/multiline_string', 5, + 'Multi-line string ("...") found. 
This lint script doesn\'t ' + 'do well with such strings, and may give bogus warnings. ' + 'Use C++11 raw strings or concatenation instead.') + + +threading_list = ( + ('asctime(', 'asctime_r('), + ('ctime(', 'ctime_r('), + ('getgrgid(', 'getgrgid_r('), + ('getgrnam(', 'getgrnam_r('), + ('getlogin(', 'getlogin_r('), + ('getpwnam(', 'getpwnam_r('), + ('getpwuid(', 'getpwuid_r('), + ('gmtime(', 'gmtime_r('), + ('localtime(', 'localtime_r('), + ('rand(', 'rand_r('), + ('strtok(', 'strtok_r('), + ('ttyname(', 'ttyname_r('), + ) + + +def CheckPosixThreading(filename, clean_lines, linenum, error): + """Checks for calls to thread-unsafe functions. + + Much code has been originally written without consideration of + multi-threading. Also, engineers are relying on their old experience; + they have learned posix before threading extensions were added. These + tests guide the engineers to use thread-safe functions (when using + posix directly). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + for single_thread_function, multithread_safe_function in threading_list: + ix = line.find(single_thread_function) + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and + line[ix - 1] not in ('_', '.', '>'))): + error(filename, linenum, 'runtime/threadsafe_fn', 2, + 'Consider using ' + multithread_safe_function + + '...) instead of ' + single_thread_function + + '...) for improved thread safety.') + + +def CheckVlogArguments(filename, clean_lines, linenum, error): + """Checks that VLOG() is only used for defining a logging level. + + For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and + VLOG(FATAL) are not. + + Args: + filename: The name of the current file. 
+ clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): + error(filename, linenum, 'runtime/vlog', 5, + 'VLOG() should be used with numeric verbosity level. ' + 'Use LOG() if you want symbolic severity levels.') + + +# Matches invalid increment: *count++, which moves pointer instead of +# incrementing a value. +_RE_PATTERN_INVALID_INCREMENT = re.compile( + r'^\s*\*\w+(\+\+|--);') + + +def CheckInvalidIncrement(filename, clean_lines, linenum, error): + """Checks for invalid increment *count++. + + For example following function: + void increment_counter(int* count) { + *count++; + } + is invalid, because it effectively does count++, moving pointer, and should + be replaced with ++*count, (*count)++ or *count += 1. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + if _RE_PATTERN_INVALID_INCREMENT.match(line): + error(filename, linenum, 'runtime/invalid_increment', 5, + 'Changing pointer instead of value (or unused value of operator*).') + + +class _BlockInfo(object): + """Stores information about a generic block of code.""" + + def __init__(self, seen_open_brace): + self.seen_open_brace = seen_open_brace + self.open_parentheses = 0 + self.inline_asm = _NO_ASM + + def CheckBegin(self, filename, clean_lines, linenum, error): + """Run checks that applies to text up to the opening brace. + + This is mostly for checking the text after the class identifier + and the "{", usually where the base class is specified. For other + blocks, there isn't much to check, so we always pass. + + Args: + filename: The name of the current file. 
+ clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + pass + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Run checks that applies to text after the closing brace. + + This is mostly used for checking end of namespace comments. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + pass + + +class _ClassInfo(_BlockInfo): + """Stores information about a class.""" + + def __init__(self, name, class_or_struct, clean_lines, linenum): + _BlockInfo.__init__(self, False) + self.name = name + self.starting_linenum = linenum + self.is_derived = False + if class_or_struct == 'struct': + self.access = 'public' + self.is_struct = True + else: + self.access = 'private' + self.is_struct = False + + # Remember initial indentation level for this class. Using raw_lines here + # instead of elided to account for leading comments. + initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum]) + if initial_indent: + self.class_indent = len(initial_indent.group(1)) + else: + self.class_indent = 0 + + # Try to find the end of the class. This will be confused by things like: + # class A { + # } *x = { ... + # + # But it's still good enough for CheckSectionSpacing. + self.last_line = 0 + depth = 0 + for i in range(linenum, clean_lines.NumLines()): + line = clean_lines.elided[i] + depth += line.count('{') - line.count('}') + if not depth: + self.last_line = i + break + + def CheckBegin(self, filename, clean_lines, linenum, error): + # Look for a bare ':' + if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): + self.is_derived = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + # Check that closing brace is aligned with beginning of the class. 
+ # Only do this if the closing brace is indented by only whitespaces. + # This means we will not check single-line class definitions. + indent = Match(r'^( *)\}', clean_lines.elided[linenum]) + if indent and len(indent.group(1)) != self.class_indent: + if self.is_struct: + parent = 'struct ' + self.name + else: + parent = 'class ' + self.name + error(filename, linenum, 'whitespace/indent', 3, + 'Closing brace should be aligned with beginning of %s' % parent) + + +class _NamespaceInfo(_BlockInfo): + """Stores information about a namespace.""" + + def __init__(self, name, linenum): + _BlockInfo.__init__(self, False) + self.name = name or '' + self.starting_linenum = linenum + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Check end of namespace comments.""" + line = clean_lines.raw_lines[linenum] + + # Check how many lines is enclosed in this namespace. Don't issue + # warning for missing namespace comments if there aren't enough + # lines. However, do apply checks if there is already an end of + # namespace comment and it's incorrect. + # + # TODO(unknown): We always want to check end of namespace comments + # if a namespace is large, but sometimes we also want to apply the + # check if a short namespace contained nontrivial things (something + # other than forward declarations). There is currently no logic on + # deciding what these nontrivial things are, so this check is + # triggered by namespace size only, which works most of the time. + if (linenum - self.starting_linenum < 10 + and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + return + + # Look for matching comment at end of namespace. + # + # Note that we accept C style "/* */" comments for terminating + # namespaces, so that code that terminate namespaces inside + # preprocessor macros can be cpplint clean. + # + # We also accept stuff like "// end of namespace ." with the + # period at the end. 
+ # + # Besides these, we don't accept anything else, otherwise we might + # get false negatives when existing comment is a substring of the + # expected namespace. + if self.name: + # Named namespace + if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + + r'[\*/\.\\\s]*$'), + line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace %s"' % + self.name) + else: + # Anonymous namespace + if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace"') + + +class _PreprocessorInfo(object): + """Stores checkpoints of nesting stacks when #if/#else is seen.""" + + def __init__(self, stack_before_if): + # The entire nesting stack before #if + self.stack_before_if = stack_before_if + + # The entire nesting stack up to #else + self.stack_before_else = [] + + # Whether we have already seen #else or #elif + self.seen_else = False + + +class _NestingState(object): + """Holds states related to parsing braces.""" + + def __init__(self): + # Stack for tracking all braces. An object is pushed whenever we + # see a "{", and popped when we see a "}". Only 3 types of + # objects are possible: + # - _ClassInfo: a class or struct. + # - _NamespaceInfo: a namespace. + # - _BlockInfo: some other type of block. + self.stack = [] + + # Stack of _PreprocessorInfo objects. + self.pp_stack = [] + + def SeenOpenBrace(self): + """Check if we have seen the opening brace for the innermost block. + + Returns: + True if we have seen the opening brace, False if the innermost + block is still expecting an opening brace. + """ + return (not self.stack) or self.stack[-1].seen_open_brace + + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. + + Returns: + True if top of the stack is a namespace block, False otherwise. 
+ """ + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + + def UpdatePreprocessor(self, line): + """Update preprocessor stack. + + We need to handle preprocessors due to classes like this: + #ifdef SWIG + struct ResultDetailsPageElementExtensionPoint { + #else + struct ResultDetailsPageElementExtensionPoint : public Extension { + #endif + + We make the following assumptions (good enough for most files): + - Preprocessor condition evaluates to true from #if up to first + #else/#elif/#endif. + + - Preprocessor condition evaluates to false from #else/#elif up + to #endif. We still perform lint checks on these lines, but + these do not affect nesting stack. + + Args: + line: current line to check. + """ + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. + self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. + if self.pp_stack: + # If we saw an #else, we will need to restore the nesting + # stack to its former state before the #else, otherwise we + # will just continue from where we left off. + if self.pp_stack[-1].seen_else: + # Here we can just use a shallow copy since we are the last + # reference to it. 
+ self.stack = self.pp_stack[-1].stack_before_else + # Drop the corresponding #if + self.pp_stack.pop() + else: + # TODO(unknown): unexpected #endif, issue warning? + pass + + def Update(self, filename, clean_lines, linenum, error): + """Update nesting state with current line. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Update pp_stack first + self.UpdatePreprocessor(line) + + # Count parentheses. This is to avoid adding struct arguments to + # the nesting stack. + if self.stack: + inner_block = self.stack[-1] + depth_change = line.count('(') - line.count(')') + inner_block.open_parentheses += depth_change + + # Also check if we are starting or ending an inline assembly block. + if inner_block.inline_asm in (_NO_ASM, _END_ASM): + if (depth_change != 0 and + inner_block.open_parentheses == 1 and + _MATCH_ASM.match(line)): + # Enter assembly block + inner_block.inline_asm = _INSIDE_ASM + else: + # Not entering assembly block. If previous line was _END_ASM, + # we will now shift to _NO_ASM state. + inner_block.inline_asm = _NO_ASM + elif (inner_block.inline_asm == _INSIDE_ASM and + inner_block.open_parentheses == 0): + # Exit assembly block + inner_block.inline_asm = _END_ASM + + # Consume namespace declaration at the beginning of the line. Do + # this in a loop so that we catch same line declarations like this: + # namespace proto2 { namespace bridge { class MessageSet; } } + while True: + # Match start of namespace. The "\b\s*" below catches namespace + # declarations even if it weren't followed by a whitespace, this + # is so that we don't confuse our namespace checker. The + # missing spaces will be flagged by CheckSpacing. 
+ namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line) + if not namespace_decl_match: + break + + new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum) + self.stack.append(new_namespace) + + line = namespace_decl_match.group(2) + if line.find('{') != -1: + new_namespace.seen_open_brace = True + line = line[line.find('{') + 1:] + + # Look for a class declaration in whatever is left of the line + # after parsing namespaces. The regexp accounts for decorated classes + # such as in: + # class LOCKABLE API Object { + # }; + # + # Templates with class arguments may confuse the parser, for example: + # template , + # class Vector = vector > + # class HeapQueue { + # + # Because this parser has no nesting state about templates, by the + # time it saw "class Comparator", it may think that it's a new class. + # Nested templates have a similar problem: + # template < + # typename ExportedType, + # typename TupleType, + # template class ImplTemplate> + # + # To avoid these cases, we ignore classes that are followed by '=' or '>' + class_decl_match = Match( + r'\s*(template\s*<[\w\s<>,:]*>\s*)?' + r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)' + r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line) + if (class_decl_match and + (not self.stack or self.stack[-1].open_parentheses == 0)): + self.stack.append(_ClassInfo( + class_decl_match.group(4), class_decl_match.group(2), + clean_lines, linenum)) + line = class_decl_match.group(5) + + # If we have not yet seen the opening brace for the innermost block, + # run checks here. + if not self.SeenOpenBrace(): + self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) + + # Update access control if we are inside a class/struct + if self.stack and isinstance(self.stack[-1], _ClassInfo): + classinfo = self.stack[-1] + access_match = Match( + r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' 
+ r':(?:[^:]|$)', + line) + if access_match: + classinfo.access = access_match.group(2) + + # Check that access keywords are indented +1 space. Skip this + # check if the keywords are not preceded by whitespaces. + indent = access_match.group(1) + if (len(indent) != classinfo.class_indent + 1 and + Match(r'^\s*$', indent)): + if classinfo.is_struct: + parent = 'struct ' + classinfo.name + else: + parent = 'class ' + classinfo.name + slots = '' + if access_match.group(3): + slots = access_match.group(3) + error(filename, linenum, 'whitespace/indent', 3, + '%s%s: should be indented +1 space inside %s' % ( + access_match.group(2), slots, parent)) + + # Consume braces or semicolons from what's left of the line + while True: + # Match first brace, semicolon, or closed parenthesis. + matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) + if not matched: + break + + token = matched.group(1) + if token == '{': + # If namespace or class hasn't seen a opening brace yet, mark + # namespace/class head as complete. Push a new block onto the + # stack otherwise. + if not self.SeenOpenBrace(): + self.stack[-1].seen_open_brace = True + else: + self.stack.append(_BlockInfo(True)) + if _MATCH_ASM.match(line): + self.stack[-1].inline_asm = _BLOCK_ASM + elif token == ';' or token == ')': + # If we haven't seen an opening brace yet, but we already saw + # a semicolon, this is probably a forward declaration. Pop + # the stack for these. + # + # Similarly, if we haven't seen an opening brace yet, but we + # already saw a closing parenthesis, then these are probably + # function arguments with extra "class" or "struct" keywords. + # Also pop these stack for these. + if not self.SeenOpenBrace(): + self.stack.pop() + else: # token == '}' + # Perform end of block checks and pop the stack. + if self.stack: + self.stack[-1].CheckEnd(filename, clean_lines, linenum, error) + self.stack.pop() + line = matched.group(2) + + def InnermostClass(self): + """Get class info on the top of the stack. 
+ + Returns: + A _ClassInfo object if we are inside a class, or None otherwise. + """ + for i in range(len(self.stack), 0, -1): + classinfo = self.stack[i - 1] + if isinstance(classinfo, _ClassInfo): + return classinfo + return None + + def CheckCompletedBlocks(self, filename, error): + """Checks that all classes and namespaces have been completely parsed. + + Call this when all lines in a file have been processed. + Args: + filename: The name of the current file. + error: The function to call with any errors found. + """ + # Note: This test can result in false positives if #ifdef constructs + # get in the way of brace matching. See the testBuildClass test in + # cpplint_unittest.py for an example of this. + for obj in self.stack: + if isinstance(obj, _ClassInfo): + error(filename, obj.starting_linenum, 'build/class', 5, + 'Failed to find complete declaration of class %s' % + obj.name) + elif isinstance(obj, _NamespaceInfo): + error(filename, obj.starting_linenum, 'build/namespaces', 5, + 'Failed to find complete declaration of namespace %s' % + obj.name) + + +def CheckForNonStandardConstructs(filename, clean_lines, linenum, + nesting_state, error): + r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2. + + Complain about several constructs which gcc-2 accepts, but which are + not standard C++. Warning about these in lint is one way to ease the + transition to new compilers. + - put storage class first (e.g. "static const" instead of "const static"). + - "%lld" instead of %qd" in printf-type functions. + - "%1$d" is non-standard in printf-type functions. + - "\%" is an undefined character escape sequence. + - text after #endif is not allowed. + - invalid inner-style forward declaration. + - >? and ?= and )\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', + line): + error(filename, linenum, 'build/deprecated', 3, + '>? and ))?' 
+ # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' + error(filename, linenum, 'runtime/member_string_references', 2, + 'const string& members are dangerous. It is much better to use ' + 'alternatives, such as pointers or simple constants.') + + # Everything else in this function operates on class declarations. + # Return early if the top of the nesting stack is not a class, or if + # the class head is not completed yet. + classinfo = nesting_state.InnermostClass() + if not classinfo or not classinfo.seen_open_brace: + return + + # The class may have been declared with namespace or classname qualifiers. + # The constructor and destructor will not have those qualifiers. + base_classname = classinfo.name.split('::')[-1] + + # Look for single-argument constructors that aren't marked explicit. + # Technically a valid construct, but against style. + args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' + % re.escape(base_classname), + line) + if (args and + args.group(1) != 'void' and + not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&' + % re.escape(base_classname), args.group(1).strip())): + error(filename, linenum, 'runtime/explicit', 5, + 'Single-argument constructors should be marked explicit.') + + +def CheckSpacingForFunctionCall(filename, line, linenum, error): + """Checks for the correctness of various spacing around function calls. + + Args: + filename: The name of the current file. + line: The text of the line to check. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Since function calls often occur inside if/for/while/switch + # expressions - which have their own, more liberal conventions - we + # first see if we should be looking inside such an expression for a + # function call, to which we can apply more strict standards. 
+ fncall = line # if there's no control flow construct, look at whole line + for pattern in (r'\bif\s*\((.*)\)\s*{', + r'\bfor\s*\((.*)\)\s*{', + r'\bwhile\s*\((.*)\)\s*[{;]', + r'\bswitch\s*\((.*)\)\s*{'): + match = Search(pattern, line) + if match: + fncall = match.group(1) # look inside the parens for function calls + break + + # Except in if/for/while/switch, there should never be space + # immediately inside parens (eg "f( 3, 4 )"). We make an exception + # for nested parens ( (a+b) + c ). Likewise, there should never be + # a space before a ( when it's a function argument. I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. 
+ not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)): + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') + + +def IsBlankLine(line): + """Returns true if the given line is blank. + + We consider a line to be blank if the line is empty or consists of + only white spaces. + + Args: + line: A line of a string. + + Returns: + True, if the given line is blank. + """ + return not line or line.isspace() + + +def CheckForFunctionLengths(filename, clean_lines, linenum, + function_state, error): + """Reports for long function bodies. + + For an overview why this is done, see: + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + + Uses a simplistic algorithm assuming other style guidelines + (especially spacing) are followed. + Only checks unindented functions, so class members are unchecked. + Trivial bodies are unchecked, so constructors with huge initializer lists + may be missed. 
+ Blank/comment lines are not counted so as to avoid encouraging the removal + of vertical space and comments just to get through a lint check. + NOLINT *on the last line of a function* disables this check. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + function_state: Current function name and lines in body so far. + error: The function to call with any errors found. + """ + lines = clean_lines.lines + line = lines[linenum] + raw = clean_lines.raw_lines + raw_line = raw[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. + function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. 
+ error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. + + +_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') + + +def CheckComment(comment, filename, linenum, error): + """Checks for common mistakes in TODO comments. + + Args: + comment: The text of the comment from the line in question. + filename: The name of the current file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + +def CheckAccess(filename, clean_lines, linenum, nesting_state, error): + """Checks for improper use of DISALLOW* macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. 
+ """ + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_EVIL_CONSTRUCTORS|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. We could issue a warning here, but it + # probably resulted in a compiler error already. + pass + + +def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix): + """Find the corresponding > to close a template. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: Current line number. + init_suffix: Remainder of the current line after the initial <. + + Returns: + True if a matching bracket exists. + """ + line = init_suffix + nesting_stack = ['<'] + while True: + # Find the next operator that can tell us whether < is used as an + # opening bracket or as a less-than operator. We only want to + # warn on the latter case. + # + # We could also check all other operators and terminate the search + # early, e.g. if we got something like this "a(),;\[\]]*([<>(),;\[\]])(.*)$', line) + if match: + # Found an operator, update nesting stack + operator = match.group(1) + line = match.group(2) + + if nesting_stack[-1] == '<': + # Expecting closing angle bracket + if operator in ('<', '(', '['): + nesting_stack.append(operator) + elif operator == '>': + nesting_stack.pop() + if not nesting_stack: + # Found matching angle bracket + return True + elif operator == ',': + # Got a comma after a bracket, this is most likely a template + # argument. 
We have not seen a closing angle bracket yet, but + # it's probably a few lines later if we look for it, so just + # return early here. + return True + else: + # Got some other operator. + return False + + else: + # Expecting closing parenthesis or closing bracket + if operator in ('<', '(', '['): + nesting_stack.append(operator) + elif operator in (')', ']'): + # We don't bother checking for matching () or []. If we got + # something like (] or [), it would have been a syntax error. + nesting_stack.pop() + + else: + # Scan the next line + linenum += 1 + if linenum >= len(clean_lines.elided): + break + line = clean_lines.elided[linenum] + + # Exhausted all remaining lines and still no matching angle bracket. + # Most likely the input was incomplete, otherwise we should have + # seen a semicolon and returned early. + return True + + +def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix): + """Find the corresponding < that started a template. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: Current line number. + init_prefix: Part of the current line before the initial >. + + Returns: + True if a matching bracket exists. + """ + line = init_prefix + nesting_stack = ['>'] + while True: + # Find the previous operator + match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line) + if match: + # Found an operator, update nesting stack + operator = match.group(2) + line = match.group(1) + + if nesting_stack[-1] == '>': + # Expecting opening angle bracket + if operator in ('>', ')', ']'): + nesting_stack.append(operator) + elif operator == '<': + nesting_stack.pop() + if not nesting_stack: + # Found matching angle bracket + return True + elif operator == ',': + # Got a comma before a bracket, this is most likely a + # template argument. The opening angle bracket is probably + # there if we look for it, so just return early here. + return True + else: + # Got some other operator. 
+ return False + + else: + # Expecting opening parenthesis or opening bracket + if operator in ('>', ')', ']'): + nesting_stack.append(operator) + elif operator in ('(', '['): + nesting_stack.pop() + + else: + # Scan the previous line + linenum -= 1 + if linenum < 0: + break + line = clean_lines.elided[linenum] + + # Exhausted all earlier lines and still no matching angle bracket. + return False + + +def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): + """Checks for the correctness of various spacing issues in the code. + + Things we check for: spaces around operators, spaces after + if/for/while/switch, no spaces around parens in function calls, two + spaces between code and comment, don't start a block with a blank + line, don't end a function with a blank line, don't add a blank line + after public/protected/private, don't have too many blank lines in a row. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw = clean_lines.lines_without_raw_strings + line = raw[linenum] + + # Before nixing comments, check if the line is blank for no good + # reason. This includes the first line after a block is opened, and + # blank lines at the end of a function (ie, right before a line like '}' + # + # Skip all the blank line checks if we are immediately inside a + # namespace body. In other words, don't issue blank line warnings + # for this block: + # namespace { + # + # } + # + # A warning about missing end of namespace comments will be issued instead. 
+ if IsBlankLine(line) and not nesting_state.InNamespaceBody(): + elided = clean_lines.elided + prev_line = elided[linenum - 1] + prevbrace = prev_line.rfind('{') + # TODO(unknown): Don't complain if line before blank line, and line after, + # both start with alnums and are indented the same amount. + # This ignores whitespace at the start of a namespace block + # because those are not usually indented. + if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: + # OK, we have a blank line at the start of a code block. Before we + # complain, we check if it is an exception to the rule: The previous + # non-empty line has the parameters of a function header that are indented + # 4 spaces (because they did not fit in a 80 column line when placed on + # the same line as the function name). We also check for the case where + # the previous line is indented 6 spaces, which may happen when the + # initializers of a constructor do not fit into a 80 column line. + exception = False + if Match(r' {6}\w', prev_line): # Initializer list? + # We are looking for the opening column of initializer list, which + # should be indented 4 spaces to cause 6 space indentation afterwards. + search_position = linenum-2 + while (search_position >= 0 + and Match(r' {6}\w', elided[search_position])): + search_position -= 1 + exception = (search_position >= 0 + and elided[search_position][:5] == ' :') + else: + # Search for the function arguments or an initializer list. We use a + # simple heuristic here: If the line is indented 4 spaces; and we have a + # closing paren, without the opening paren, followed by an opening brace + # or colon (for initializer lists) we assume that it is the last line of + # a function header. If we have a colon indented 4 spaces, it is an + # initializer list. 
+ exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) + or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Redundant blank line at the start of a code block ' + 'should be deleted.') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line + and Match(r'\s*}', next_line) + and next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Redundant blank line at the end of a code block ' + 'should be deleted.') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, we complain if there's a comment too near the text + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if (line.count('"', 0, commentpos) - + line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not Match(r'^\s*{ //', line) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + # There should always be a space between the // and the comment + commentend = commentpos + 2 + if commentend < len(line) and not line[commentend] == ' ': + # but some lines are exceptions -- e.g. 
if they're big + # comment delimiters like: + # //---------------------------------------------------------- + # or are an empty C++ style Doxygen comment, like: + # /// + # or C++ style Doxygen comments placed after the variable: + # ///< Header comment + # //!< Header comment + # or they begin with multiple slashes followed by a space: + # //////// Header comment + match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or + Search(r'^/$', line[commentend:]) or + Search(r'^!< ', line[commentend:]) or + Search(r'^/< ', line[commentend:]) or + Search(r'^/+ ', line[commentend:])) + if not match: + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') + CheckComment(line[commentpos:], filename, linenum, error) + + line = clean_lines.elided[linenum] # get rid of comments and strings + + # Don't try to do spacing checks for operator methods + line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) + if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line): + error(filename, linenum, 'whitespace/operators', 4, + 'Missing spaces around =') + + # It's ok not to have spaces around binary operators like + - * /, but if + # there's too little whitespace, we get concerned. It's hard to tell, + # though, so we punt on this one for now. TODO. + + # You should always have whitespace around binary operators. + # + # Check <= and >= first to avoid false positives with < and >, then + # check non-include lines for spacing around < and >. 
+ match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line) + if match: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around %s' % match.group(1)) + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + # Also ignore using ns::operator<<; + match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line) + if (match and + not (match.group(1).isdigit() and match.group(2).isdigit()) and + not (match.group(1) == 'operator' and match.group(2) == ';')): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') + elif not Match(r'#.*include', line): + # Avoid false positives on -> + reduced_line = line.replace('->', '') + + # Look for < that is not surrounded by spaces. This is only + # triggered if both sides are missing spaces, even though + # technically should should flag if at least one side is missing a + # space. This is done to avoid some false positives with shifts. + match = Search(r'[^\s<]<([^\s=<].*)', reduced_line) + if (match and + not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') + + # Look for > that is not surrounded by spaces. Similar to the + # above, we only trigger if both sides are missing spaces to avoid + # false positives with shifts. + match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line) + if (match and + not FindPreviousMatchingAngleBracket(clean_lines, linenum, + match.group(1))): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around >> for almost anything. This is because + # C++11 allows ">>" to close nested templates, which accounts for + # most cases when ">>" is not followed by a space. 
+ # + # We still warn on ">>" followed by alpha character, because that is + # likely due to ">>" being used for right shifts, e.g.: + # value >> alpha + # + # When ">>" is used to close templates, the alphanumeric letter that + # follows would be part of an identifier, and there should still be + # a space separating the template type and the identifier. + # type> alpha + match = Search(r'>>[a-zA-Z_]', line) + if match: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >>') + + # There shouldn't be space around unary operators + match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) + if match: + error(filename, linenum, 'whitespace/operators', 4, + 'Extra space for operator %s' % match.group(1)) + + # A pet peeve of mine: no spaces after an if, while, switch, or for + match = Search(r' (if\(|for\(|while\(|switch\()', line) + if match: + error(filename, linenum, 'whitespace/parens', 5, + 'Missing space before ( in %s' % match.group(1)) + + # For if/for/while/switch, the left and right parens should be + # consistent about how many spaces are inside the parens, and + # there should either be zero or one spaces inside the parens. + # We don't want: "if ( foo)" or "if ( foo )". + # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. 
+ match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', + line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if len(match.group(2)) not in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) + + # You should always have a space after a comma (either as fn arg or operator) + # + # This does not apply when the non-space character following the + # comma is another comma, since the only time when that happens is + # for empty macro arguments. + # + # We run this check in two passes: first pass on elided lines to + # verify that lines contain missing whitespaces, second pass on raw + # lines to confirm that those missing whitespaces are not due to + # elided comments. + if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): + error(filename, linenum, 'whitespace/comma', 3, + 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') + + # Next we will look for issues with function calls. + CheckSpacingForFunctionCall(filename, line, linenum, error) + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + match = Match(r'^(.*[^ ({]){', line) + if match: + # Try a bit harder to check for brace initialization. 
This + # happens in one of the following forms: + # Constructor() : initializer_list_{} { ... } + # Constructor{}.MemberFunction() + # Type variable{}; + # FunctionCall(type{}, ...); + # LastArgument(..., type{}); + # LOG(INFO) << type{} << " ..."; + # map_of_type[{...}] = ...; + # + # We check for the character following the closing brace, and + # silence the warning if it's one of those listed above, i.e. + # "{.;,)<]". + # + # To account for nested initializer list, we allow any number of + # closing braces up to "{;,)<". We can't simply silence the + # warning on first sight of closing brace, because that would + # cause false negatives for things that are not initializer lists. + # Silence this: But not this: + # Outer{ if (...) { + # Inner{...} if (...){ // Missing space before { + # }; } + # + # There is a false negative with this approach if people inserted + # spurious semicolons, e.g. "if (cond){};", but we will catch the + # spurious semicolon with a separate check. + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + trailing_text = '' + if endpos > -1: + trailing_text = endline[endpos:] + for offset in xrange(endlinenum + 1, + min(endlinenum + 3, clean_lines.NumLines() - 1)): + trailing_text += clean_lines.elided[offset] + if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. + if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'new char * []'. + if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # You shouldn't have a space before a semicolon at the end of the line. 
+ # There's a special case for "for" since the style guide allows space before + # the semicolon there. + if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and + not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search('for *\(.*[^:]:[^: ]', line) or + Search('for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): + """Checks for additional blank line issues related to sections. + + Currently the only thing checked here is blank line before protected/private. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + class_info: A _ClassInfo objects. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. 
+ if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % matched.group(1)) + + +def GetPreviousNonBlankLine(clean_lines, linenum): + """Return the most recent non-blank line and its line number. + + Args: + clean_lines: A CleansedLines instance containing the file contents. + linenum: The number of the line to check. + + Returns: + A tuple with two elements. The first element is the contents of the last + non-blank line before the current line, or the empty string if this is the + first non-blank line. The second is the line number of that line, or -1 + if this is the first non-blank line. 
+ """ + + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) + + +def CheckBraces(filename, clean_lines, linenum, error): + """Looks for misplaced braces (e.g. at the end of line). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone is using + # braces in a block to explicitly create a new scope, which is commonly used + # to control the lifetime of stack-allocated variables. Braces are also + # used for brace initializers inside function calls. We don't detect this + # perfectly: we just don't complain if the last non-whitespace character on + # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the + # previous line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[,;:}{(]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. + if Match(r'\s*else\s*', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! 
+ if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + if endline[endpos:].find('{') == -1: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + else: # common case: else not followed by a multi-line if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Block bodies should not be followed by a semicolon. Due to C++11 + # brace initialization, there are more places where semicolons are + # required than not, so we use a whitelist approach to check these + # rather than a blacklist. These are the places where "};" should + # be replaced by just "}": + # 1. Some flavor of block following closing parenthesis: + # for (;;) {}; + # while (...) {}; + # switch (...) {}; + # Function(...) {}; + # if (...) {}; + # if (...) else if (...) {}; + # + # 2. else block: + # if (...) else {}; + # + # 3. const member function: + # Function(...) const {}; + # + # 4. Block following some statement: + # x = 42; + # {}; + # + # 5. Block at the beginning of a function: + # Function(...) 
{ + # {}; + # } + # + # Note that naively checking for the preceding "{" will also match + # braces inside multi-dimensional arrays, but this is fine since + # that expression will not contain semicolons. + # + # 6. Block following another block: + # while (true) {} + # {}; + # + # 7. End of namespaces: + # namespace {}; + # + # These semicolons seems far more common than other kinds of + # redundant semicolons, possibly due to people converting classes + # to namespaces. For now we do not warn for this case. + # + # Try matching case 1 first. + match = Match(r'^(.*\)\s*)\{', line) + if match: + # Matched closing parenthesis (case 1). Check the token before the + # matching opening parenthesis, and don't warn if it looks like a + # macro. This avoids these false positives: + # - macro that defines a base class + # - multi-line macro that defines a base class + # - macro that defines the whole class-head + # + # But we still issue warnings for macros that we know are safe to + # warn, specifically: + # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P + # - TYPED_TEST + # - INTERFACE_DEF + # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # + # We implement a whitelist of safe macros instead of a blacklist of + # unsafe macros, even though the latter appears less frequently in + # google code and would have been easier to implement. This is because + # the downside for getting the whitelist wrong means some extra + # semicolons, while the downside for getting the blacklist wrong + # would result in compile errors. + # + # In addition to macros, we also don't want to warn on compound + # literals. 
+ closing_brace_pos = match.group(1).rfind(')') + opening_parenthesis = ReverseCloseExpression( + clean_lines, linenum, closing_brace_pos) + if opening_parenthesis[2] > -1: + line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] + macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + if ((macro and + macro.group(1) not in ( + 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', + 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', + 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + Search(r'\s+=\s*$', line_prefix)): + match = None + # Whitelist lambda function definition which also requires a ";" after + # closing brace + if match: + if Match(r'^.*\[.*\]\s*(.*\)\s*)\{', line): + match = None + + else: + # Try matching cases 2-3. + match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) + if not match: + # Try matching cases 4-6. These are always matched on separate lines. + # + # Note that we can't simply concatenate the previous line to the + # current line and do a single match, otherwise we may output + # duplicate warnings for the blank line case: + # if (cond) { + # // blank line + # } + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if prevline and Search(r'[;{}]\s*$', prevline): + match = Match(r'^(\s*)\{', line) + + # Check matching closing brace + if match: + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if endpos > -1 and Match(r'^\s*;', endline[endpos:]): + # Current {} pair is eligible for semicolon check, and we have found + # the redundant semicolon, output warning here. + # + # Note: because we are scanning forward for opening braces, and + # outputting warnings for the matching closing brace, if there are + # nested blocks with trailing semicolons, we will get the error + # messages in reversed order. 
+ error(filename, endlinenum, 'readability/braces', 4, + "You don't need a ; after a }") + + +def CheckEmptyBlockBody(filename, clean_lines, linenum, error): + """Look for empty loop/conditional body with only a single semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Search for loop keywords at the beginning of the line. Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + # + # We also check "if" blocks here, since an empty conditional block + # is likely an error. + line = clean_lines.elided[linenum] + matched = Match(r'\s*(for|while|if)\s*\(', line) + if matched: + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression( + clean_lines, linenum, line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + if matched.group(1) == 'if': + error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, + 'Empty conditional bodies should use {}') + else: + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') + + +def CheckCheck(filename, clean_lines, linenum, error): + """Checks the use of CHECK and EXPECT macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + + # Decide the set of replacement macros that should be suggested + lines = clean_lines.elided + check_macro = None + start_pos = -1 + for macro in _CHECK_MACROS: + i = lines[linenum].find(macro) + if i >= 0: + check_macro = macro + + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) + if not matched: + continue + start_pos = len(matched.group(1)) + break + if not check_macro or start_pos < 0: + # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + return + + # Find end of the boolean expression by matching parentheses + (last_line, end_line, end_pos) = CloseExpression( + clean_lines, linenum, start_pos) + if end_pos < 0: + return + if linenum == end_line: + expression = lines[linenum][start_pos + 1:end_pos - 1] + else: + expression = lines[linenum][start_pos + 1:] + for i in xrange(linenum + 1, end_line): + expression += lines[i] + expression += last_line[0:end_pos - 1] + + # Parse expression so that we can take parentheses into account. + # This avoids false positives for inputs like "CHECK((a < 4) == b)", + # which is not replaceable by CHECK_LE. + lhs = '' + rhs = '' + operator = None + while expression: + matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' + r'==|!=|>=|>|<=|<|\()(.*)$', expression) + if matched: + token = matched.group(1) + if token == '(': + # Parenthesized operand + expression = matched.group(2) + (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') + if end < 0: + return # Unmatched parenthesis + lhs += '(' + expression[0:end] + expression = expression[end:] + elif token in ('&&', '||'): + # Logical and/or operators. 
This means the expression + # contains more than one term, for example: + # CHECK(42 < a && a < b); + # + # These are not replaceable with CHECK_LE, so bail out early. + return + elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): + # Non-relational operator + lhs += token + expression = matched.group(2) + else: + # Relational operator + operator = token + rhs = matched.group(2) + break + else: + # Unparenthesized operand. Instead of appending to lhs one character + # at a time, we do another regular expression match to consume several + # characters at once if possible. Trivial benchmark shows that this + # is more efficient when the operands are longer than a single + # character, which is generally the case. + matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) + if not matched: + matched = Match(r'^(\s*\S)(.*)$', expression) + if not matched: + break + lhs += matched.group(1) + expression = matched.group(2) + + # Only apply checks if we got all parts of the boolean expression + if not (lhs and operator and rhs): + return + + # Check that rhs do not contain logical operators. We already know + # that lhs is fine since the loop above parses out && and ||. + if rhs.find('&&') > -1 or rhs.find('||') > -1: + return + + # At least one of the operands must be a constant literal. This is + # to avoid suggesting replacements for unprintable things like + # CHECK(variable != iterator) + # + # The following pattern matches decimal, hex integers, strings, and + # characters (in that order). 
+ lhs = lhs.strip() + rhs = rhs.strip() + match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' + if Match(match_constant, lhs) or Match(match_constant, rhs): + # Note: since we know both lhs and rhs, we can provide a more + # descriptive error message like: + # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) + # Instead of: + # Consider using CHECK_EQ instead of CHECK(a == b) + # + # We are still keeping the less descriptive message because if lhs + # or rhs gets long, the error message might become unreadable. + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % ( + _CHECK_REPLACEMENT[check_macro][operator], + check_macro, operator)) + + +def CheckAltTokens(filename, clean_lines, linenum, error): + """Check alternative keywords being used in boolean expressions. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return + + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return + + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + + +def GetLineWidth(line): + """Determines the width of the line in column positions. 
+ + Args: + line: A string, which may be a Unicode string. + + Returns: + The width of the line in column positions, accounting for Unicode + combining characters and wide characters. + """ + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) + + +def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, + error): + """Checks rules from the 'C++ style rules' section of cppguide.html. + + Most of these rules are hard to test (naming, comment style), but we + do what we can. In particular we check for 2-space indents, line lengths, + tab usage, spaces inside code, etc. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw_lines = clean_lines.lines_without_raw_strings + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. + # NOTE: here are the conditions rob pike used for his tests. 
Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for section labels + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. + # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. 
+ if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + extended_length = int((_line_length * 1.25)) + if line_width > extended_length: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than %i characters' % + extended_length) + elif line_width > _line_length: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= %i characters long' % _line_length) + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). + cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckEmptyBlockBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + + +_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') +_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') +# Matches the first component of a filename delimited by -s and _s. 
That is: +# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' +_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') + + +def _DropCommonSuffixes(filename): + """Drops common suffixes like _test.cc or -inl.h from filename. + + For example: + >>> _DropCommonSuffixes('foo/foo-inl.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/bar/foo.cc') + 'foo/bar/foo' + >>> _DropCommonSuffixes('foo/foo_internal.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') + 'foo/foo_unusualinternal' + + Args: + filename: The input filename. + + Returns: + The filename with the common suffix removed. + """ + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', + 'inl.h', 'impl.h', 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] + + +def _IsTestFilename(filename): + """Determines if the given filename has a suffix that identifies it as a test. + + Args: + filename: The input filename. + + Returns: + True if 'filename' looks like a test, False otherwise. + """ + if (filename.endswith('_test.cc') or + filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False + + +def _ClassifyInclude(fileinfo, include, is_system): + """Figures out what kind of header 'include' is. + + Args: + fileinfo: The current file cpplint is running over. A FileInfo instance. + include: The path to a #included file. + is_system: True if the #include used <> rather than "". + + Returns: + One of the _XXX_HEADER constants. 
+ + For example: + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) + _C_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) + _CPP_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) + _LIKELY_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), + ... 'bar/foo_other_ext.h', False) + _POSSIBLE_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) + _OTHER_HEADER + """ + # This is a list of all standard c++ header files, except + # those already checked for above. + is_cpp_h = include in _CPP_HEADERS + + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. + target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER + + + +def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): + """Check rules that are applicable to #include lines. + + Strings on #include lines are NOT removed from elided line, to make + certain tasks easier. 
However, to prevent false positives, checks + applicable to #include lines in CheckLanguage must be put here. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + include_state: An _IncludeState instance in which the headers are inserted. + error: The function to call with any errors found. + """ + fileinfo = FileInfo(filename) + + line = clean_lines.lines[linenum] + + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + error(filename, linenum, 'build/include', 4, + 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + if include in include_state: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, include_state[include])) + else: + include_state[include] = linenum + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. Should be: %s.h, c system, c++ system, other.' 
% + (error_message, fileinfo.BaseName())) + canonical_include = include_state.CanonicalizeAlphabeticalOrder(include) + if not include_state.IsInAlphabeticalOrder( + clean_lines, linenum, canonical_include): + error(filename, linenum, 'build/include_alpha', 4, + 'Include "%s" not in alphabetical order' % include) + include_state.SetLastHeader(canonical_include) + + # Look for any of the stream classes that are part of standard C++. + match = _RE_PATTERN_INCLUDE.match(line) + if match: + include = match.group(2) + if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include): + # Many unit tests use cout, so we exempt them. + if not _IsTestFilename(filename): + error(filename, linenum, 'readability/streams', 3, + 'Streams are highly discouraged.') + + +def _GetTextInside(text, start_pattern): + r"""Retrieves all the text between matching open and close parentheses. + + Given a string of lines and a regular expression string, retrieve all the text + following the expression and between opening punctuation symbols like + (, [, or {, and the matching close-punctuation symbol. This properly nested + occurrences of the punctuations, so for the text like + printf(a(), b(c())); + a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'. + start_pattern must match string having an open punctuation symbol at the end. + + Args: + text: The lines to extract text. Its comments and strings must be elided. + It can be single line and can span multiple lines. + start_pattern: The regexp string indicating where to start extracting + the text. + Returns: + The extracted text. + None if either the opening string or ending punctuation could not be found. + """ + # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably + # rewritten to use _GetTextInside (and use inferior regexp matching today). + + # Give opening punctuations to get the matching close-punctuations. 
+ matching_punctuation = {'(': ')', '{': '}', '[': ']'} + closing_punctuation = set(matching_punctuation.itervalues()) + + # Find the position to start extracting text. + match = re.search(start_pattern, text, re.M) + if not match: # start_pattern not found in text. + return None + start_position = match.end(0) + + assert start_position > 0, ( + 'start_pattern must ends with an opening punctuation.') + assert text[start_position - 1] in matching_punctuation, ( + 'start_pattern must ends with an opening punctuation.') + # Stack of closing punctuations we expect to have in text after position. + punctuation_stack = [matching_punctuation[text[start_position - 1]]] + position = start_position + while punctuation_stack and position < len(text): + if text[position] == punctuation_stack[-1]: + punctuation_stack.pop() + elif text[position] in closing_punctuation: + # A closing punctuation without matching opening punctuations. + return None + elif text[position] in matching_punctuation: + punctuation_stack.append(matching_punctuation[text[position]]) + position += 1 + if punctuation_stack: + # Opening punctuations left without matching close-punctuations. + return None + # punctuations match. + return text[start_position:position - 1] + + +# Patterns for matching call-by-reference parameters. +# +# Supports nested templates up to 2 levels deep using this messy pattern: +# < (?: < (?: < [^<>]* +# > +# | [^<>] )* +# > +# | [^<>] )* +# > +_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*' # =~ [[:alpha:]][[:alnum:]]* +_RE_PATTERN_TYPE = ( + r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?' + r'(?:\w|' + r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|' + r'::)+') +# A call-by-reference parameter ends with '& identifier'. 
+_RE_PATTERN_REF_PARAM = re.compile( + r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*' + r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]') +# A call-by-const-reference parameter either ends with 'const& identifier' +# or looks like 'const type& identifier' when 'type' is atomic. +_RE_PATTERN_CONST_REF_PARAM = ( + r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + + r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') + + +def CheckLanguage(filename, clean_lines, linenum, file_extension, + include_state, nesting_state, error): + """Checks rules from the 'C++ language rules' section of cppguide.html. + + Some of these rules are hard to test (function overloading, using + uint32 inappropriately), but we do the best we can. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + include_state: An _IncludeState instance in which the headers are inserted. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + # If the line is empty or consists of entirely a comment, no need to + # check it. + line = clean_lines.elided[linenum] + if not line: + return + + match = _RE_PATTERN_INCLUDE.search(line) + if match: + CheckIncludeLine(filename, clean_lines, linenum, include_state, error) + return + + # Reset include state across preprocessor directives. This is meant + # to silence warnings for conditional includes. + if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line): + include_state.ResetSection() + + # Make Windows paths like Unix. + fullname = os.path.abspath(filename).replace('\\', '/') + + # TODO(unknown): figure out if they're using default arguments in fn proto. + + # Check to see if they're using an conversion function cast. 
+ # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. + match = Search( + r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there + r'(int|float|double|bool|char|int32|uint32|int64|uint64)' + r'(\([^)].*)', line) + if match: + matched_new = match.group(1) + matched_type = match.group(2) + matched_funcptr = match.group(3) + + # gMock methods are defined using some variant of MOCK_METHODx(name, type) + # where type may be float(), int(string), etc. Without context they are + # virtually indistinguishable from int(x) casts. Likewise, gMock's + # MockCallback takes a template parameter of the form return_type(arg_type), + # which looks much like the cast we're trying to detect. + # + # std::function<> wrapper has a similar problem. + # + # Return types for function pointers also look like casts if they + # don't have an extra space. + if (matched_new is None and # If new operator, then this isn't a cast + not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + Search(r'\bMockCallback<.*>', line) or + Search(r'\bstd::function<.*>', line)) and + not (matched_funcptr and + Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr))): + # Try a bit harder to catch gmock lines: the only place where + # something looks like an old-style cast is where we declare the + # return type of the mocked method, and the only time when we + # are missing context is if MOCK_METHOD was split across + # multiple lines. The missing MOCK_METHOD is usually one or two + # lines back, so scan back one or two lines. + # + # It's not possible for gmock macros to appear in the first 2 + # lines, since the class head + section name takes up 2 lines. 
+ if (linenum < 2 or + not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', + clean_lines.elided[linenum - 1]) or + Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', + clean_lines.elided[linenum - 2]))): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % + matched_type) + + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + + # This doesn't catch all cases. Consider (const char * const)"hello". + # + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. + match = Search( + r'(?:&\(([^)]+)\)[\w(])|' + r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) + if match and match.group(1) != '*': + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + # Create an extended_line, which is the concatenation of the current and + # next lines, for more effective checking of code that may span more than one + # line. + if linenum + 1 < clean_lines.NumLines(): + extended_line = line + clean_lines.elided[linenum + 1] + else: + extended_line = line + + # Check for people declaring static/global STL strings at the top level. 
+ # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match( + r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + # Make sure it's not a function. + # Function template specialization looks like: "string foo(...". + # Class template definitions look like: "string Foo::Method(...". + # + # Also ignore things that look like operators. These are matched separately + # because operator names cross non-word boundaries. If we change the pattern + # above, we would decrease the accuracy of matching identifiers. + if (match and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % + (match.group(1), match.group(2))) + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) + + # When snprintf is used, the second argument shouldn't be a literal. 
+ match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\b', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\b', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(sugawarayu): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. 
+ # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', + line, re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' + % (function_name, match.group(1))) + + # Check for potential memset bugs like memset(buf, sizeof(buf), 0). + match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) + if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): + error(filename, linenum, 'runtime/memset', 4, + 'Did you mean "memset(%s, 0, %s)"?' + % (match.group(1), match.group(2))) + + if Search(r'\busing namespace\b', line): + error(filename, linenum, 'build/namespaces', 5, + 'Do not use namespace using-directives. ' + 'Use using-declarations instead.') + + # Detect variable-length arrays. + match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) + if (match and match.group(2) != 'return' and match.group(2) != 'delete' and + match.group(3).find(']') == -1): + # Split the size using space and arithmetic operators as delimiters. + # If any of the resulting tokens are not compile time constants then + # report the error. 
+ tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) + is_const = True + skip_next = False + for tok in tokens: + if skip_next: + skip_next = False + continue + + if Search(r'sizeof\(.+\)', tok): continue + if Search(r'arraysize\(\w+\)', tok): continue + + tok = tok.lstrip('(') + tok = tok.rstrip(')') + if not tok: continue + if Match(r'\d+', tok): continue + if Match(r'0[xX][0-9a-fA-F]+', tok): continue + if Match(r'k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue + # A catch all for tricky sizeof cases, including 'sizeof expression', + # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' + # requires skipping the next token because we split on ' ' and '*'. + if tok.startswith('sizeof'): + skip_next = True + continue + is_const = False + break + if not is_const: + error(filename, linenum, 'runtime/arrays', 1, + 'Do not use variable-length arrays. Use an appropriately named ' + "('k' followed by CamelCase) compile-time constant for the size.") + + # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or + # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing + # in the class declaration. + match = Match( + (r'\s*' + r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' + r'\(.*\);$'), + line) + if match and linenum + 1 < clean_lines.NumLines(): + next_line = clean_lines.elided[linenum + 1] + # We allow some, but not all, declarations of variables to be present + # in the statement that defines the class. The [\w\*,\s]* fragment of + # the regular expression below allows users to declare instances of + # the class or pointers to instances, but not less common types such + # as function pointers or arrays. It's a tradeoff between allowing + # reasonable code and avoiding trying to parse more C++ using regexps. 
+ if not Search(r'^\s*}[\w\*,\s]*;', next_line): + error(filename, linenum, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + + # Check for use of unnamed namespaces in header files. Registration + # macros are typically OK, so we allow use of "namespace {" on lines + # that end with backslashes. + if (file_extension == 'h' + and Search(r'\bnamespace\s*{', line) + and line[-1] != '\\'): + error(filename, linenum, 'build/namespaces', 4, + 'Do not use unnamed namespaces in header files. See ' + 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + ' for more information.') + +def CheckForNonConstReference(filename, clean_lines, linenum, + nesting_state, error): + """Check for non-const references. + + Separate from CheckLanguage since it scans backwards from current + line, instead of scanning forward. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + # Do nothing if there is no '&' on current line. + line = clean_lines.elided[linenum] + if '&' not in line: + return + + # Long type names may be broken across multiple lines, usually in one + # of these forms: + # LongType + # ::LongTypeContinued &identifier + # LongType:: + # LongTypeContinued &identifier + # LongType< + # ...>::LongTypeContinued &identifier + # + # If we detected a type split across two lines, join the previous + # line to current line so that we can match const references + # accordingly. + # + # Note that this only scans back one line, since scanning back + # arbitrary number of lines would be expensive. If you have a type + # that spans more than 2 lines, please use a typedef. 
  # Scan back one line to rejoin type names that were split across lines, so
  # the const-reference patterns below see the full declaration.  Only one
  # line of look-back is attempted (per the note above); longer types should
  # use a typedef.
  if linenum > 1:
    previous = None
    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
      # previous_line\n + ::current_line
      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
                        clean_lines.elided[linenum - 1])
    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
      # previous_line::\n + current_line
      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
                        clean_lines.elided[linenum - 1])
    if previous:
      line = previous.group(1) + line.lstrip()
    else:
      # Check for templated parameter that is split across multiple lines
      endpos = line.rfind('>')
      if endpos > -1:
        (_, startline, startpos) = ReverseCloseExpression(
            clean_lines, linenum, endpos)
        if startpos > -1 and startline < linenum:
          # Found the matching < on an earlier line, collect all
          # pieces up to current line.
          line = ''
          for i in xrange(startline, linenum + 1):
            line += clean_lines.elided[i].strip()

  # Check for non-const references in function parameters.  A single '&' may
  # be found in the following places:
  #   inside expression: binary & for bitwise AND
  #   inside expression: unary & for taking the address of something
  #   inside declarators: reference parameter
  # We will exclude the first two cases by checking that we are not inside a
  # function body, including one that was just introduced by a trailing '{'.
  # TODO(unknown): Doesn't account for preprocessor directives.
  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
+ check_params = False + if not nesting_state.stack: + check_params = True # top level + elif (isinstance(nesting_state.stack[-1], _ClassInfo) or + isinstance(nesting_state.stack[-1], _NamespaceInfo)): + check_params = True # within class or namespace + elif Match(r'.*{\s*$', line): + if (len(nesting_state.stack) == 1 or + isinstance(nesting_state.stack[-2], _ClassInfo) or + isinstance(nesting_state.stack[-2], _NamespaceInfo)): + check_params = True # just opened global/class/namespace block + # We allow non-const references in a few standard places, like functions + # called "swap()" or iostream operators like "<<" or ">>". Do not check + # those function parameters. + # + # We also accept & in static_assert, which looks like a function but + # it's actually a declaration expression. + whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + r'operator\s*[<>][<>]|' + r'static_assert|COMPILE_ASSERT' + r')\s*\(') + if Search(whitelisted_functions, line): + check_params = False + elif not Search(r'\S+\([^)]*$', line): + # Don't see a whitelisted function on this line. Actually we + # didn't see any function name on this line, so this is likely a + # multi-line parameter list. Try a bit harder to catch this case. + for i in xrange(2): + if (linenum > i and + Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): + check_params = False + break + + if check_params: + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer: ' + + ReplaceAll(' *<', '<', parameter)) + + +def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, + error): + """Checks for a C-style cast by looking for the pattern. + + Args: + filename: The name of the current file. + linenum: The number of the line to check. 
+ line: The line of code to check. + raw_line: The raw line of code to check, with comments. + cast_type: The string for the C++ cast to recommend. This is either + reinterpret_cast, static_cast, or const_cast, depending. + pattern: The regular expression used to find C-style casts. + error: The function to call with any errors found. + + Returns: + True if an error was emitted. + False otherwise. + """ + match = Search(pattern, line) + if not match: + return False + + # Exclude lines with sizeof, since sizeof looks like a cast. + sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1]) + if sizeof_match: + return False + + # operator++(int) and operator--(int) + if (line[0:match.start(1) - 1].endswith(' operator++') or + line[0:match.start(1) - 1].endswith(' operator--')): + return False + + # A single unnamed argument for a function tends to look like old + # style cast. If we see those, don't issue warnings for deprecated + # casts, instead issue warnings for unnamed arguments where + # appropriate. + # + # These are things that we want warnings for, since the style guide + # explicitly require all parameters to be named: + # Function(int); + # Function(int) { + # ConstMember(int) const; + # ConstMember(int) const { + # ExceptionMember(int) throw (...); + # ExceptionMember(int) throw (...) { + # PureVirtual(int) = 0; + # + # These are functions of some sort, where the compiler would be fine + # if they had named parameters, but people often omit those + # identifiers to reduce clutter: + # (FunctionPointer)(int); + # (FunctionPointer)(int) = value; + # Function((function_pointer_arg)(int)) + # ; + # <(FunctionPointerTemplateArgument)(int)>; + remainder = line[match.end(0):] + if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder): + # Looks like an unnamed parameter. + + # Don't warn on any kind of template arguments. 
+ if Match(r'^\s*>', remainder): + return False + + # Don't warn on assignments to function pointers, but keep warnings for + # unnamed parameters to pure virtual functions. Note that this pattern + # will also pass on assignments of "0" to function pointers, but the + # preferred values for those would be "nullptr" or "NULL". + matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) + if matched_zero and matched_zero.group(1) != '0': + return False + + # Don't warn on function pointer declarations. For this we need + # to check what came before the "(type)" string. + if Match(r'.*\)\s*$', line[0:match.start(0)]): + return False + + # Don't warn if the parameter is named with block comments, e.g.: + # Function(int /*unused_param*/); + if '/*' in raw_line: + return False + + # Passed all filters, issue warning here. + error(filename, linenum, 'readability/function', 3, + 'All parameters should be named in a function') + return True + + # At this point, all that should be left is actual casts. + error(filename, linenum, 'readability/casting', 4, + 'Using C-style cast. Use %s<%s>(...) 
instead' % + (cast_type, match.group(1))) + + return True + + +_HEADERS_CONTAINING_TEMPLATES = ( + ('', ('deque',)), + ('', ('unary_function', 'binary_function', + 'plus', 'minus', 'multiplies', 'divides', 'modulus', + 'negate', + 'equal_to', 'not_equal_to', 'greater', 'less', + 'greater_equal', 'less_equal', + 'logical_and', 'logical_or', 'logical_not', + 'unary_negate', 'not1', 'binary_negate', 'not2', + 'bind1st', 'bind2nd', + 'pointer_to_unary_function', + 'pointer_to_binary_function', + 'ptr_fun', + 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t', + 'mem_fun_ref_t', + 'const_mem_fun_t', 'const_mem_fun1_t', + 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t', + 'mem_fun_ref', + )), + ('', ('numeric_limits',)), + ('', ('list',)), + ('', ('map', 'multimap',)), + ('', ('allocator',)), + ('', ('queue', 'priority_queue',)), + ('', ('set', 'multiset',)), + ('', ('stack',)), + ('', ('char_traits', 'basic_string',)), + ('', ('pair',)), + ('', ('vector',)), + + # gcc extensions. + # Note: std::hash is their hash, ::hash is our hash + ('', ('hash_map', 'hash_multimap',)), + ('', ('hash_set', 'hash_multiset',)), + ('', ('slist',)), + ) + +_RE_PATTERN_STRING = re.compile(r'\bstring\b') + +_re_pattern_algorithm_header = [] +for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap', + 'transform'): + # Match max(..., ...), max(..., ...), but not foo->max, foo.max or + # type::max(). + _re_pattern_algorithm_header.append( + (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), + _template, + '')) + +_re_pattern_templates = [] +for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: + for _template in _templates: + _re_pattern_templates.append( + (re.compile(r'(\<|\b)' + _template + r'\s*\<'), + _template + '<>', + _header)) + + +def FilesBelongToSameModule(filename_cc, filename_h): + """Check if these two filenames belong to the same module. 
+ + The concept of a 'module' here is a as follows: + foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the + same 'module' if they are in the same directory. + some/path/public/xyzzy and some/path/internal/xyzzy are also considered + to belong to the same module here. + + If the filename_cc contains a longer path than the filename_h, for example, + '/absolute/path/to/base/sysinfo.cc', and this file would include + 'base/sysinfo.h', this function also produces the prefix needed to open the + header. This is used by the caller of this function to more robustly open the + header file. We don't have access to the real include paths in this context, + so we need this guesswork here. + + Known bugs: tools/base/bar.cc and base/bar.h belong to the same module + according to this implementation. Because of this, this function gives + some false positives. This should be sufficiently rare in practice. + + Args: + filename_cc: is the path for the .cc file + filename_h: is the path for the header path + + Returns: + Tuple with a bool and a string: + bool: True if filename_cc and filename_h belong to the same module. + string: the additional prefix needed to open the header file. 
+ """ + + if not filename_cc.endswith('.cc'): + return (False, '') + filename_cc = filename_cc[:-len('.cc')] + if filename_cc.endswith('_unittest'): + filename_cc = filename_cc[:-len('_unittest')] + elif filename_cc.endswith('_test'): + filename_cc = filename_cc[:-len('_test')] + filename_cc = filename_cc.replace('/public/', '/') + filename_cc = filename_cc.replace('/internal/', '/') + + if not filename_h.endswith('.h'): + return (False, '') + filename_h = filename_h[:-len('.h')] + if filename_h.endswith('-inl'): + filename_h = filename_h[:-len('-inl')] + filename_h = filename_h.replace('/public/', '/') + filename_h = filename_h.replace('/internal/', '/') + + files_belong_to_same_module = filename_cc.endswith(filename_h) + common_path = '' + if files_belong_to_same_module: + common_path = filename_cc[:-len(filename_h)] + return files_belong_to_same_module, common_path + + +def UpdateIncludeState(filename, include_state, io=codecs): + """Fill up the include_state with new includes found from the file. + + Args: + filename: the name of the header to read. + include_state: an _IncludeState instance in which the headers are inserted. + io: The io factory to use to read the file. Provided for testability. + + Returns: + True if a header was successfully added. False otherwise. + """ + headerfile = None + try: + headerfile = io.open(filename, 'r', 'utf8', 'replace') + except IOError: + return False + linenum = 0 + for line in headerfile: + linenum += 1 + clean_line = CleanseComments(line) + match = _RE_PATTERN_INCLUDE.search(clean_line) + if match: + include = match.group(2) + # The value formatting is cute, but not really used right now. + # What matters here is that the key is in include_state. + include_state.setdefault(include, '%s:%d' % (filename, linenum)) + return True + + +def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, + io=codecs): + """Reports for missing stl includes. 
+ + This function will output warnings to make sure you are including the headers + necessary for the stl containers and functions that you use. We only give one + reason to include a header. For example, if you use both equal_to<> and + less<> in a .h file, only one (the latter in the file) of these will be + reported as a reason to include the . + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + include_state: An _IncludeState instance. + error: The function to call with any errors found. + io: The IO factory to use to read the header file. Provided for unittest + injection. + """ + required = {} # A map of header name to linenumber and the template entity. + # Example of required: { '': (1219, 'less<>') } + + for linenum in xrange(clean_lines.NumLines()): + line = clean_lines.elided[linenum] + if not line or line[0] == '#': + continue + + # String is special -- it is a non-templatized type in STL. + matched = _RE_PATTERN_STRING.search(line) + if matched: + # Don't warn about strings in non-STL namespaces: + # (We check only the first match per line; good enough.) + prefix = line[:matched.start()] + if prefix.endswith('std::') or not prefix.endswith('::'): + required[''] = (linenum, 'string') + + for pattern, template, header in _re_pattern_algorithm_header: + if pattern.search(line): + required[header] = (linenum, template) + + # The following function is just a speed up, no semantics are changed. + if not '<' in line: # Reduces the cpu time usage by skipping lines. + continue + + for pattern, template, header in _re_pattern_templates: + if pattern.search(line): + required[header] = (linenum, template) + + # The policy is that if you #include something in foo.h you don't need to + # include it again in foo.cc. Here, we will look at possible includes. + # Let's copy the include_state so it is only messed up within this function. 
+ include_state = include_state.copy() + + # Did we find the header for this file (if any) and successfully load it? + header_found = False + + # Use the absolute path so that matching works properly. + abs_filename = FileInfo(filename).FullName() + + # For Emacs's flymake. + # If cpplint is invoked from Emacs's flymake, a temporary file is generated + # by flymake and that file name might end with '_flymake.cc'. In that case, + # restore original file name here so that the corresponding header file can be + # found. + # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h' + # instead of 'foo_flymake.h' + abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) + + # include_state is modified during iteration, so we iterate over a copy of + # the keys. + header_keys = include_state.keys() + for header in header_keys: + (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) + fullpath = common_path + header + if same_module and UpdateIncludeState(fullpath, include_state, io): + header_found = True + + # If we can't find the header file for a .cc, assume it's because we don't + # know where to look. In that case we'll give up as we're not sure they + # didn't include it in the .h file. + # TODO(unknown): Do a better job of finding .h files so we are confident that + # not having the .h file means there isn't one. + if filename.endswith('.cc') and not header_found: + return + + # All the lines have been processed, report the errors found. 
+ for required_header_unstripped in required: + template = required[required_header_unstripped][1] + if required_header_unstripped.strip('<>"') not in include_state: + error(filename, required[required_header_unstripped][0], + 'build/include_what_you_use', 4, + 'Add #include ' + required_header_unstripped + ' for ' + template) + + +_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<') + + +def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): + """Check that make_pair's template arguments are deduced. + + G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are + specified explicitly, and such use isn't intended in any case. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error(filename, linenum, 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') + + +def ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions=[]): + """Processes a single line in the file. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + clean_lines: An array of strings, each representing a line of the file, + with comments stripped. + line: Number of line being processed. + include_state: An _IncludeState instance in which the headers are inserted. + function_state: A _FunctionState instance which counts function lines, etc. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. 
+ error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: + return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + nesting_state, error) + CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) + CheckForNonStandardConstructs(filename, clean_lines, line, + nesting_state, error) + CheckVlogArguments(filename, clean_lines, line, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + +def ProcessFileData(filename, file_extension, lines, error, + extra_check_functions=[]): + """Performs lint checks and reports any errors to the given error function. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. 
+ error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) + + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = _NestingState() + + ResetNolintSuppressions() + + CheckForCopyright(filename, lines, error) + + if file_extension == 'h': + CheckForHeaderGuard(filename, lines, error) + + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions) + nesting_state.CheckCompletedBlocks(filename, error) + + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForBadCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) + +def ProcessFile(filename, vlevel, extra_check_functions=[]): + """Does google-lint on a single file. + + Args: + filename: The name of the file to parse. + + vlevel: The level of errors to report. Every error of confidence + >= verbose_level will be reported. 0 is a good default. + + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + + _SetVerboseLevel(vlevel) + + try: + # Support the UNIX convention of using "-" for stdin. 
Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. If it is not expected to be present (i.e. os.linesep != + # '\r\n' as in Windows), a warning is issued below if this file + # is processed. + + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') + + carriage_return_found = False + # Remove trailing '\r'. + for linenum in range(len(lines)): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + carriage_return_found = True + + except IOError: + sys.stderr.write( + "Skipping input '%s': Can't open for reading\n" % filename) + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. + if filename != '-' and file_extension not in _valid_extensions: + sys.stderr.write('Ignoring %s; not a valid file name ' + '(%s)\n' % (filename, ', '.join(_valid_extensions))) + else: + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) + if carriage_return_found and os.linesep != '\r\n': + # Use 0 for linenum since outputting only one error for potentially + # several lines. + Error(filename, 0, 'whitespace/newline', 1, + 'One or more unexpected \\r (^M) found;' + 'better to use only a \\n') + + sys.stderr.write('Done processing %s\n' % filename) + + +def PrintUsage(message): + """Prints a brief usage string and exits, optionally with an error message. + + Args: + message: The optional error message. 
+ """ + sys.stderr.write(_USAGE) + if message: + sys.exit('\nFATAL ERROR: ' + message) + else: + sys.exit(1) + + +def PrintCategories(): + """Prints a list of all the error-categories used by error messages. + + These are the categories used to filter messages via --filter. + """ + sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) + sys.exit(0) + + +def ParseArguments(args): + """Parses the command line arguments. + + This may set the output format and verbosity level as side-effects. + + Args: + args: The command line arguments: + + Returns: + The list of filenames to lint. + """ + try: + (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', + 'counting=', + 'filter=', + 'root=', + 'linelength=', + 'extensions=']) + except getopt.GetoptError: + PrintUsage('Invalid arguments.') + + verbosity = _VerboseLevel() + output_format = _OutputFormat() + filters = '' + counting_style = '' + + for (opt, val) in opts: + if opt == '--help': + PrintUsage(None) + elif opt == '--output': + if val not in ('emacs', 'vs7', 'eclipse'): + PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') + output_format = val + elif opt == '--verbose': + verbosity = int(val) + elif opt == '--filter': + filters = val + if not filters: + PrintCategories() + elif opt == '--counting': + if val not in ('total', 'toplevel', 'detailed'): + PrintUsage('Valid counting options are total, toplevel, and detailed') + counting_style = val + elif opt == '--root': + global _root + _root = val + elif opt == '--linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + PrintUsage('Line length must be digits.') + elif opt == '--extensions': + global _valid_extensions + try: + _valid_extensions = set(val.split(',')) + except ValueError: + PrintUsage('Extensions must be comma separated list.') + + if not filenames: + PrintUsage('No files were specified.') + + _SetOutputFormat(output_format) + _SetVerboseLevel(verbosity) + 
_SetFilters(filters) + _SetCountingStyle(counting_style) + + return filenames + + +def main(): + filenames = ParseArguments(sys.argv[1:]) + + # Change stderr to write with replacement characters so we don't die + # if we try to print something containing non-ASCII characters. + sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace') + + _cpplint_state.ResetErrorCounts() + for filename in filenames: + ProcessFile(filename, _cpplint_state.verbose_level) + _cpplint_state.PrintErrorCounts() + + sys.exit(_cpplint_state.error_count > 0) + + +if __name__ == '__main__': + main() diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php new file mode 100644 index 00000000..cb9cf9bd --- /dev/null +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -0,0 +1,147 @@ +getPaths(); + + // Remove all deleted files, which are not checked by the + // following linters. + foreach ($paths as $key => $path) { + if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) { + unset($paths[$key]); + } + } + + $generated_linter = new ArcanistGeneratedLinter(); + $linters[] = $generated_linter; + + $nolint_linter = new ArcanistNoLintLinter(); + $linters[] = $nolint_linter; + + $text_linter = new ArcanistTextLinter(); + $text_linter->setCustomSeverityMap(array( + ArcanistTextLinter::LINT_LINE_WRAP + => ArcanistLintSeverity::SEVERITY_ADVICE, + )); + $linters[] = $text_linter; + + $java_text_linter = new ArcanistTextLinter(); + $java_text_linter->setMaxLineLength(100); + $java_text_linter->setCustomSeverityMap(array( + ArcanistTextLinter::LINT_LINE_WRAP + => ArcanistLintSeverity::SEVERITY_ADVICE, + )); + $linters[] = $java_text_linter; + + $pep8_options = $this->getPEP8WithTextOptions().',E302'; + + $python_linter = new ArcanistPEP8Linter(); + $python_linter->setConfig(array('options' => $pep8_options)); + $linters[] = $python_linter; + + $python_2space_linter = new 
ArcanistPEP8Linter(); + $python_2space_linter->setConfig(array('options' => $pep8_options.',E111')); + $linters[] = $python_2space_linter; + + // Currently we can't run cpplint in commit hook mode, because it + // depends on having access to the working directory. + if (!$this->getCommitHookMode()) { + $cpp_linters = array(); + $google_linter = new ArcanistCpplintLinter(); + $google_linter->setConfig(array( + 'lint.cpplint.prefix' => '', + 'lint.cpplint.bin' => 'cpplint', + )); + $cpp_linters[] = $linters[] = $google_linter; + $cpp_linters[] = $linters[] = new FbcodeCppLinter(); + $cpp_linters[] = $linters[] = new PfffCppLinter(); + } + + $spelling_linter = new ArcanistSpellingLinter(); + $linters[] = $spelling_linter; + + foreach ($paths as $path) { + $is_text = false; + + $text_extensions = ( + '/\.('. + 'cpp|cxx|c|cc|h|hpp|hxx|tcc|'. + 'py|rb|hs|pl|pm|tw|'. + 'php|phpt|css|js|'. + 'java|'. + 'thrift|'. + 'lua|'. + 'siv|'. + 'txt'. + ')$/' + ); + if (preg_match($text_extensions, $path)) { + $is_text = true; + } + if ($is_text) { + $nolint_linter->addPath($path); + + $generated_linter->addPath($path); + $generated_linter->addData($path, $this->loadData($path)); + + if (preg_match('/\.java$/', $path)) { + $java_text_linter->addPath($path); + $java_text_linter->addData($path, $this->loadData($path)); + } else { + $text_linter->addPath($path); + $text_linter->addData($path, $this->loadData($path)); + } + + $spelling_linter->addPath($path); + $spelling_linter->addData($path, $this->loadData($path)); + } + if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) { + foreach ($cpp_linters as &$linter) { + $linter->addPath($path); + $linter->addData($path, $this->loadData($path)); + } + } + + // Match *.py and contbuild config files + if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/', + $path)) { + $space_count = 4; + $real_path = $this->getFilePathOnDisk($path); + $dir = dirname($real_path); + do { + if (file_exists($dir.'/.python2space')) { + 
$space_count = 2; + break; + } + $dir = dirname($dir); + } while ($dir != '/' && $dir != '.'); + + if ($space_count == 4) { + $cur_path_linter = $python_linter; + } else { + $cur_path_linter = $python_2space_linter; + } + $cur_path_linter->addPath($path); + $cur_path_linter->addData($path, $this->loadData($path)); + + if (preg_match('/\.tw$/', $path)) { + $cur_path_linter->setCustomSeverityMap(array( + 'E251' => ArcanistLintSeverity::SEVERITY_DISABLED, + )); + } + } + } + + $name_linter = new ArcanistFilenameLinter(); + $linters[] = $name_linter; + foreach ($paths as $path) { + $name_linter->addPath($path); + } + + return $linters; + } + +} diff --git a/port/README b/port/README new file mode 100644 index 00000000..422563e2 --- /dev/null +++ b/port/README @@ -0,0 +1,10 @@ +This directory contains interfaces and implementations that isolate the +rest of the package from platform details. + +Code in the rest of the package includes "port.h" from this directory. +"port.h" in turn includes a platform specific "port_.h" file +that provides the platform specific implementation. + +See port_posix.h for an example of what must be provided in a platform +specific header file. + diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h new file mode 100644 index 00000000..db3580bd --- /dev/null +++ b/port/atomic_pointer.h @@ -0,0 +1,157 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// AtomicPointer provides storage for a lock-free pointer. 
+// Platform-dependent implementation of AtomicPointer: +// - If the platform provides a cheap barrier, we use it with raw pointers +// - If cstdatomic is present (on newer versions of gcc, it is), we use +// a cstdatomic-based AtomicPointer. However we prefer the memory +// barrier based version, because at least on a gcc 4.4 32-bit build +// on linux, we have encountered a buggy +// implementation. Also, some implementations are much +// slower than a memory-barrier based implementation (~16ns for +// based acquire-load vs. ~1ns for a barrier based +// acquire-load). +// This code is based on atomicops-internals-* in Google's perftools: +// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase + +#ifndef PORT_ATOMIC_POINTER_H_ +#define PORT_ATOMIC_POINTER_H_ + +#include +#ifdef ROCKSDB_ATOMIC_PRESENT +#include +#endif +#ifdef OS_WIN +#include +#endif +#ifdef OS_MACOSX +#include +#endif + +#if defined(_M_X64) || defined(__x86_64__) +#define ARCH_CPU_X86_FAMILY 1 +#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) +#define ARCH_CPU_X86_FAMILY 1 +#elif defined(__ARMEL__) +#define ARCH_CPU_ARM_FAMILY 1 +#endif + +namespace rocksdb { +namespace port { + +// Define MemoryBarrier() if available +// Windows on x86 +#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY) +// windows.h already provides a MemoryBarrier(void) macro +// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// Gcc on x86 +#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__) +inline void MemoryBarrier() { + // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on + // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. 
+ __asm__ __volatile__("" : : : "memory"); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// Sun Studio +#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC) +inline void MemoryBarrier() { + // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on + // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. + asm volatile("" : : : "memory"); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// Mac OS +#elif defined(OS_MACOSX) +inline void MemoryBarrier() { + OSMemoryBarrier(); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +// ARM Linux +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__) +typedef void (*LinuxKernelMemoryBarrierFunc)(void); +// The Linux ARM kernel provides a highly optimized device-specific memory +// barrier function at a fixed memory address that is mapped in every +// user-level process. +// +// This beats using CPU-specific instructions which are, on single-core +// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more +// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking +// shows that the extra function call cost is completely negligible on +// multi-core devices. 
+// +inline void MemoryBarrier() { + (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)(); +} +#define ROCKSDB_HAVE_MEMORY_BARRIER + +#endif + +// AtomicPointer built using platform-specific MemoryBarrier() +#if defined(ROCKSDB_HAVE_MEMORY_BARRIER) +class AtomicPointer { + private: + void* rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* p) : rep_(p) {} + inline void* NoBarrier_Load() const { return rep_; } + inline void NoBarrier_Store(void* v) { rep_ = v; } + inline void* Acquire_Load() const { + void* result = rep_; + MemoryBarrier(); + return result; + } + inline void Release_Store(void* v) { + MemoryBarrier(); + rep_ = v; + } +}; + +// AtomicPointer based on +#elif defined(ROCKSDB_ATOMIC_PRESENT) +class AtomicPointer { + private: + std::atomic rep_; + public: + AtomicPointer() { } + explicit AtomicPointer(void* v) : rep_(v) { } + inline void* Acquire_Load() const { + return rep_.load(std::memory_order_acquire); + } + inline void Release_Store(void* v) { + rep_.store(v, std::memory_order_release); + } + inline void* NoBarrier_Load() const { + return rep_.load(std::memory_order_relaxed); + } + inline void NoBarrier_Store(void* v) { + rep_.store(v, std::memory_order_relaxed); + } +}; + +// We have neither MemoryBarrier(), nor +#else +#error Please implement AtomicPointer for this platform. + +#endif + +#undef ROCKSDB_HAVE_MEMORY_BARRIER +#undef ARCH_CPU_X86_FAMILY +#undef ARCH_CPU_ARM_FAMILY + +} // namespace port +} // namespace rocksdb + +#endif // PORT_ATOMIC_POINTER_H_ diff --git a/port/likely.h b/port/likely.h new file mode 100644 index 00000000..ede0df5a --- /dev/null +++ b/port/likely.h @@ -0,0 +1,21 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef PORT_LIKELY_H_ +#define PORT_LIKELY_H_ + +#if defined(__GNUC__) && __GNUC__ >= 4 +#define LIKELY(x) (__builtin_expect((x), 1)) +#define UNLIKELY(x) (__builtin_expect((x), 0)) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +#endif // PORT_LIKELY_H_ diff --git a/port/port.h b/port/port.h new file mode 100644 index 00000000..2dc9a0fa --- /dev/null +++ b/port/port.h @@ -0,0 +1,22 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_PORT_PORT_H_ +#define STORAGE_LEVELDB_PORT_PORT_H_ + +#include + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_.h file must provide. +#if defined(ROCKSDB_PLATFORM_POSIX) +# include "port/port_posix.h" +#endif + +#endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_example.h b/port/port_example.h new file mode 100644 index 00000000..64a57918 --- /dev/null +++ b/port/port_example.h @@ -0,0 +1,140 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This file contains the specification, but not the implementations, +// of the types/operations/etc. that should be defined by a platform +// specific port_.h file. Use this file as a reference for +// how to port this package to a new platform. + +#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ + +namespace rocksdb { +namespace port { + +// TODO(jorlow): Many of these belong more in the environment class rather than +// here. We should try moving them and see if it affects perf. + +// The following boolean constant must be true on a little-endian machine +// and false otherwise. +static const bool kLittleEndian = true /* or some other expression */; + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + // Lock the mutex. Waits until other lockers have exited. + // Will deadlock if the mutex is already locked by this thread. + void Lock(); + + // Unlock the mutex. + // REQUIRES: This mutex was locked by this thread. + void Unlock(); + + // Optionally crash if this thread does not hold this mutex. + // The implementation must be fast, especially if NDEBUG is + // defined. The implementation is allowed to skip all checks. + void AssertHeld(); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + // Atomically release *mu and block on this condition variable until + // either a call to SignalAll(), or a call to Signal() that picks + // this thread to wakeup. + // REQUIRES: this thread holds *mu + void Wait(); + + // If there are some threads waiting, wake up at least one of them. + void Signal(); + + // Wake up all waiting threads. + void SignallAll(); +}; + +// Thread-safe initialization. 
+// Used as follows: +// static port::OnceType init_control = LEVELDB_ONCE_INIT; +// static void Initializer() { ... do something ...; } +// ... +// port::InitOnce(&init_control, &Initializer); +typedef intptr_t OnceType; +#define LEVELDB_ONCE_INIT 0 +extern void InitOnce(port::OnceType*, void (*initializer)()); + +// A type that holds a pointer that can be read or written atomically +// (i.e., without word-tearing.) +class AtomicPointer { + private: + intptr_t rep_; + public: + // Initialize to arbitrary value + AtomicPointer(); + + // Initialize to hold v + explicit AtomicPointer(void* v) : rep_(v) { } + + // Read and return the stored pointer with the guarantee that no + // later memory access (read or write) by this thread can be + // reordered ahead of this read. + void* Acquire_Load() const; + + // Set v as the stored pointer with the guarantee that no earlier + // memory access (read or write) by this thread can be reordered + // after this store. + void Release_Store(void* v); + + // Read the stored pointer with no ordering guarantees. + void* NoBarrier_Load() const; + + // Set va as the stored pointer with no ordering guarantees. + void NoBarrier_Store(void* v); +}; + +// ------------------ Compression ------------------- + +// Store the snappy compression of "input[0,input_length-1]" in *output. +// Returns false if snappy is not supported by this port. +extern bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); + +// If input[0,input_length-1] looks like a valid snappy compressed +// buffer, store the size of the uncompressed data in *result and +// return true. Else return false. +extern bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result); + +// Attempt to snappy uncompress input[0,input_length-1] into *output. +// Returns true if successful, false if the input is invalid lightweight +// compressed data. 
+// +// REQUIRES: at least the first "n" bytes of output[] must be writable +// where "n" is the result of a successful call to +// Snappy_GetUncompressedLength. +extern bool Snappy_Uncompress(const char* input_data, size_t input_length, + char* output); + +// ------------------ Miscellaneous ------------------- + +// If heap profiling is not supported, returns false. +// Else repeatedly calls (*func)(arg, data, n) and then returns true. +// The concatenation of all "data[0,n-1]" fragments is the heap profile. +extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); + +} // namespace port +} // namespace rocksdb + +#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/port/port_posix.cc b/port/port_posix.cc new file mode 100644 index 00000000..911cebdf --- /dev/null +++ b/port/port_posix.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/port_posix.h" + +#include +#include +#include +#include +#include "util/logging.h" + +namespace rocksdb { +namespace port { + +static void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } +} + +Mutex::Mutex(bool adaptive) { +#ifdef OS_LINUX + if (!adaptive) { + PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); + } else { + pthread_mutexattr_t mutex_attr; + PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr)); + PthreadCall("set mutex attr", + pthread_mutexattr_settype(&mutex_attr, + PTHREAD_MUTEX_ADAPTIVE_NP)); + PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr)); + PthreadCall("destroy mutex attr", + pthread_mutexattr_destroy(&mutex_attr)); + } +#else // ignore adaptive for non-linux platform + PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); +#endif // OS_LINUX +} + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { + PthreadCall("lock", pthread_mutex_lock(&mu_)); +#ifndef NDEBUG + locked_ = true; +#endif +} + +void Mutex::Unlock() { +#ifndef NDEBUG + locked_ = false; +#endif + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void Mutex::AssertHeld() { +#ifndef NDEBUG + assert(locked_); +#endif +} + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, NULL)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { +#ifndef NDEBUG + mu_->locked_ = false; +#endif + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +#ifndef NDEBUG + mu_->locked_ = true; +#endif +} + +void CondVar::Signal() { + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, NULL)); } + +RWMutex::~RWMutex() { 
PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); } + +void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); } + +void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); } + +void RWMutex::Unlock() { PthreadCall("unlock", pthread_rwlock_unlock(&mu_)); } + +void InitOnce(OnceType* once, void (*initializer)()) { + PthreadCall("once", pthread_once(once, initializer)); +} + +} // namespace port +} // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h new file mode 100644 index 00000000..6a738292 --- /dev/null +++ b/port/port_posix.h @@ -0,0 +1,488 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. 
+ +#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_ +#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_ + +#undef PLATFORM_IS_LITTLE_ENDIAN +#if defined(OS_MACOSX) + #include + #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER) + #define PLATFORM_IS_LITTLE_ENDIAN \ + (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN) + #endif +#elif defined(OS_SOLARIS) + #include + #ifdef _LITTLE_ENDIAN + #define PLATFORM_IS_LITTLE_ENDIAN true + #else + #define PLATFORM_IS_LITTLE_ENDIAN false + #endif +#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) ||\ + defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) + #include + #include +#else + #include +#endif +#include +#ifdef SNAPPY +#include +#endif + +#ifdef ZLIB +#include +#endif + +#ifdef BZIP2 +#include +#endif + +#if defined(LZ4) +#include +#include +#endif + +#include +#include +#include +#include "rocksdb/options.h" +#include "port/atomic_pointer.h" + +#ifndef PLATFORM_IS_LITTLE_ENDIAN +#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) +#endif + +#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ + defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ + defined(OS_ANDROID) +// Use fread/fwrite/fflush on platforms without _unlocked variants +#define fread_unlocked fread +#define fwrite_unlocked fwrite +#define fflush_unlocked fflush +#endif + +#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\ + defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) +// Use fsync() on platforms without fdatasync() +#define fdatasync fsync +#endif + +#if defined(OS_ANDROID) && __ANDROID_API__ < 9 +// fdatasync() was only introduced in API level 9 on Android. Use fsync() +// when targetting older platforms. 
+#define fdatasync fsync +#endif + +namespace rocksdb { +namespace port { + +static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN; +#undef PLATFORM_IS_LITTLE_ENDIAN + +class CondVar; + +class Mutex { + public: + /* implicit */ Mutex(bool adaptive = false); + ~Mutex(); + + void Lock(); + void Unlock(); + // this will assert if the mutex is not locked + // it does NOT verify that mutex is held by a calling thread + void AssertHeld(); + + private: + friend class CondVar; + pthread_mutex_t mu_; +#ifndef NDEBUG + bool locked_; +#endif + + // No copying + Mutex(const Mutex&); + void operator=(const Mutex&); +}; + +class RWMutex { + public: + RWMutex(); + ~RWMutex(); + + void ReadLock(); + void WriteLock(); + void Unlock(); + void AssertHeld() { } + + private: + pthread_rwlock_t mu_; // the underlying platform mutex + + // No copying allowed + RWMutex(const RWMutex&); + void operator=(const RWMutex&); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + void Signal(); + void SignalAll(); + private: + pthread_cond_t cv_; + Mutex* mu_; +}; + +typedef pthread_once_t OnceType; +#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT +extern void InitOnce(OnceType* once, void (*initializer)()); + +inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#endif + + return false; +} + +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { +#ifdef SNAPPY + return snappy::GetUncompressedLength(input, length, result); +#else + return false; +#endif +} + +inline bool Snappy_Uncompress(const char* input, size_t length, + char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else + return false; +#endif +} + +inline bool 
Zlib_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits, + memLevel, opts.strategy); + if (st != Z_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (Bytef *)input; + _stream.avail_in = length; + + // Initialize the output size. + _stream.avail_out = length; + _stream.next_out = (Bytef *)&(*output)[0]; + + int old_sz =0, new_sz =0, new_sz_delta =0; + bool done = false; + while (!done) { + int st = deflate(&_stream, Z_FINISH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz_delta = (int)(output->size() * 0.2); + new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta); + output->resize(new_sz); + // Set more output. 
+ _stream.next_out = (Bytef *)&(*output)[old_sz]; + _stream.avail_out = new_sz - old_sz; + break; + case Z_BUF_ERROR: + default: + deflateEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + deflateEnd(&_stream); + return true; +#endif + return false; +} + +inline char* Zlib_Uncompress(const char* input_data, size_t input_length, + int* decompress_size, int windowBits = -14) { +#ifdef ZLIB + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + + // For raw inflate, the windowBits should be -8..-15. + // If windowBits is bigger than zero, it will use either zlib + // header or gzip header. Adding 32 to it will do automatic detection. + int st = inflateInit2(&_stream, + windowBits > 0 ? windowBits + 32 : windowBits); + if (st != Z_OK) { + return nullptr; + } + + _stream.next_in = (Bytef *)input_data; + _stream.avail_in = input_length; + + // Assume the decompressed data size will 5x of compressed size. + int output_len = input_length * 5; + char* output = new char[output_len]; + int old_sz = output_len; + + _stream.next_out = (Bytef *)output; + _stream.avail_out = output_len; + + char* tmp = nullptr; + int output_len_delta; + bool done = false; + + //while(_stream.next_in != nullptr && _stream.avail_in != 0) { + while (!done) { + int st = inflate(&_stream, Z_SYNC_FLUSH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len_delta = (int)(output_len * 0.2); + output_len += output_len_delta < 10 ? 10 : output_len_delta; + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. 
+ _stream.next_out = (Bytef *)(output + old_sz); + _stream.avail_out = output_len - old_sz; + break; + case Z_BUF_ERROR: + default: + delete[] output; + inflateEnd(&_stream); + return nullptr; + } + } + + *decompress_size = output_len - _stream.avail_out; + inflateEnd(&_stream); + return output; +#endif + + return nullptr; +} + +inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (char *)input; + _stream.avail_in = length; + + // Initialize the output size. + _stream.next_out = (char *)&(*output)[0]; + _stream.avail_out = length; + + int old_sz =0, new_sz =0; + while(_stream.next_in != nullptr && _stream.avail_in != 0) { + int st = BZ2_bzCompress(&_stream, BZ_FINISH); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_FINISH_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz = (int)(output->size() * 1.2); + output->resize(new_sz); + // Set more output. 
+ _stream.next_out = (char *)&(*output)[old_sz]; + _stream.avail_out = new_sz - old_sz; + break; + case BZ_SEQUENCE_ERROR: + default: + BZ2_bzCompressEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + BZ2_bzCompressEnd(&_stream); + return true; +#endif + return false; +} + +inline char* BZip2_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + int st = BZ2_bzDecompressInit(&_stream, 0, 0); + if (st != BZ_OK) { + return nullptr; + } + + _stream.next_in = (char *)input_data; + _stream.avail_in = input_length; + + // Assume the decompressed data size will be 5x of compressed size. + int output_len = input_length * 5; + char* output = new char[output_len]; + int old_sz = output_len; + + _stream.next_out = (char *)output; + _stream.avail_out = output_len; + + char* tmp = nullptr; + + while(_stream.next_in != nullptr && _stream.avail_in != 0) { + int st = BZ2_bzDecompress(&_stream); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len = (int)(output_len * 1.2); + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. 
+ _stream.next_out = (char *)(output + old_sz); + _stream.avail_out = output_len - old_sz; + break; + default: + delete[] output; + BZ2_bzDecompressEnd(&_stream); + return nullptr; + } + } + + *decompress_size = output_len - _stream.avail_out; + BZ2_bzDecompressEnd(&_stream); + return output; +#endif + return nullptr; +} + +inline bool LZ4_Compress(const CompressionOptions &opts, const char *input, + size_t length, ::std::string* output) { +#ifdef LZ4 + int compressBound = LZ4_compressBound(length); + output->resize(8 + compressBound); + char *p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + size_t outlen; + outlen = LZ4_compress_limitedOutput(input, p + 8, length, compressBound); + if (outlen == 0) { + return false; + } + output->resize(8 + outlen); + return true; +#endif + return false; +} + +inline char* LZ4_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { +#ifdef LZ4 + if (input_length < 8) { + return nullptr; + } + int output_len; + memcpy(&output_len, input_data, sizeof(output_len)); + char *output = new char[output_len]; + *decompress_size = LZ4_decompress_safe_partial( + input_data + 8, output, input_length - 8, output_len, output_len); + if (*decompress_size < 0) { + delete[] output; + return nullptr; + } + return output; +#endif + return nullptr; +} + +inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + int compressBound = LZ4_compressBound(length); + output->resize(8 + compressBound); + char *p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + size_t outlen; + outlen = LZ4_compressHC2_limitedOutput(input, p + 8, length, compressBound, + opts.level); + if (outlen == 0) { + return false; + } + output->resize(8 + outlen); + return true; +#endif + return false; +} + +inline bool GetHeapProfile(void (*func)(void *, const char *, int), void *arg) { + return false; +} + +#define CACHE_LINE_SIZE 64U + +} 
// namespace port +} // namespace rocksdb + +#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_ diff --git a/port/stack_trace.cc b/port/stack_trace.cc new file mode 100644 index 00000000..aa01fd0c --- /dev/null +++ b/port/stack_trace.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "util/stack_trace.h" + +#ifdef OS_LINUX + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +static const char* GetExecutableName() +{ + static char name[1024]; + + char link[1024]; + snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); + auto read = readlink(link, name, sizeof(name)); + if (-1 == read) { + return nullptr; + } else { + name[read] = 0; + return name; + } +} + +void PrintStack(int first_frames_to_skip) { + const int kMaxFrames = 100; + void *frames[kMaxFrames]; + + auto num_frames = backtrace(frames, kMaxFrames); + auto symbols = backtrace_symbols(frames, num_frames); + + auto executable = GetExecutableName(); + + for (int i = first_frames_to_skip; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i - first_frames_to_skip); + if (symbols) { + fprintf(stderr, "%s ", symbols[i]); + } + if (executable) { + // out source to addr2line, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + sprintf(cmd, "addr2line %p -e %s -f -C 2>&1", frames[i], executable); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } + } else { + fprintf(stderr, " %p", frames[i]); + } + fprintf(stderr, "\n"); + } +} + +static void StackTraceHandler(int sig) { + // reset to default handler + signal(sig, SIG_DFL); + 
fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // skip the top three signal handler related frames + PrintStack(3); + // re-signal to default handler (so we still get core dump if needed...) + raise(sig); +} + +void InstallStackTraceHandler() { + // just use the plain old signal as it's simple and sufficient + // for this use case + signal(SIGILL, StackTraceHandler); + signal(SIGSEGV, StackTraceHandler); + signal(SIGBUS, StackTraceHandler); + signal(SIGABRT, StackTraceHandler); + + printf("Installed stack trace handler for SIGILL SIGSEGV SIGBUS SIGABRT\n"); + +} + +} // namespace rocksdb + +#else // no-op for non-linux system for now + +namespace rocksdb { + +void InstallStackTraceHandler() {} +void PrintStack(int first_frames_to_skip) {} + +} + +#endif // OS_LINUX diff --git a/port/win/stdint.h b/port/win/stdint.h new file mode 100644 index 00000000..39edd0db --- /dev/null +++ b/port/win/stdint.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// MSVC didn't ship with this file until the 2010 version. + +#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_ +#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_ + +#if !defined(_MSC_VER) +#error This file should only be included when compiling with MSVC. +#endif + +// Define C99 equivalent types. +typedef signed char int8_t; +typedef signed short int16_t; +typedef signed int int32_t; +typedef signed long long int64_t; +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +#endif // STORAGE_LEVELDB_PORT_WIN_STDINT_H_ diff --git a/table/block.cc b/table/block.cc new file mode 100644 index 00000000..6a6751ca --- /dev/null +++ b/table/block.cc @@ -0,0 +1,307 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block.h" + +#include +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "table/block_hash_index.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace rocksdb { + +uint32_t Block::NumRestarts() const { + assert(size_ >= 2*sizeof(uint32_t)); + return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); +} + +Block::Block(const BlockContents& contents) + : data_(contents.data.data()), + size_(contents.data.size()), + owned_(contents.heap_allocated), + cachable_(contents.cachable), + compression_type_(contents.compression_type) { + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + } +} + +Block::~Block() { + if (owned_) { + delete[] data_; + } +} + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". +// +// If any errors are detected, returns nullptr. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). 
+static inline const char* DecodeEntry(const char* p, const char* limit, + uint32_t* shared, + uint32_t* non_shared, + uint32_t* value_length) { + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; + } + + if (static_cast(limit - p) < (*non_shared + *value_length)) { + return nullptr; + } + return p; +} + +class Block::Iter : public Iterator { + private: + const Comparator* const comparator_; + const char* const data_; // underlying block contents + uint32_t const restarts_; // Offset of restart array (list of fixed32) + uint32_t const num_restarts_; // Number of uint32_t entries in restart array + + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + uint32_t restart_index_; // Index of restart block in which current_ falls + std::string key_; + Slice value_; + Status status_; + BlockHashIndex* hash_index_; + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + // Return the offset in data_ just past the end of the current entry. 
+ inline uint32_t NextEntryOffset() const { + return (value_.data() + value_.size()) - data_; + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + public: + Iter(const Comparator* comparator, const char* data, uint32_t restarts, + uint32_t num_restarts, BlockHashIndex* hash_index) + : comparator_(comparator), + data_(data), + restarts_(restarts), + num_restarts_(num_restarts), + current_(restarts_), + restart_index_(num_restarts_), + hash_index_(hash_index) { + assert(num_restarts_ > 0); + } + + virtual bool Valid() const { return current_ < restarts_; } + virtual Status status() const { return status_; } + virtual Slice key() const { + assert(Valid()); + return key_; + } + virtual Slice value() const { + assert(Valid()); + return value_; + } + + virtual void Next() { + assert(Valid()); + ParseNextKey(); + } + + virtual void Prev() { + assert(Valid()); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + do { + // Loop until end of current entry hits the start of original entry + } while (ParseNextKey() && NextEntryOffset() < original); + } + + virtual void Seek(const Slice& target) { + uint32_t index = 0; + bool ok = hash_index_ ? 
HashSeek(target, &index) + : BinarySeek(target, 0, num_restarts_ - 1, &index); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + // Linear search (within restart block) for first key >= target + + while (true) { + if (!ParseNextKey() || Compare(key_, target) >= 0) { + return; + } + } + } + virtual void SeekToFirst() { + SeekToRestartPoint(0); + ParseNextKey(); + } + + virtual void SeekToLast() { + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } + } + + private: + void CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.clear(); + value_.clear(); + } + + bool ParseNextKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || key_.size() < shared) { + CorruptionError(); + return false; + } else { + key_.resize(shared); + key_.append(p, non_shared); + value_ = Slice(p + non_shared, value_length); + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + return true; + } + } + // Binary search in restart array to find the first restart point + // with a key >= target + bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index) { + assert(left <= right); + + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = + DecodeEntry(data_ + region_offset, data_ + restarts_, &shared, + &non_shared, 
&value_length); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + *index = left; + return true; + } + + bool HashSeek(const Slice& target, uint32_t* index) { + assert(hash_index_); + auto restart_index = hash_index_->GetRestartIndex(target); + if (restart_index == nullptr) { + current_ = restarts_; + return 0; + } + + // the elements in restart_array[index : index + num_blocks] + // are all with same prefix. We'll do binary search in that small range. + auto left = restart_index->first_index; + auto right = restart_index->first_index + restart_index->num_blocks - 1; + return BinarySeek(target, left, right, index); + } +}; + +Iterator* Block::NewIterator(const Comparator* cmp) { + if (size_ < 2*sizeof(uint32_t)) { + return NewErrorIterator(Status::Corruption("bad block contents")); + } + const uint32_t num_restarts = NumRestarts(); + if (num_restarts == 0) { + return NewEmptyIterator(); + } else { + return new Iter(cmp, data_, restart_offset_, num_restarts, + hash_index_.get()); + } +} + +void Block::SetBlockHashIndex(BlockHashIndex* hash_index) { + hash_index_.reset(hash_index); +} + +} // namespace rocksdb diff --git a/table/block.h b/table/block.h new file mode 100644 index 00000000..b363d62f --- /dev/null +++ b/table/block.h @@ -0,0 +1,61 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +struct BlockContents; +class Comparator; +class BlockHashIndex; + +class Block { + public: + // Initialize the block with the specified contents. + explicit Block(const BlockContents& contents); + + ~Block(); + + size_t size() const { return size_; } + const char* data() const { return data_; } + bool cachable() const { return cachable_; } + uint32_t NumRestarts() const; + CompressionType compression_type() const { return compression_type_; } + + // If hash index lookup is enabled and `use_hash_index` is true. This block + // will do hash lookup for the key prefix. + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just pass the target key. + Iterator* NewIterator(const Comparator* comparator); + void SetBlockHashIndex(BlockHashIndex* hash_index); + + private: + const char* data_; + size_t size_; + uint32_t restart_offset_; // Offset in data_ of restart array + bool owned_; // Block owns data_[] + bool cachable_; + CompressionType compression_type_; + std::unique_ptr hash_index_; + + // No copying allowed + Block(const Block&); + void operator=(const Block&); + + class Iter; +}; + +} // namespace rocksdb diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc new file mode 100644 index 00000000..17e9367a --- /dev/null +++ b/table/block_based_table_builder.cc @@ -0,0 +1,654 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based_table_builder.h" + +#include +#include +#include + +#include +#include + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" + +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +namespace { + +typedef BlockBasedTableOptions::IndexType IndexType; + +// The interface for building index. +// Instruction for adding a new concrete IndexBuilder: +// 1. Create a subclass instantiated from IndexBuilder. +// 2. Add a new entry associated with that subclass in TableOptions::IndexType. +// 3. Add a create function for the new subclass in CreateIndexBuilder. +// Note: we can devise more advanced design to simplify the process for adding +// new subclass, which will, on the other hand, increase the code complexity and +// catch unwanted attention from readers. Given that we won't add/change +// indexes frequently, it makes sense to just embrace a more straightforward +// design that just works. +class IndexBuilder { + public: + explicit IndexBuilder(const Comparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexBuilder() {} + + // Add a new index entry to index block. 
+ // To allow further optimization, we provide `last_key_in_current_block` and + // `first_key_in_next_block`, based on which the specific implementation can + // determine the best index key to be used for the index block. + // @last_key_in_current_block: this parameter maybe overridden with the value + // "substitute key". + // @first_key_in_next_block: it will be nullptr if the entry being added is + // the last one in the table + // + // REQUIRES: Finish() has not yet been called. + virtual void AddEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) = 0; + + // Inform the index builder that all entries has been written. Block builder + // may therefore perform any operation required for block finalization. + // + // REQUIRES: Finish() has not yet been called. + virtual Slice Finish() = 0; + + // Get the estimated size for index block. + virtual size_t EstimatedSize() const = 0; + + protected: + const Comparator* comparator_; +}; + +// This index builder builds space-efficient index block. +// +// Optimizations: +// 1. Made block's `block_restart_interval` to be 1, which will avoid linear +// search when doing index lookup. +// 2. Shorten the key length for index block. Other than honestly using the +// last key in the data block as the index key, we instead find a shortest +// substitute key that serves the same function. 
+class ShortenedIndexBuilder : public IndexBuilder { + public: + explicit ShortenedIndexBuilder(const Comparator* comparator) + : IndexBuilder(comparator), + index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} + + virtual void AddEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + if (first_key_in_next_block != nullptr) { + comparator_->FindShortestSeparator(last_key_in_current_block, + *first_key_in_next_block); + } else { + comparator_->FindShortSuccessor(last_key_in_current_block); + } + + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + index_block_builder_.Add(*last_key_in_current_block, handle_encoding); + } + + virtual Slice Finish() override { return index_block_builder_.Finish(); } + + virtual size_t EstimatedSize() const { + return index_block_builder_.CurrentSizeEstimate(); + } + + private: + BlockBuilder index_block_builder_; +}; + +// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like +// ShortenedIndexBuilder, but preserves the full key instead the substitude key. +class FullKeyIndexBuilder : public IndexBuilder { + public: + explicit FullKeyIndexBuilder(const Comparator* comparator) + : IndexBuilder(comparator), + index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} + + virtual void AddEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + index_block_builder_.Add(*last_key_in_current_block, handle_encoding); + } + + virtual Slice Finish() override { return index_block_builder_.Finish(); } + + virtual size_t EstimatedSize() const { + return index_block_builder_.CurrentSizeEstimate(); + } + + private: + BlockBuilder index_block_builder_; +}; + +// Create a index builder based on its type. 
+IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) { + switch (type) { + case BlockBasedTableOptions::kBinarySearch: { + return new ShortenedIndexBuilder(comparator); + } + case BlockBasedTableOptions::kHashSearch: { + return new ShortenedIndexBuilder(comparator); + } + default: { + assert(!"Do not recognize the index type "); + return nullptr; + } + } + // impossible. + assert(false); + return nullptr; +} + +bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { + // Check to see if compressed less than 12.5% + return compressed_size < raw_size - (raw_size / 8u); +} + +Slice CompressBlock(const Slice& raw, + const CompressionOptions& compression_options, + CompressionType* type, std::string* compressed_output) { + if (*type == kNoCompression) { + return raw; + } + + // Will return compressed block contents if (1) the compression method is + // supported in this platform and (2) the compression rate is "good enough". + switch (*type) { + case kSnappyCompression: + if (port::Snappy_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + case kZlibCompression: + if (port::Zlib_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + case kBZip2Compression: + if (port::BZip2_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. 
+ case kLZ4Compression: + if (port::LZ4_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + case kLZ4HCCompression: + if (port::LZ4HC_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + break; // fall back to no compression. + default: {} // Do not recognize this compression type + } + + // Compression method is not supported, or not good compression ratio, so just + // fall back to uncompressed form. + *type = kNoCompression; + return raw; +} + +} // anonymous namespace + +// kBlockBasedTableMagicNumber was picked by running +// echo http://code.google.com/p/leveldb/ | sha1sum +// and taking the leading 64 bits. +// Please note that kBlockBasedTableMagicNumber may also be accessed by +// other .cc files so it have to be explicitly declared with "extern". +extern const uint64_t kBlockBasedTableMagicNumber + = 0xdb4775248b80fb57ull; + +// A collector that collects properties of interest to block-based table. +// For now this class looks heavy-weight since we only write one additional +// property. +// But in the forseeable future, we will add more and more properties that are +// specific to block-based table. +class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector + : public TablePropertiesCollector { + public: + BlockBasedTablePropertiesCollector( + BlockBasedTableOptions::IndexType index_type) + : index_type_(index_type) {} + + virtual Status Add(const Slice& key, const Slice& value) { + // Intentionally left blank. Have no interest in collecting stats for + // individual key/value pairs. 
+ return Status::OK(); + } + + virtual Status Finish(UserCollectedProperties* properties) { + std::string val; + PutFixed32(&val, static_cast(index_type_)); + properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); + + return Status::OK(); + } + + // The name of the properties collector can be used for debugging purpose. + virtual const char* Name() const { + return "BlockBasedTablePropertiesCollector"; + } + + virtual UserCollectedProperties GetReadableProperties() const { + // Intentionally left blank. + return UserCollectedProperties(); + } + + private: + BlockBasedTableOptions::IndexType index_type_; +}; + +struct BlockBasedTableBuilder::Rep { + Options options; + const InternalKeyComparator& internal_comparator; + WritableFile* file; + uint64_t offset = 0; + Status status; + BlockBuilder data_block; + std::unique_ptr index_builder; + + std::string last_key; + CompressionType compression_type; + TableProperties props; + + bool closed = false; // Either Finish() or Abandon() has been called. + FilterBlockBuilder* filter_block; + char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr flush_block_policy; + + Rep(const Options& opt, const InternalKeyComparator& icomparator, + WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory, + CompressionType compression_type, IndexType index_block_type) + : options(opt), + internal_comparator(icomparator), + file(f), + data_block(options, &internal_comparator), + index_builder( + CreateIndexBuilder(index_block_type, &internal_comparator)), + compression_type(compression_type), + filter_block(opt.filter_policy == nullptr + ? 
nullptr + : new FilterBlockBuilder(opt, &internal_comparator)), + flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy( + options, data_block)) { + options.table_properties_collectors.push_back( + std::make_shared(index_block_type)); + } +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + const Options& options, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, WritableFile* file, + CompressionType compression_type) + : rep_(new Rep(options, internal_comparator, file, + table_options.flush_block_policy_factory.get(), + compression_type, table_options.index_type)) { + if (rep_->filter_block != nullptr) { + rep_->filter_block->StartBlock(0); + } + if (options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + options.block_cache_compressed.get(), file, + &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + assert(rep_->closed); // Catch errors where caller forgot to call Finish() + delete rep_->filter_block; + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->props.num_entries > 0) { + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + } + + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + Flush(); + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. 
+ if (ok()) { + r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle); + } + } + + if (r->filter_block != nullptr) { + r->filter_block->AddKey(key); + } + + r->last_key.assign(key.data(), key.size()); + r->data_block.Add(key, value); + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + + NotifyCollectTableCollectorsOnAdd( + key, + value, + r->options.table_properties_collectors, + r->options.info_log.get() + ); +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(!r->closed); + if (!ok()) return; + if (r->data_block.empty()) return; + WriteBlock(&r->data_block, &r->pending_handle); + if (ok()) { + r->status = r->file->Flush(); + } + if (r->filter_block != nullptr) { + r->filter_block->StartBlock(r->offset); + } + r->props.data_size = r->offset; + ++r->props.num_data_blocks; +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle) { + WriteBlock(block->Finish(), handle); + block->Reset(); +} + +void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, + BlockHandle* handle) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + + auto type = r->compression_type; + auto block_contents = + CompressBlock(raw_block_contents, r->options.compression_opts, &type, + &r->compressed_output); + WriteRawBlock(block_contents, type, handle); + r->compressed_output.clear(); +} + +void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle) { + Rep* r = rep_; + StopWatch sw(r->options.env, r->options.statistics.get(), + WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + uint32_t crc = 
crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type + EncodeFixed32(trailer+1, crc32c::Mask(crc)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->status = InsertBlockInCache(block_contents, type, handle); + } + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + } + } +} + +Status BlockBasedTableBuilder::status() const { + return rep_->status; +} + +static void DeleteCachedBlock(const Slice& key, void* value) { + Block* block = reinterpret_cast(value); + delete block; +} + +// +// Make a copy of the block contents and insert into compressed block cache +// +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle) { + Rep* r = rep_; + Cache* block_cache_compressed = r->options.block_cache_compressed.get(); + + if (type != kNoCompression && block_cache_compressed != nullptr) { + + Cache::Handle* cache_handle = nullptr; + size_t size = block_contents.size(); + + char* ubuf = new char[size]; // make a new copy + memcpy(ubuf, block_contents.data(), size); + + BlockContents results; + Slice sl(ubuf, size); + results.data = sl; + results.cachable = true; // XXX + results.heap_allocated = true; + results.compression_type = type; + + Block* block = new Block(results); + + // make cache key by appending the file offset to the cache prefix id + char* end = EncodeVarint64( + r->compressed_cache_key_prefix + + r->compressed_cache_key_prefix_size, + handle->offset()); + Slice key(r->compressed_cache_key_prefix, static_cast + (end - r->compressed_cache_key_prefix)); + + // Insert into compressed block cache. + cache_handle = block_cache_compressed->Insert(key, block, block->size(), + &DeleteCachedBlock); + block_cache_compressed->Release(cache_handle); + + // Invalidate OS cache. 
+ r->file->InvalidateCache(r->offset, size); + } + return Status::OK(); +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + bool empty_data_block = r->data_block.empty(); + Flush(); + assert(!r->closed); + r->closed = true; + + BlockHandle filter_block_handle, + metaindex_block_handle, + index_block_handle; + + // Write filter block + if (ok() && r->filter_block != nullptr) { + auto filter_contents = r->filter_block->Finish(); + r->props.filter_size = filter_contents.size(); + WriteRawBlock(filter_contents, kNoCompression, &filter_block_handle); + } + + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries here and flush them + // to storage after metaindex block is written. + if (ok() && !empty_data_block) { + r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */, + r->pending_handle); + } + + // Write meta blocks and metaindex block with the following order. + // 1. [meta block: filter] + // 2. [meta block: properties] + // 3. [metaindex block] + if (ok()) { + MetaIndexBuilder meta_index_builer; + + // Write filter block. + if (r->filter_block != nullptr) { + // Add mapping from ".Name" to location + // of filter data. + std::string key = BlockBasedTable::kFilterBlockPrefix; + key.append(r->options.filter_policy->Name()); + meta_index_builer.Add(key, filter_block_handle); + } + + // Write properties block. + { + PropertyBlockBuilder property_block_builder; + std::vector failed_user_prop_collectors; + r->props.filter_policy_name = r->options.filter_policy != nullptr ? 
+ r->options.filter_policy->Name() : ""; + r->props.index_size = + r->index_builder->EstimatedSize() + kBlockTrailerSize; + + // Add basic properties + property_block_builder.AddTableProperty(r->props); + + // Add use collected properties + NotifyCollectTableCollectorsOnFinish( + r->options.table_properties_collectors, + r->options.info_log.get(), + &property_block_builder + ); + + BlockHandle properties_block_handle; + WriteRawBlock( + property_block_builder.Finish(), + kNoCompression, + &properties_block_handle + ); + + meta_index_builer.Add(kPropertiesBlock, + properties_block_handle); + } // end of properties block writing + + WriteRawBlock( + meta_index_builer.Finish(), + kNoCompression, + &metaindex_block_handle + ); + } // meta blocks and metaindex block. + + // Write index block + if (ok()) { + WriteBlock(r->index_builder->Finish(), &index_block_handle); + } + + // Write footer + if (ok()) { + Footer footer(kBlockBasedTableMagicNumber); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(index_block_handle); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + r->status = r->file->Append(footer_encoding); + if (r->status.ok()) { + r->offset += footer_encoding.size(); + } + } + + // Print out the table stats + if (ok()) { + // user collected properties + std::string user_collected; + user_collected.reserve(1024); + for (auto collector : r->options.table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + user_collected.append(prop.first); + user_collected.append("="); + user_collected.append(prop.second); + user_collected.append("; "); + } + } + + Log( + r->options.info_log, + "Table was constructed:\n" + " [basic properties]: %s\n" + " [user collected properties]: %s", + r->props.ToString().c_str(), + user_collected.c_str() + ); + } + + return r->status; +} + +void BlockBasedTableBuilder::Abandon() { + Rep* r = rep_; + assert(!r->closed); + r->closed = true; +} + +uint64_t 
BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { + return rep_->offset; +} + +const std::string BlockBasedTable::kFilterBlockPrefix = + "filter."; + +} // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h new file mode 100644 index 00000000..5871427c --- /dev/null +++ b/table/block_based_table_builder.h @@ -0,0 +1,91 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/table_builder.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +struct BlockBasedTableOptions; + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + BlockBasedTableBuilder(const Options& options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. 
+ // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + private: + bool ok() const { return status().ok(); } + // Call block's Finish() method and then write the finalize block contents to + // file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle); + // Directly write block content to the file. + void WriteBlock(const Slice& block_contents, BlockHandle* handle); + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle); + Status InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); + struct Rep; + class BlockBasedTablePropertiesCollector; + Rep* rep_; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. 
+ // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + void operator=(const BlockBasedTableBuilder&) = delete; +}; + +} // namespace rocksdb diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc new file mode 100644 index 00000000..822adee2 --- /dev/null +++ b/table/block_based_table_factory.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + + +#include "table/block_based_table_factory.h" + +#include +#include +#include + +#include "rocksdb/flush_block_policy.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_reader.h" +#include "port/port.h" + +namespace rocksdb { + +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& table_options) + : table_options_(table_options) { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } +} + +Status BlockBasedTableFactory::NewTableReader( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const { + return BlockBasedTable::Open(options, soptions, table_options_, + internal_comparator, std::move(file), file_size, + table_reader); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const Options& options, const 
InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + auto table_builder = new BlockBasedTableBuilder( + options, table_options_, internal_comparator, file, compression_type); + + return table_builder; +} + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& table_options) { + return new BlockBasedTableFactory(table_options); +} + +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; + +} // namespace rocksdb diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h new file mode 100644 index 00000000..492349c5 --- /dev/null +++ b/table/block_based_table_factory.h @@ -0,0 +1,48 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class BlockBasedTableBuilder; + +class BlockBasedTableFactory : public TableFactory { + public: + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + + ~BlockBasedTableFactory() {} + + const char* Name() const override { return "BlockBasedTable"; } + + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const override; + + TableBuilder* NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const override; + + private: + BlockBasedTableOptions table_options_; +}; + +} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc new file mode 100644 index 00000000..ab56dc9b --- /dev/null +++ b/table/block_based_table_reader.cc @@ -0,0 +1,1119 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/block_based_table_reader.h" + +#include +#include + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" + +#include "table/block.h" +#include "table/filter_block.h" +#include "table/block_hash_index.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/two_level_iterator.h" + +#include "util/coding.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +extern uint64_t kBlockBasedTableMagicNumber; +using std::unique_ptr; + +typedef BlockBasedTable::IndexReader IndexReader; + +namespace { + +// The longest the prefix of the cache key used to identify blocks can be. +// We are using the fact that we know for Posix files the unique ID is three +// varints. +// For some reason, compiling for iOS complains that this variable is unused +const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) = + kMaxVarint64Length * 3 + 1; + +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// Set *didIO to true if didIO is not null. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +Status ReadBlockFromFile(RandomAccessFile* file, const ReadOptions& options, + const BlockHandle& handle, Block** result, Env* env, + bool* didIO = nullptr, bool do_uncompress = true) { + BlockContents contents; + Status s = + ReadBlockContents(file, options, handle, &contents, env, do_uncompress); + if (s.ok()) { + *result = new Block(contents); + } + + if (didIO != nullptr) { + *didIO = true; + } + return s; +} + +// Delete the resource that is held by the iterator. 
+template +void DeleteHeldResource(void* arg, void* ignored) { + delete reinterpret_cast(arg); +} + +// Delete the entry resided in the cache. +template +void DeleteCachedEntry(const Slice& key, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} + +// Release the cached entry and decrement its ref count. +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle); +} + +Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast(end - cache_key)); +} + +Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, + Statistics* statistics) { + auto cache_handle = block_cache->Lookup(key); + if (cache_handle != nullptr) { + BumpPerfCount(&perf_context.block_cache_hit_count); + // overall cache hit + RecordTick(statistics, BLOCK_CACHE_HIT); + // block-type specific cache hit + RecordTick(statistics, block_cache_hit_ticker); + } else { + // overall cache miss + RecordTick(statistics, BLOCK_CACHE_MISS); + // block-type specific cache miss + RecordTick(statistics, block_cache_miss_ticker); + } + + return cache_handle; +} + +} // namespace + +// -- IndexReader and its subclasses +// IndexReader is the interface that provide the functionality for index access. +class BlockBasedTable::IndexReader { + public: + explicit IndexReader(const Comparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexReader() {} + + // Create an iterator for index access. 
+ virtual Iterator* NewIterator() = 0; + + // The size of the index. + virtual size_t size() const = 0; + + protected: + const Comparator* comparator_; +}; + +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public IndexReader { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. + static Status Create(RandomAccessFile* file, const BlockHandle& index_handle, + Env* env, const Comparator* comparator, + IndexReader** index_reader) { + Block* index_block = nullptr; + auto s = ReadBlockFromFile(file, ReadOptions(), index_handle, + &index_block, env); + + if (s.ok()) { + *index_reader = new BinarySearchIndexReader(comparator, index_block); + } + + return s; + } + + virtual Iterator* NewIterator() override { + return index_block_->NewIterator(comparator_); + } + + virtual size_t size() const override { return index_block_->size(); } + + private: + BinarySearchIndexReader(const Comparator* comparator, Block* index_block) + : IndexReader(comparator), index_block_(index_block) { + assert(index_block_ != nullptr); + } + std::unique_ptr index_block_; +}; + +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +// @param data_iter_gen, equavalent to BlockBasedTable::NewIterator(). But that +// functions requires index to be initalized. To avoid this problem external +// caller will pass a function that can create the iterator over the entries +// without the table to be fully initialized. 
+class HashIndexReader : public IndexReader { + public: + static Status Create(RandomAccessFile* file, const BlockHandle& index_handle, + Env* env, const Comparator* comparator, + std::function data_iter_gen, + const SliceTransform* prefix_extractor, + IndexReader** index_reader) { + assert(prefix_extractor); + Block* index_block = nullptr; + auto s = + ReadBlockFromFile(file, ReadOptions(), index_handle, &index_block, env); + + if (!s.ok()) { + return s; + } + + *index_reader = new HashIndexReader(comparator, index_block); + std::unique_ptr index_iter(index_block->NewIterator(nullptr)); + std::unique_ptr data_iter( + data_iter_gen(index_block->NewIterator(nullptr))); + auto hash_index = CreateBlockHashIndex(index_iter.get(), data_iter.get(), + index_block->NumRestarts(), + comparator, prefix_extractor); + index_block->SetBlockHashIndex(hash_index); + return s; + } + + virtual Iterator* NewIterator() override { + return index_block_->NewIterator(comparator_); + } + + virtual size_t size() const override { return index_block_->size(); } + + private: + HashIndexReader(const Comparator* comparator, Block* index_block) + : IndexReader(comparator), index_block_(index_block) { + assert(index_block_ != nullptr); + } + std::unique_ptr index_block_; +}; + + +struct BlockBasedTable::Rep { + Rep(const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator) + : soptions(storage_options), internal_comparator(internal_comparator) {} + + Options options; + const EnvOptions& soptions; + const InternalKeyComparator& internal_comparator; + Status status; + unique_ptr file; + char cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t cache_key_prefix_size = 0; + char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size = 0; + + // Handle to metaindex_block: saved from footer + BlockHandle metaindex_handle; + // Handle to index: saved from footer + BlockHandle index_handle; + // index_reader and filter will be populated 
and used only when + // options.block_cache is nullptr; otherwise we will get the index block via + // the block cache. + unique_ptr index_reader; + unique_ptr filter; + + std::shared_ptr table_properties; + BlockBasedTableOptions::IndexType index_type; + // TODO(kailiu) It is very ugly to use internal key in table, since table + // module should not be relying on db module. However to make things easier + // and compatible with existing code, we introduce a wrapper that allows + // block to extract prefix without knowing if a key is internal or not. + unique_ptr internal_prefix_transform; +}; + +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + +// CachableEntry represents the entries that *may* be fetched from block cache. +// field `value` is the item we want to get. +// field `cache_handle` is the cache handle to the block cache. If the value +// was not read from cache, `cache_handle` will be nullptr. +template +struct BlockBasedTable::CachableEntry { + CachableEntry(TValue* value, Cache::Handle* cache_handle) + : value(value) + , cache_handle(cache_handle) { + } + CachableEntry(): CachableEntry(nullptr, nullptr) { } + void Release(Cache* cache) { + if (cache_handle) { + cache->Release(cache_handle); + value = nullptr; + cache_handle = nullptr; + } + } + + TValue* value = nullptr; + // if the entry is from the cache, cache_handle will be populated. + Cache::Handle* cache_handle = nullptr; +}; + +// Helper function to setup the cache key's prefix for the Table. 
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->options.block_cache != nullptr) { + GenerateCachePrefix(rep->options.block_cache.get(), rep->file.get(), + &rep->cache_key_prefix[0], + &rep->cache_key_prefix_size); + } + if (rep->options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->options.block_cache_compressed.get(), + rep->file.get(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, + RandomAccessFile* file, char* buffer, size_t* size) { + + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (*size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, + WritableFile* file, char* buffer, size_t* size) { + + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (*size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table_reader) { + table_reader->reset(); + + Footer footer(kBlockBasedTableMagicNumber); + auto s = ReadFooterFromFile(file.get(), file_size, &footer); + if (!s.ok()) return s; + + // We've successfully read the footer and the index block: we're + // ready to serve requests. 
+ Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator); + rep->options = options; + rep->file = std::move(file); + rep->metaindex_handle = footer.metaindex_handle(); + rep->index_handle = footer.index_handle(); + rep->index_type = table_options.index_type; + SetupCacheKeyPrefix(rep); + unique_ptr new_table(new BlockBasedTable(rep)); + + // Read meta index + std::unique_ptr meta; + std::unique_ptr meta_iter; + s = ReadMetaBlock(rep, &meta, &meta_iter); + + // Read the properties + bool found_properties_block = true; + meta_iter->Seek(kPropertiesBlock); + if (meta_iter->status().ok() && + (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlock)) { + meta_iter->Seek(kPropertiesBlockOldName); + if (meta_iter->status().ok() && + (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlockOldName)) { + found_properties_block = false; + Log(WARN, rep->options.info_log, + "Cannot find Properties block from file."); + } + } + + if (found_properties_block) { + s = meta_iter->status(); + TableProperties* table_properties = nullptr; + if (s.ok()) { + s = ReadProperties(meta_iter->value(), rep->file.get(), rep->options.env, + rep->options.info_log.get(), &table_properties); + } + + if (!s.ok()) { + auto err_msg = + "[Warning] Encountered error while reading data from properties " + "block " + s.ToString(); + Log(rep->options.info_log, "%s", err_msg.c_str()); + } else { + rep->table_properties.reset(table_properties); + } + } + + // Will use block cache for index/filter blocks access? 
+ if (options.block_cache && table_options.cache_index_and_filter_blocks) { + // Hack: Call NewIndexIterator() to implicitly add index to the block_cache + unique_ptr iter(new_table->NewIndexIterator(ReadOptions())); + s = iter->status(); + + if (s.ok()) { + // Hack: Call GetFilter() to implicitly add filter to the block_cache + auto filter_entry = new_table->GetFilter(); + filter_entry.Release(options.block_cache.get()); + } + } else { + // If we don't use block cache for index/filter blocks access, we'll + // pre-load these blocks, which will kept in member variables in Rep + // and with a same life-time as this table object. + IndexReader* index_reader = nullptr; + // TODO: we never really verify check sum for index block + s = new_table->CreateIndexReader(&index_reader); + + if (s.ok()) { + rep->index_reader.reset(index_reader); + + // Set filter block + if (rep->options.filter_policy) { + std::string key = kFilterBlockPrefix; + key.append(rep->options.filter_policy->Name()); + meta_iter->Seek(key); + + if (meta_iter->Valid() && meta_iter->key() == Slice(key)) { + rep->filter.reset(ReadFilter(meta_iter->value(), rep)); + } + } + } else { + delete index_reader; + } + } + + if (s.ok()) { + *table_reader = std::move(new_table); + } + + return s; +} + +void BlockBasedTable::SetupForCompaction() { + switch (rep_->options.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->Hint(RandomAccessFile::NORMAL); + break; + case Options::SEQUENTIAL: + rep_->file->Hint(RandomAccessFile::SEQUENTIAL); + break; + case Options::WILLNEED: + rep_->file->Hint(RandomAccessFile::WILLNEED); + break; + default: + assert(false); + } + compaction_optimized_ = true; +} + +std::shared_ptr BlockBasedTable::GetTableProperties() + const { + return rep_->table_properties; +} + +// Load the meta-block from the file. On success, return the loaded meta block +// and its iterator. 
+Status BlockBasedTable::ReadMetaBlock( + Rep* rep, + std::unique_ptr* meta_block, + std::unique_ptr* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + // TODO: we never really verify check sum for meta index block + Block* meta = nullptr; + Status s = ReadBlockFromFile( + rep->file.get(), + ReadOptions(), + rep->metaindex_handle, + &meta, + rep->options.env); + + if (!s.ok()) { + auto err_msg = + "[Warning] Encountered error while reading data from properties" + "block " + s.ToString(); + Log(rep->options.info_log, "%s", err_msg.c_str()); + } + if (!s.ok()) { + delete meta; + return s; + } + + meta_block->reset(meta); + // meta block uses bytewise comparator. + iter->reset(meta->NewIterator(BytewiseComparator())); + return Status::OK(); +} + +Status BlockBasedTable::GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block) { + Status s; + Block* compressed_block = nullptr; + Cache::Handle* block_cache_compressed_handle = nullptr; + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + block->cache_handle = + GetEntryFromCache(block_cache, block_cache_key, BLOCK_CACHE_DATA_MISS, + BLOCK_CACHE_DATA_HIT, statistics); + if (block->cache_handle != nullptr) { + block->value = + reinterpret_cast(block_cache->Value(block->cache_handle)); + return s; + } + } + + // If not found, search from the compressed block cache. 
+ assert(block->cache_handle == nullptr && block->value == nullptr); + + if (block_cache_compressed == nullptr) { + return s; + } + + assert(!compressed_block_cache_key.empty()); + block_cache_compressed_handle = + block_cache_compressed->Lookup(compressed_block_cache_key); + // if we found in the compressed cache, then uncompress and insert into + // uncompressed cache + if (block_cache_compressed_handle == nullptr) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + return s; + } + + // found compressed block + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + compressed_block = reinterpret_cast( + block_cache_compressed->Value(block_cache_compressed_handle)); + assert(compressed_block->compression_type() != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + s = UncompressBlockContents(compressed_block->data(), + compressed_block->size(), &contents); + + // Insert uncompressed block into block cache + if (s.ok()) { + block->value = new Block(contents); // uncompressed block + assert(block->value->compression_type() == kNoCompression); + if (block_cache != nullptr && block->value->cachable() && + read_options.fill_cache) { + block->cache_handle = + block_cache->Insert(block_cache_key, block->value, + block->value->size(), &DeleteCachedEntry); + assert(reinterpret_cast( + block_cache->Value(block->cache_handle)) == block->value); + } + } + + // Release hold on compressed cache entry + block_cache_compressed->Release(block_cache_compressed_handle); + return s; +} + +Status BlockBasedTable::PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, Statistics* statistics, + CachableEntry* block, Block* raw_block) { + assert(raw_block->compression_type() == kNoCompression || + block_cache_compressed != nullptr); + + Status s; + // Retrieve the uncompressed contents into a new buffer 
+ BlockContents contents; + if (raw_block->compression_type() != kNoCompression) { + s = UncompressBlockContents(raw_block->data(), raw_block->size(), + &contents); + } + if (!s.ok()) { + delete raw_block; + return s; + } + + if (raw_block->compression_type() != kNoCompression) { + block->value = new Block(contents); // uncompressed block + } else { + block->value = raw_block; + raw_block = nullptr; + } + + // Insert compressed block into compressed block cache. + // Release the hold on the compressed cache entry immediately. + if (block_cache_compressed != nullptr && raw_block != nullptr && + raw_block->cachable()) { + auto cache_handle = block_cache_compressed->Insert( + compressed_block_cache_key, raw_block, raw_block->size(), + &DeleteCachedEntry); + block_cache_compressed->Release(cache_handle); + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + // Avoid the following code to delete this cached block. + raw_block = nullptr; + } + delete raw_block; + + // insert into uncompressed block cache + assert((block->value->compression_type() == kNoCompression)); + if (block_cache != nullptr && block->value->cachable()) { + block->cache_handle = + block_cache->Insert(block_cache_key, block->value, block->value->size(), + &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); + assert(reinterpret_cast(block_cache->Value(block->cache_handle)) == + block->value); + } + + return s; +} + +FilterBlockReader* BlockBasedTable::ReadFilter ( + const Slice& filter_handle_value, + BlockBasedTable::Rep* rep, + size_t* filter_size) { + Slice v = filter_handle_value; + BlockHandle filter_handle; + if (!filter_handle.DecodeFrom(&v).ok()) { + return nullptr; + } + + // TODO: We might want to unify with ReadBlockFromFile() if we start + // requiring checksum verification in Table::Open. 
+ ReadOptions opt; + BlockContents block; + if (!ReadBlockContents(rep->file.get(), opt, filter_handle, &block, + rep->options.env, false).ok()) { + return nullptr; + } + + if (filter_size) { + *filter_size = block.data.size(); + } + + return new FilterBlockReader( + rep->options, block.data, block.heap_allocated); +} + +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +Iterator* BlockBasedTable::DataBlockReader(void* arg, + const ReadOptions& options, + const Slice& index_value, + bool* didIO, bool for_compaction) { + const bool no_io = (options.read_tier == kBlockCacheTier); + BlockBasedTable* table = reinterpret_cast(arg); + Cache* block_cache = table->rep_->options.block_cache.get(); + Cache* block_cache_compressed = table->rep_->options. + block_cache_compressed.get(); + CachableEntry block; + + BlockHandle handle; + Slice input = index_value; + // We intentionally allow extra stuff in index_value so that we + // can add more features in the future. + Status s = handle.DecodeFrom(&input); + + if (!s.ok()) { + return NewErrorIterator(s); + } + + // If either block cache is enabled, we'll try to read from it. 
+ if (block_cache != nullptr || block_cache_compressed != nullptr) { + Statistics* statistics = table->rep_->options.statistics.get(); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key, /* key to the block cache */ + ckey /* key to the compressed block cache */; + + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey(table->rep_->cache_key_prefix, + table->rep_->cache_key_prefix_size, handle, cache_key); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey(table->rep_->compressed_cache_key_prefix, + table->rep_->compressed_cache_key_prefix_size, handle, + compressed_cache_key); + } + + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + statistics, options, &block); + + if (block.value == nullptr && !no_io && options.fill_cache) { + Histograms histogram = for_compaction ? + READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + Block* raw_block = nullptr; + { + StopWatch sw(table->rep_->options.env, statistics, histogram); + s = ReadBlockFromFile(table->rep_->file.get(), options, handle, + &raw_block, table->rep_->options.env, didIO, + block_cache_compressed == nullptr); + } + + if (s.ok()) { + s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, + options, statistics, &block, raw_block); + } + } + } + + // Didn't get any data from block caches. 
+ if (block.value == nullptr) { + if (no_io) { + // Could not read from block_cache and can't do IO + return NewErrorIterator(Status::Incomplete("no blocking io")); + } + s = ReadBlockFromFile(table->rep_->file.get(), options, handle, + &block.value, table->rep_->options.env, didIO); + } + + Iterator* iter; + if (block.value != nullptr) { + iter = block.value->NewIterator(&table->rep_->internal_comparator); + if (block.cache_handle != nullptr) { + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, + block.cache_handle); + } else { + iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); + } + } else { + iter = NewErrorIterator(s); + } + return iter; +} + +BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( + bool no_io) const { + // filter pre-populated + if (rep_->filter != nullptr) { + return {rep_->filter.get(), nullptr /* cache handle */}; + } + + if (rep_->options.filter_policy == nullptr /* do not use filter at all */ || + rep_->options.block_cache == nullptr /* no block cache at all */) { + return {nullptr /* filter */, nullptr /* cache handle */}; + } + + // Fetching from the cache + Cache* block_cache = rep_->options.block_cache.get(); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = GetCacheKey( + rep_->cache_key_prefix, + rep_->cache_key_prefix_size, + rep_->metaindex_handle, + cache_key + ); + + Statistics* statistics = rep_->options.statistics.get(); + auto cache_handle = + GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, + BLOCK_CACHE_FILTER_HIT, statistics); + + FilterBlockReader* filter = nullptr; + if (cache_handle != nullptr) { + filter = reinterpret_cast( + block_cache->Value(cache_handle)); + } else if (no_io) { + // Do not invoke any io. 
+ return CachableEntry(); + } else { + size_t filter_size = 0; + std::unique_ptr meta; + std::unique_ptr iter; + auto s = ReadMetaBlock(rep_, &meta, &iter); + + if (s.ok()) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(rep_->options.filter_policy->Name()); + iter->Seek(filter_block_key); + + if (iter->Valid() && iter->key() == Slice(filter_block_key)) { + filter = ReadFilter(iter->value(), rep_, &filter_size); + assert(filter); + assert(filter_size > 0); + + cache_handle = block_cache->Insert( + key, filter, filter_size, &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); + } + } + } + + return { filter, cache_handle }; +} + +Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) { + // index reader has already been pre-populated. + if (rep_->index_reader) { + return rep_->index_reader->NewIterator(); + } + + bool no_io = read_options.read_tier == kBlockCacheTier; + Cache* block_cache = rep_->options.block_cache.get(); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->index_handle, cache_key); + Statistics* statistics = rep_->options.statistics.get(); + auto cache_handle = + GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, + BLOCK_CACHE_INDEX_HIT, statistics); + + if (cache_handle == nullptr && no_io) { + return NewErrorIterator(Status::Incomplete("no blocking io")); + } + + IndexReader* index_reader = nullptr; + if (cache_handle != nullptr) { + index_reader = + reinterpret_cast(block_cache->Value(cache_handle)); + } else { + // Create index reader and put it in the cache. + Status s; + s = CreateIndexReader(&index_reader); + + if (!s.ok()) { + // make sure if something goes wrong, index_reader shall remain intact. 
+ assert(index_reader == nullptr); + return NewErrorIterator(s); + } + + cache_handle = block_cache->Insert(key, index_reader, index_reader->size(), + &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); + } + + assert(cache_handle); + auto iter = index_reader->NewIterator(); + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); + + return iter; +} + +Iterator* BlockBasedTable::DataBlockReader( + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& icomparator, const Slice& index_value, + bool for_compaction) { + return DataBlockReader(arg, options, index_value, nullptr, for_compaction); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in Options.filter_policy. In particular, we +// require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// Otherwise, this method guarantees no I/O will be incurred. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. +bool BlockBasedTable::PrefixMayMatch(const Slice& internal_prefix) { + bool may_match = true; + Status s; + + if (!rep_->options.filter_policy) { + return true; + } + + // To prevent any io operation in this method, we set `read_tier` to make + // sure we always read index or filter only when they have already been + // loaded to memory. 
+ ReadOptions no_io_read_options; + no_io_read_options.read_tier = kBlockCacheTier; + unique_ptr iiter(NewIndexIterator(no_io_read_options)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if (ExtractUserKey(iiter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only one which could potentially contain the prefix. 
+ Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + auto filter_entry = GetFilter(true /* no io */); + may_match = + filter_entry.value == nullptr || + filter_entry.value->PrefixMayMatch(handle.offset(), internal_prefix); + filter_entry.Release(rep_->options.block_cache.get()); + } + + Statistics* statistics = rep_->options.statistics.get(); + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } + + return may_match; +} + +Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) { + if (options.prefix) { + InternalKey internal_prefix(*options.prefix, 0, kTypeValue); + if (!PrefixMayMatch(internal_prefix.Encode())) { + // nothing in this file can match the prefix, so we should not + // bother doing I/O to this file when iterating. + return NewEmptyIterator(); + } + } + + return NewTwoLevelIterator(NewIndexIterator(options), + &BlockBasedTable::DataBlockReader, + const_cast(this), options, + rep_->soptions, rep_->internal_comparator); +} + +Status BlockBasedTable::Get( + const ReadOptions& read_options, const Slice& key, void* handle_context, + bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context)) { + Status s; + Iterator* iiter = NewIndexIterator(read_options); + auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); + FilterBlockReader* filter = filter_entry.value; + bool done = false; + for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + Slice handle_value = iiter->value(); + + BlockHandle handle; + bool may_not_exist_in_filter = + filter != nullptr && + handle.DecodeFrom(&handle_value).ok() && + !filter->KeyMayMatch(handle.offset(), key); + + if (may_not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. 
If a user key cannot + // cross one data block, we should be fine. + RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); + break; + } else { + bool didIO = false; + unique_ptr block_iter( + DataBlockReader(this, read_options, iiter->value(), &didIO)); + + if (read_options.read_tier && block_iter->status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for whether + // we can guarantee the key is not there when "no_io" is set + (*mark_key_may_exist_handler)(handle_context); + break; + } + + // Call the *saver function on each entry/block until it returns false + for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(block_iter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!(*result_handler)(handle_context, parsed_key, block_iter->value(), + didIO)) { + done = true; + break; + } + } + s = block_iter->status(); + } + } + + filter_entry.Release(rep_->options.block_cache.get()); + if (s.ok()) { + s = iiter->status(); + } + delete iiter; + return s; +} + +namespace { +bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value, + bool didIO) { + *reinterpret_cast(arg) = didIO; + return false; +} +} // namespace + +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + // We use Get() as it has logic that checks whether we read the + // block from the disk or not. + bool didIO = false; + Status s = Get(options, key, &didIO, SaveDidIO); + assert(s.ok()); + return !didIO; +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) { + // Some old version of block-based tables don't have index type present in + // table properties. 
If that's the case we can safely use the kBinarySearch. + auto index_type = BlockBasedTableOptions::kBinarySearch; + if (rep_->table_properties) { + auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + index_type = static_cast( + DecodeFixed32(pos->second.c_str())); + } + } + + auto file = rep_->file.get(); + const auto& index_handle = rep_->index_handle; + auto env = rep_->options.env; + auto comparator = &rep_->internal_comparator; + + switch (index_type) { + case BlockBasedTableOptions::kBinarySearch: { + return BinarySearchIndexReader::Create(file, index_handle, env, + comparator, index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefix correctly. + rep_->internal_prefix_transform.reset( + new InternalKeySliceTransform(rep_->options.prefix_extractor.get())); + return HashIndexReader::Create( + file, index_handle, env, comparator, + [&](Iterator* index_iter) { + return NewTwoLevelIterator( + index_iter, &BlockBasedTable::DataBlockReader, + const_cast(this), ReadOptions(), + rep_->soptions, rep_->internal_comparator); + }, + rep_->internal_prefix_transform.get(), index_reader); + } + default: { + std::string error_message = + "Unrecognized index type: " + std::to_string(rep_->index_type); + // equivalent to assert(false), but more informative. 
+ assert(!error_message.c_str()); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { + unique_ptr index_iter(NewIndexIterator(ReadOptions())); + + index_iter->Seek(key); + uint64_t result; + if (index_iter->Valid()) { + BlockHandle handle; + Slice input = index_iter->value(); + Status s = handle.DecodeFrom(&input); + if (s.ok()) { + result = handle.offset(); + } else { + // Strange: we can't decode the block handle in the index block. + // We'll just return the offset of the metaindex block, which is + // close to the whole file size for this case. + result = rep_->metaindex_handle.offset(); + } + } else { + // key is past the last key in the file. If table_properties is not + // available, approximate the offset by returning the offset of the + // metaindex block (which is right near the end of the file). + result = 0; + if (rep_->table_properties) { + result = rep_->table_properties->data_size; + } + // table_properties is not present in the table. + if (result == 0) { + result = rep_->metaindex_handle.offset(); + } + } + return result; +} + +bool BlockBasedTable::TEST_filter_block_preloaded() const { + return rep_->filter != nullptr; +} + +bool BlockBasedTable::TEST_index_reader_preloaded() const { + return rep_->index_reader != nullptr; +} + +} // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h new file mode 100644 index 00000000..5cff9ce6 --- /dev/null +++ b/table/block_based_table_reader.h @@ -0,0 +1,206 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/table_reader.h" +#include "util/coding.h" + +namespace rocksdb { + +class Block; +class BlockHandle; +class Cache; +class FilterBlockReader; +class Footer; +class InternalKeyComparator; +class Iterator; +class RandomAccessFile; +class TableCache; +class TableReader; +class WritableFile; +struct BlockBasedTableOptions; +struct EnvOptions; +struct Options; +struct ReadOptions; + +using std::unique_ptr; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. +class BlockBasedTable : public TableReader { + public: + static const std::string kFilterBlockPrefix; + + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table_reader" to the newly opened + // table. The client should delete "*table_reader" when no longer needed. + // If there was an error while initializing the table, sets "*table_reader" + // to nullptr and returns a non-ok status. + // + // *file must remain live while this Table is in use. + static Status Open(const Options& db_options, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader); + + bool PrefixMayMatch(const Slice& internal_prefix) override; + + // Returns a new iterator over the table contents. 
+ // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + Iterator* NewIterator(const ReadOptions&) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + void* handle_context, + bool (*result_handler)(void* handle_context, + const ParsedInternalKey& k, const Slice& v, + bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = + nullptr) override; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key) override; + + // Returns true if the block for the specified key is in cache. + // REQUIRES: key is in this table. + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + void SetupForCompaction() override; + + std::shared_ptr GetTableProperties() const override; + + ~BlockBasedTable(); + + bool TEST_filter_block_preloaded() const; + bool TEST_index_reader_preloaded() const; + // Implementation of IndexReader will be exposed to internal cc file only. 
+ class IndexReader; + + private: + template + struct CachableEntry; + + struct Rep; + Rep* rep_; + bool compaction_optimized_; + + static Iterator* DataBlockReader(void*, const ReadOptions&, + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); + + static Iterator* DataBlockReader(void*, const ReadOptions&, const Slice&, + bool* didIO, bool for_compaction = false); + + // For the following two functions: + // if `no_io == true`, we will not try to read filter/index from sst file + // were they not present in cache yet. + CachableEntry GetFilter(bool no_io = false) const; + + // Get the iterator from the index reader. + // + // Note: ErrorIterator with Status::Incomplete shall be returned if all the + // following conditions are met: + // 1. We enabled table_options.cache_index_and_filter_blocks. + // 2. index is not present in block cache. + // 3. We disallowed any io to be performed, that is, read_options == + // kBlockCacheTier + Iterator* NewIndexIterator(const ReadOptions& read_options); + + // Read block cache from block caches (if set): block_cache and + // block_cache_compressed. + // On success, Status::OK with be returned and @block will be populated with + // pointer to the block as well as its block handle. + static Status GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block); + // Put a raw block (maybe compressed) to the corresponding block caches. + // This method will perform decompression against raw_block if needed and then + // populate the block caches. + // On success, Status::OK will be returned; also @block will be populated with + // uncompressed block and its cache handle. + // + // REQUIRES: raw_block is heap-allocated. 
PutDataBlockToCache() will be + // responsible for releasing its memory if error occurs. + static Status PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, Statistics* statistics, + CachableEntry* block, Block* raw_block); + + // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found + // after a call to Seek(key), until handle_result returns false. + // May not make such a call if filter policy says that key is not present. + friend class TableCache; + friend class BlockBasedTableBuilder; + + void ReadMeta(const Footer& footer); + void ReadFilter(const Slice& filter_handle_value); + Status CreateIndexReader(IndexReader** index_reader); + + // Read the meta block from sst. + static Status ReadMetaBlock( + Rep* rep, + std::unique_ptr* meta_block, + std::unique_ptr* iter); + + // Create the filter from the filter block. + static FilterBlockReader* ReadFilter( + const Slice& filter_handle_value, + Rep* rep, + size_t* filter_size = nullptr); + + static void SetupCacheKeyPrefix(Rep* rep); + + explicit BlockBasedTable(Rep* rep) + : rep_(rep), compaction_optimized_(false) {} + + // Generate a cache key prefix from the file + static void GenerateCachePrefix(Cache* cc, + RandomAccessFile* file, char* buffer, size_t* size); + static void GenerateCachePrefix(Cache* cc, + WritableFile* file, char* buffer, size_t* size); + + // The longest prefix of the cache key used to identify blocks. + // For Posix files the unique ID is three varints. + static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; + + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; + +// Backward compatible properties block name. Limited in block based +// table. 
+extern const std::string kPropertiesBlockOldName; + +} // namespace rocksdb diff --git a/table/block_builder.cc b/table/block_builder.cc new file mode 100644 index 00000000..f812dbae --- /dev/null +++ b/table/block_builder.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. 
+ +#include "table/block_builder.h" + +#include +#include +#include "rocksdb/comparator.h" +#include "db/dbformat.h" +#include "util/coding.h" + +namespace rocksdb { + +BlockBuilder::BlockBuilder(int block_restart_interval, + const Comparator* comparator) + : block_restart_interval_(block_restart_interval), + comparator_(comparator), + restarts_(), + counter_(0), + finished_(false) { + assert(block_restart_interval_ >= 1); + restarts_.push_back(0); // First restart point is at offset 0 +} + +BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator) + : BlockBuilder(options.block_restart_interval, comparator) {} + +void BlockBuilder::Reset() { + buffer_.clear(); + restarts_.clear(); + restarts_.push_back(0); // First restart point is at offset 0 + counter_ = 0; + finished_ = false; + last_key_.clear(); +} + +size_t BlockBuilder::CurrentSizeEstimate() const { + return (buffer_.size() + // Raw data buffer + restarts_.size() * sizeof(uint32_t) + // Restart array + sizeof(uint32_t)); // Restart array length +} + +size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) + const { + size_t estimate = CurrentSizeEstimate(); + estimate += key.size() + value.size(); + if (counter_ >= block_restart_interval_) { + estimate += sizeof(uint32_t); // a new restart entry. + } + + estimate += sizeof(int32_t); // varint for shared prefix length. + estimate += VarintLength(key.size()); // varint for key length. + estimate += VarintLength(value.size()); // varint for value length. 
+ + return estimate; +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (size_t i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + PutFixed32(&buffer_, restarts_.size()); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value) { + Slice last_key_piece(last_key_); + assert(!finished_); + assert(counter_ <= block_restart_interval_); + assert(buffer_.empty() // No values yet? + || comparator_->Compare(key, last_key_piece) > 0); + size_t shared = 0; + if (counter_ < block_restart_interval_) { + // See how much sharing to do with previous string + const size_t min_length = std::min(last_key_piece.size(), key.size()); + while ((shared < min_length) && (last_key_piece[shared] == key[shared])) { + shared++; + } + } else { + // Restart compression + restarts_.push_back(buffer_.size()); + counter_ = 0; + } + const size_t non_shared = key.size() - shared; + + // Add "" to buffer_ + PutVarint32(&buffer_, shared); + PutVarint32(&buffer_, non_shared); + PutVarint32(&buffer_, value.size()); + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + buffer_.append(value.data(), value.size()); + + // Update state + last_key_.resize(shared); + last_key_.append(key.data() + shared, non_shared); + assert(Slice(last_key_) == key); + counter_++; +} + +} // namespace rocksdb diff --git a/table/block_builder.h b/table/block_builder.h new file mode 100644 index 00000000..ed2f290f --- /dev/null +++ b/table/block_builder.h @@ -0,0 +1,65 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include +#include "rocksdb/slice.h" + +namespace rocksdb { + +struct Options; +class Comparator; + +class BlockBuilder { + public: + BlockBuilder(int block_builder, const Comparator* comparator); + explicit BlockBuilder(const Options& options, const Comparator* comparator); + + // Reset the contents as if the BlockBuilder was just constructed. + void Reset(); + + // REQUIRES: Finish() has not been callled since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + size_t CurrentSizeEstimate() const; + + // Returns an estimated block size after appending key and value. + size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { + return buffer_.empty(); + } + + private: + const int block_restart_interval_; + const Comparator* comparator_; + + std::string buffer_; // Destination buffer + std::vector restarts_; // Restart points + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? 
+ std::string last_key_; + + // No copying allowed + BlockBuilder(const BlockBuilder&); + void operator=(const BlockBuilder&); +}; + +} // namespace rocksdb diff --git a/table/block_hash_index.cc b/table/block_hash_index.cc new file mode 100644 index 00000000..0c9674c9 --- /dev/null +++ b/table/block_hash_index.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "table/block_hash_index.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter, + const uint32_t num_restarts, + const Comparator* comparator, + const SliceTransform* hash_key_extractor) { + assert(hash_key_extractor); + auto hash_index = new BlockHashIndex(hash_key_extractor); + uint64_t current_restart_index = 0; + + std::string pending_entry_prefix; + // pending_block_num == 0 also implies there is no entry inserted at all. + uint32_t pending_block_num = 0; + uint32_t pending_entry_index = 0; + + // scan all the entries and create a hash index based on their prefixes. + data_iter->SeekToFirst(); + for (index_iter->SeekToFirst(); + index_iter->Valid() && current_restart_index < num_restarts; + index_iter->Next()) { + Slice last_key_in_block = index_iter->key(); + assert(data_iter->Valid() && data_iter->status().ok()); + + // scan through all entries within a data block. 
+ while (data_iter->Valid() && + comparator->Compare(data_iter->key(), last_key_in_block) <= 0) { + auto key_prefix = hash_key_extractor->Transform(data_iter->key()); + bool is_first_entry = pending_block_num == 0; + + // Keys may share the prefix + if (is_first_entry || pending_entry_prefix != key_prefix) { + if (!is_first_entry) { + bool succeeded = hash_index->Add( + pending_entry_prefix, pending_entry_index, pending_block_num); + if (!succeeded) { + delete hash_index; + return nullptr; + } + } + + // update the status. + // needs a hard copy otherwise the underlying data changes all the time. + pending_entry_prefix = key_prefix.ToString(); + pending_block_num = 1; + pending_entry_index = current_restart_index; + } else { + // entry number increments when keys share the prefix reside in + // differnt data blocks. + auto last_restart_index = pending_entry_index + pending_block_num - 1; + assert(last_restart_index <= current_restart_index); + if (last_restart_index != current_restart_index) { + ++pending_block_num; + } + } + data_iter->Next(); + } + + ++current_restart_index; + } + + // make sure all entries has been scaned. 
+ assert(!index_iter->Valid()); + assert(!data_iter->Valid()); + + if (pending_block_num > 0) { + auto succeeded = hash_index->Add(pending_entry_prefix, pending_entry_index, + pending_block_num); + if (!succeeded) { + delete hash_index; + return nullptr; + } + } + + return hash_index; +} + +bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index, + uint32_t num_blocks) { + auto prefix_ptr = arena_.Allocate(prefix.size()); + std::copy(prefix.data() /* begin */, prefix.data() + prefix.size() /* end */, + prefix_ptr /* destination */); + auto result = + restart_indices_.insert({Slice(prefix_ptr, prefix.size()), + RestartIndex(restart_index, num_blocks)}); + return result.second; +} + +const BlockHashIndex::RestartIndex* BlockHashIndex::GetRestartIndex( + const Slice& key) { + auto key_prefix = hash_key_extractor_->Transform(key); + + auto pos = restart_indices_.find(key_prefix); + if (pos == restart_indices_.end()) { + return nullptr; + } + + return &pos->second; +} + +} // namespace rocksdb diff --git a/table/block_hash_index.h b/table/block_hash_index.h new file mode 100644 index 00000000..0ff65b41 --- /dev/null +++ b/table/block_hash_index.h @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include + +#include "util/arena.h" +#include "util/murmurhash.h" + +namespace rocksdb { + +class Comparator; +class Iterator; +class Slice; +class SliceTransform; + +// Build a hash-based index to speed up the lookup for "index block". +// BlockHashIndex accepts a key and, if found, returns its restart index within +// that index block. +class BlockHashIndex { + public: + // Represents a restart index in the index block's restart array. 
+ struct RestartIndex { + explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1) + : first_index(first_index), num_blocks(num_blocks) {} + + // For a given prefix, what is the restart index for the first data block + // that contains it. + uint32_t first_index = 0; + + // How many data blocks contains this prefix? + uint32_t num_blocks = 1; + }; + + explicit BlockHashIndex(const SliceTransform* hash_key_extractor) + : hash_key_extractor_(hash_key_extractor) {} + + // Maps a key to its restart first_index. + // Returns nullptr if the restart first_index is found + const RestartIndex* GetRestartIndex(const Slice& key); + + bool Add(const Slice& key_prefix, uint32_t restart_index, + uint32_t num_blocks); + + size_t ApproximateMemoryUsage() const { + return arena_.ApproximateMemoryUsage(); + } + + private: + const SliceTransform* hash_key_extractor_; + std::unordered_map restart_indices_; + Arena arena_; +}; + +// Create hash index by scanning the entries in index as well as the whole +// dataset. +// @params index_iter: an iterator with the pointer to the first entry in a +// block. +// @params data_iter: an iterator that can scan all the entries reside in a +// table. +// @params num_restarts: used for correctness verification. +// @params hash_key_extractor: extract the hashable part of a given key. +// On error, nullptr will be returned. +BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter, + const uint32_t num_restarts, + const Comparator* comparator, + const SliceTransform* hash_key_extractor); + +} // namespace rocksdb diff --git a/table/block_hash_index_test.cc b/table/block_hash_index_test.cc new file mode 100644 index 00000000..f4c0ac4a --- /dev/null +++ b/table/block_hash_index_test.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "table/block_hash_index.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +typedef std::map Data; + +class MapIterator : public Iterator { + public: + explicit MapIterator(const Data& data) : data_(data), pos_(data_.end()) {} + + virtual bool Valid() const { return pos_ != data_.end(); } + + virtual void SeekToFirst() { pos_ = data_.begin(); } + + virtual void SeekToLast() { + pos_ = data_.end(); + --pos_; + } + + virtual void Seek(const Slice& target) { + pos_ = data_.find(target.ToString()); + } + + virtual void Next() { ++pos_; } + + virtual void Prev() { --pos_; } + + virtual Slice key() const { return pos_->first; } + + virtual Slice value() const { return pos_->second; } + + virtual Status status() const { return Status::OK(); } + + private: + const Data& data_; + Data::const_iterator pos_; +}; + +class BlockTest {}; + +TEST(BlockTest, BasicTest) { + const size_t keys_per_block = 4; + const size_t prefix_size = 2; + std::vector keys = {/* block 1 */ + "0101", "0102", "0103", "0201", + /* block 2 */ + "0202", "0203", "0301", "0401", + /* block 3 */ + "0501", "0601", "0701", "0801", + /* block 4 */ + "0802", "0803", "0804", "0805", + /* block 5 */ + "0806", "0807", "0808", "0809", }; + + Data data_entries; + for (const auto key : keys) { + data_entries.insert({key, key}); + } + + Data index_entries; + for (size_t i = 3; i < keys.size(); i += keys_per_block) { + // simply ignore the value part + index_entries.insert({keys[i], ""}); + } + + MapIterator data_iter(data_entries); + MapIterator index_iter(index_entries); + + auto prefix_extractor = NewFixedPrefixTransform(prefix_size); + std::unique_ptr block_hash_index( + CreateBlockHashIndex(&index_iter, &data_iter, index_entries.size(), + 
BytewiseComparator(), prefix_extractor)); + + std::map expected = { + {"01xx", BlockHashIndex::RestartIndex(0, 1)}, + {"02yy", BlockHashIndex::RestartIndex(0, 2)}, + {"03zz", BlockHashIndex::RestartIndex(1, 1)}, + {"04pp", BlockHashIndex::RestartIndex(1, 1)}, + {"05ww", BlockHashIndex::RestartIndex(2, 1)}, + {"06xx", BlockHashIndex::RestartIndex(2, 1)}, + {"07pp", BlockHashIndex::RestartIndex(2, 1)}, + {"08xz", BlockHashIndex::RestartIndex(2, 3)}, }; + + const BlockHashIndex::RestartIndex* index = nullptr; + // search existed prefixes + for (const auto& item : expected) { + index = block_hash_index->GetRestartIndex(item.first); + ASSERT_TRUE(index != nullptr); + ASSERT_EQ(item.second.first_index, index->first_index); + ASSERT_EQ(item.second.num_blocks, index->num_blocks); + } + + // search non exist prefixes + ASSERT_TRUE(!block_hash_index->GetRestartIndex("00xx")); + ASSERT_TRUE(!block_hash_index->GetRestartIndex("10yy")); + ASSERT_TRUE(!block_hash_index->GetRestartIndex("20zz")); + + delete prefix_extractor; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/block_test.cc b/table/block_test.cc new file mode 100644 index 00000000..fdba8e99 --- /dev/null +++ b/table/block_test.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "table/block_hash_index.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} +std::string GenerateKey(int primary_key, int secondary_key, int padding_size, + Random *rnd) { + char buf[50]; + char *p = &buf[0]; + snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); + std::string k(p); + if (padding_size) { + k += RandomString(rnd, padding_size); + } + + return k; +} + +// Generate random key value pairs. +// The generated key will be sorted. You can tune the parameters to generated +// different kinds of test key/value pairs for different scenario. 
+void GenerateRandomKVs(std::vector *keys, + std::vector *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} + +class BlockTest {}; + +// block test +TEST(BlockTest, SimpleTest) { + Random rnd(301); + Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector keys; + std::vector values; + BlockBuilder builder(options, ic.get()); + int num_records = 100000; + + GenerateRandomKVs(&keys, &values, 0, num_records); + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + builder.Add(keys[i], values[i]); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + contents.heap_allocated = false; + Block reader(contents); + + // read contents of block sequentially + int count = 0; + Iterator* iter = reader.NewIterator(options.comparator); + for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) { + + // read kv from block + Slice k = iter->key(); + Slice v = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + ASSERT_EQ(v.ToString().compare(values[count]), 0); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator(options.comparator); + for (int i = 0; i < num_records; i++) { + + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + 
ASSERT_TRUE(iter->Valid()); + Slice v = iter->value(); + ASSERT_EQ(v.ToString().compare(values[index]), 0); + } + delete iter; +} + +// return the block contents +BlockContents GetBlockContents(std::unique_ptr *builder, + const std::vector &keys, + const std::vector &values, + const int prefix_group_size = 1) { + builder->reset( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())); + + // Add only half of the keys + for (size_t i = 0; i < keys.size(); ++i) { + (*builder)->Add(keys[i], values[i]); + } + Slice rawblock = (*builder)->Finish(); + + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + contents.heap_allocated = false; + + return contents; +} + +void CheckBlockContents(BlockContents contents, const int max_key, + const std::vector &keys, + const std::vector &values) { + const size_t prefix_size = 6; + // create block reader + Block reader1(contents); + Block reader2(contents); + + std::unique_ptr prefix_extractor( + NewFixedPrefixTransform(prefix_size)); + + { + auto iter1 = reader1.NewIterator(nullptr); + auto iter2 = reader1.NewIterator(nullptr); + reader1.SetBlockHashIndex(CreateBlockHashIndex(iter1, iter2, keys.size(), + BytewiseComparator(), + prefix_extractor.get())); + + delete iter1; + delete iter2; + } + + std::unique_ptr hash_iter( + reader1.NewIterator(BytewiseComparator())); + + std::unique_ptr regular_iter( + reader2.NewIterator(BytewiseComparator())); + + // Seek existent keys + for (size_t i = 0; i < keys.size(); i++) { + hash_iter->Seek(keys[i]); + ASSERT_OK(hash_iter->status()); + ASSERT_TRUE(hash_iter->Valid()); + + Slice v = hash_iter->value(); + ASSERT_EQ(v.ToString().compare(values[i]), 0); + } + + // Seek non-existent keys. + // For hash index, if no key with a given prefix is not found, iterator will + // simply be set as invalid; whereas the binary search based iterator will + // return the one that is closest. 
+ for (int i = 1; i < max_key - 1; i += 2) { + auto key = GenerateKey(i, 0, 0, nullptr); + hash_iter->Seek(key); + ASSERT_TRUE(!hash_iter->Valid()); + + regular_iter->Seek(key); + ASSERT_TRUE(regular_iter->Valid()); + } +} + +// In this test case, no two key share same prefix. +TEST(BlockTest, SimpleIndexHash) { + const int kMaxKey = 100000; + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0 /* first key id */, + kMaxKey /* last key id */, 2 /* step */, + 8 /* padding size (8 bytes randomly generated suffix) */); + + std::unique_ptr builder; + auto contents = GetBlockContents(&builder, keys, values); + + CheckBlockContents(contents, kMaxKey, keys, values); +} + +TEST(BlockTest, IndexHashWithSharedPrefix) { + const int kMaxKey = 100000; + // for each prefix, there will be 5 keys starts with it. + const int kPrefixGroup = 5; + std::vector keys; + std::vector values; + // Generate keys with same prefix. + GenerateRandomKVs(&keys, &values, 0, // first key id + kMaxKey, // last key id + 2, // step + 10, // padding size, + kPrefixGroup); + + std::unique_ptr builder; + auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); + + CheckBlockContents(contents, kMaxKey, keys, values); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/table/filter_block.cc b/table/filter_block.cc new file mode 100644 index 00000000..3651a7d0 --- /dev/null +++ b/table/filter_block.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "table/filter_block.h" + +#include "db/dbformat.h" +#include "rocksdb/filter_policy.h" +#include "util/coding.h" + +namespace rocksdb { + +// See doc/table_format.txt for an explanation of the filter block format. + +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +FilterBlockBuilder::FilterBlockBuilder(const Options& opt, + const Comparator* internal_comparator) + : policy_(opt.filter_policy), + prefix_extractor_(opt.prefix_extractor.get()), + whole_key_filtering_(opt.whole_key_filtering), + comparator_(internal_comparator) {} + +void FilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +bool FilterBlockBuilder::SamePrefix(const Slice &key1, + const Slice &key2) const { + if (!prefix_extractor_->InDomain(key1) && + !prefix_extractor_->InDomain(key2)) { + return true; + } else if (!prefix_extractor_->InDomain(key1) || + !prefix_extractor_->InDomain(key2)) { + return false; + } else { + return (prefix_extractor_->Transform(key1) == + prefix_extractor_->Transform(key2)); + } +} + +void FilterBlockBuilder::AddKey(const Slice& key) { + // get slice for most recently added entry + Slice prev; + size_t added_to_start = 0; + + // add key to filter if needed + if (whole_key_filtering_) { + start_.push_back(entries_.size()); + ++added_to_start; + entries_.append(key.data(), key.size()); + } + + if (start_.size() > added_to_start) { + size_t prev_start = start_[start_.size() - 1 - added_to_start]; + const char* base = entries_.data() + prev_start; + size_t length = entries_.size() - prev_start; + prev = Slice(base, length); + } + + // add prefix to filter if needed + if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) { 
+ // If prefix_extractor_, this filter_block layer assumes we only + // operate on internal keys. + Slice user_key = ExtractUserKey(key); + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. + if (prev.size() == 0 || + !SamePrefix(user_key, ExtractUserKey(prev))) { + Slice prefix = prefix_extractor_->Transform(user_key); + InternalKey internal_prefix_tmp(prefix, 0, kTypeValue); + Slice internal_prefix = internal_prefix_tmp.Encode(); + start_.push_back(entries_.size()); + entries_.append(internal_prefix.data(), internal_prefix.size()); + } + } +} + +Slice FilterBlockBuilder::Finish() { + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = result_.size(); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void FilterBlockBuilder::GenerateFilter() { + const size_t num_entries = start_.size(); + if (num_entries == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(result_.size()); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; + size_t length = start_[i+1] - start_[i]; + tmp_entries_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. 
// Parses the trailer of a serialized filter block:
//   [filters][offset array][array start (fixed32)][base_lg (1 byte)]
// On malformed input the reader is left empty (num_ == 0), which makes every
// MayMatch query conservatively return true.
FilterBlockReader::FilterBlockReader(
    const Options& opt, const Slice& contents, bool delete_contents_after_use)
    : policy_(opt.filter_policy),
      prefix_extractor_(opt.prefix_extractor.get()),
      whole_key_filtering_(opt.whole_key_filtering),
      data_(nullptr),
      offset_(nullptr),
      num_(0),
      base_lg_(0) {
  size_t n = contents.size();
  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
  base_lg_ = contents[n-1];
  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
  if (last_word > n - 5) return;  // offset array would start past the trailer
  data_ = contents.data();
  offset_ = data_ + last_word;
  num_ = (n - 5 - last_word) / 4;  // each offset entry is a fixed32
  if (delete_contents_after_use) {
    // Take ownership of the heap buffer backing `contents`.
    filter_data.reset(contents.data());
  }
}

// Returns true if `key` may be in the data block starting at `block_offset`.
// When whole-key filtering is disabled no per-key entries were added, so the
// only safe answer is "maybe" (true).
bool FilterBlockReader::KeyMayMatch(uint64_t block_offset,
                                    const Slice& key) {
  if (!whole_key_filtering_) {
    return true;
  }
  return MayMatch(block_offset, key);
}

// Returns true if a key with `prefix` may be in the block at `block_offset`.
// Without a prefix extractor no prefixes were added to the filters.
bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset,
                                       const Slice& prefix) {
  if (!prefix_extractor_) {
    return true;
  }
  return MayMatch(block_offset, prefix);
}

// Probes the filter covering `block_offset` for `entry`.
bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
  // Filters are laid out per kFilterBase (== 1 << base_lg_) bytes of offset.
  uint64_t index = block_offset >> base_lg_;
  if (index < num_) {
    uint32_t start = DecodeFixed32(offset_ + index*4);
    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
      Slice filter = Slice(data_ + start, limit - start);
      return policy_->KeyMayMatch(entry, filter);
    } else if (start == limit) {
      // Empty filters do not match any entries
      return false;
    }
  }
  return true;  // Errors are treated as potential matches
}
+// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. + +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "util/hash.h" + +namespace rocksdb { + +class FilterPolicy; + +// A FilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. 
// The sequence of calls to FilterBlockBuilder must match the regexp:
//      (StartBlock AddKey*)* Finish
class FilterBlockBuilder {
 public:
  explicit FilterBlockBuilder(const Options& opt,
                              const Comparator* internal_comparator);

  // Declares that the next data block starts at this file offset; may emit
  // filters for the offset range skipped since the previous block.
  void StartBlock(uint64_t block_offset);
  // Adds a key (and/or its prefix) to the filter under construction.
  void AddKey(const Slice& key);
  // Finalizes and returns the serialized filter block.
  Slice Finish();

 private:
  // True when key1 and key2 map to the same prefix (or both lie outside the
  // extractor's domain).
  bool SamePrefix(const Slice &key1, const Slice &key2) const;
  // Converts the accumulated keys into one filter appended to result_.
  void GenerateFilter();

  const FilterPolicy* policy_;              // not owned
  const SliceTransform* prefix_extractor_;  // not owned; may be nullptr
  bool whole_key_filtering_;
  const Comparator* comparator_;

  std::string entries_;         // Flattened entry contents
  std::vector<size_t> start_;   // Starting index in entries_ of each entry
  std::string result_;          // Filter data computed so far
  std::vector<Slice> tmp_entries_;  // policy_->CreateFilter() argument
  std::vector<uint32_t> filter_offsets_;

  // No copying allowed
  FilterBlockBuilder(const FilterBlockBuilder&);
  void operator=(const FilterBlockBuilder&);
};
+ FilterBlockReader( + const Options& opt, + const Slice& contents, + bool delete_contents_after_use = false); + bool KeyMayMatch(uint64_t block_offset, const Slice& key); + bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); + + private: + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + std::unique_ptr filter_data; + + + bool MayMatch(uint64_t block_offset, const Slice& entry); +}; + +} diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc new file mode 100644 index 00000000..1703d59d --- /dev/null +++ b/table/filter_block_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + TestHashFilter policy_; + Options options_; + + FilterBlockTest() { + options_ = Options(); + options_.filter_policy = &policy_; + } +}; + +TEST(FilterBlockTest, EmptyBuilder) { + FilterBlockBuilder builder(options_, options_.comparator); + Slice block = builder.Finish(); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + FilterBlockReader reader(options_, block); + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); +} + +TEST(FilterBlockTest, SingleChunk) { + FilterBlockBuilder builder(options_, options_.comparator); + builder.StartBlock(100); + builder.AddKey("foo"); + builder.AddKey("bar"); + builder.AddKey("box"); + builder.StartBlock(200); + builder.AddKey("box"); + builder.StartBlock(300); + builder.AddKey("hello"); + Slice block = builder.Finish(); + FilterBlockReader reader(options_, block); + ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); + ASSERT_TRUE(reader.KeyMayMatch(100, "box")); + ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); + 
ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); + ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); +} + +TEST(FilterBlockTest, MultiChunk) { + FilterBlockBuilder builder(options_, options_.comparator); + + // First filter + builder.StartBlock(0); + builder.AddKey("foo"); + builder.StartBlock(2000); + builder.AddKey("bar"); + + // Second filter + builder.StartBlock(3100); + builder.AddKey("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.AddKey("box"); + builder.AddKey("hello"); + + Slice block = builder.Finish(); + FilterBlockReader reader(options_, block); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); + ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); + + // Check third filter (empty) + ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); + ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello")); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); + ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); + ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); + ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc new file mode 100644 index 00000000..4e223520 --- /dev/null +++ b/table/flush_block_policy.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "rocksdb/options.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/slice.h" +#include "table/block_builder.h" + +#include + +namespace rocksdb { + +// Flush block by size +class FlushBlockBySizePolicy : public FlushBlockPolicy { + public: + // @params block_size: Approximate size of user data packed per + // block. + // @params block_size_deviation: This is used to close a block before it + // reaches the configured + FlushBlockBySizePolicy(const uint64_t block_size, + const uint64_t block_size_deviation, + const BlockBuilder& data_block_builder) : + block_size_(block_size), + block_size_deviation_(block_size_deviation), + data_block_builder_(data_block_builder) { + } + + virtual bool Update(const Slice& key, + const Slice& value) override { + // it makes no sense to flush when the data block is empty + if (data_block_builder_.empty()) { + return false; + } + + auto curr_size = data_block_builder_.CurrentSizeEstimate(); + + // Do flush if one of the below two conditions is true: + // 1) if the current estimated size already exceeds the block size, + // 2) block_size_deviation is set and the estimated size after appending + // the kv will exceed the block size and the current size is under the + // the deviation. 
+ return curr_size >= block_size_ || BlockAlmostFull(key, value); + } + + private: + bool BlockAlmostFull(const Slice& key, const Slice& value) const { + const auto curr_size = data_block_builder_.CurrentSizeEstimate(); + const auto estimated_size_after = + data_block_builder_.EstimateSizeAfterKV(key, value); + + return + estimated_size_after > block_size_ && + block_size_deviation_ > 0 && + curr_size * 100 > block_size_ * (100 - block_size_deviation_); + } + + const uint64_t block_size_; + const uint64_t block_size_deviation_; + const BlockBuilder& data_block_builder_; +}; + +FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + const Options& options, const BlockBuilder& data_block_builder) const { + return new FlushBlockBySizePolicy( + options.block_size, options.block_size_deviation, data_block_builder); +} + +} // namespace rocksdb diff --git a/table/format.cc b/table/format.cc new file mode 100644 index 00000000..f1adf97d --- /dev/null +++ b/table/format.cc @@ -0,0 +1,262 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/format.h" + +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "table/block.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/perf_context_imp.h" + +namespace rocksdb { + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast(0)); + assert(size_ != ~static_cast(0)); + PutVarint64(dst, offset_); + PutVarint64(dst, size_); +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && + GetVarint64(input, &size_)) { + return Status::OK(); + } else { + return Status::Corruption("bad block handle"); + } +} +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); + +void Footer::EncodeTo(std::string* dst) const { +#ifndef NDEBUG + const size_t original_size = dst->size(); +#endif + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding + PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); + PutFixed32(dst, static_cast(table_magic_number() >> 32)); + assert(dst->size() == original_size + kEncodedLength); +} + +Status Footer::DecodeFrom(Slice* input) { + assert(input != nullptr); + assert(input->size() >= kEncodedLength); + + const char* magic_ptr = + input->data() + kEncodedLength - kMagicNumberLengthByte; + const uint32_t magic_lo = DecodeFixed32(magic_ptr); + const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); + const uint64_t magic = ((static_cast(magic_hi) << 32) | + (static_cast(magic_lo))); + if (HasInitializedTableMagicNumber()) { + if (magic != table_magic_number()) { + char buffer[80]; + snprintf(buffer, sizeof(buffer) - 1, + "not an sstable (bad magic number --- %lx)", + (long)magic); + return Status::InvalidArgument(buffer); + } + } else { + set_table_magic_number(magic); + } + + Status result = metaindex_handle_.DecodeFrom(input); + if (result.ok()) { + result = index_handle_.DecodeFrom(input); + 
// Reads and decodes the fixed-size footer at the end of an sstable file.
// Fails with InvalidArgument when the file is shorter than a footer.
Status ReadFooterFromFile(RandomAccessFile* file,
                          uint64_t file_size,
                          Footer* footer) {
  if (file_size < Footer::kEncodedLength) {
    return Status::InvalidArgument("file is too short to be an sstable");
  }

  char footer_space[Footer::kEncodedLength];
  Slice footer_input;
  Status s = file->Read(file_size - Footer::kEncodedLength,
                        Footer::kEncodedLength,
                        &footer_input,
                        footer_space);
  if (!s.ok()) return s;

  // Check that we actually read the whole footer from the file. It may be
  // that size isn't correct.
  if (footer_input.size() != Footer::kEncodedLength) {
    return Status::InvalidArgument("file is too short to be an sstable");
  }

  return footer->DecodeFrom(&footer_input);
}

// Reads one block (plus its type/crc trailer) from `file` at `handle`,
// optionally verifying the checksum and uncompressing the payload.
// Ownership: result->heap_allocated tells the caller whether result->data
// points at a buffer the caller must delete[].
Status ReadBlockContents(RandomAccessFile* file,
                         const ReadOptions& options,
                         const BlockHandle& handle,
                         BlockContents* result,
                         Env* env,
                         bool do_uncompress) {
  result->data = Slice();
  result->cachable = false;
  result->heap_allocated = false;

  // Read the block contents as well as the type/crc footer.
  // See table_builder.cc for the code that built this structure.
  size_t n = static_cast<size_t>(handle.size());
  char* buf = new char[n + kBlockTrailerSize];
  Slice contents;

  StopWatchNano timer(env);
  StartPerfTimer(&timer);
  Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
  BumpPerfCount(&perf_context.block_read_count);
  BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize);
  BumpPerfTime(&perf_context.block_read_time, &timer);

  if (!s.ok()) {
    delete[] buf;
    return s;
  }
  if (contents.size() != n + kBlockTrailerSize) {
    delete[] buf;
    return Status::Corruption("truncated block read");
  }

  // Check the crc of the type and the block contents
  const char* data = contents.data();  // Pointer to where Read put the data
  if (options.verify_checksums) {
    // The crc covers the payload plus the 1-byte compression type.
    const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
    const uint32_t actual = crc32c::Value(data, n + 1);
    if (actual != crc) {
      delete[] buf;
      s = Status::Corruption("block checksum mismatch");
      return s;
    }
    BumpPerfTime(&perf_context.block_checksum_time, &timer);
  }

  // If the caller has requested that the block not be uncompressed
  if (!do_uncompress || data[n] == kNoCompression) {
    if (data != buf) {
      // File implementation gave us pointer to some other data.
      // Use it directly under the assumption that it will be live
      // while the file is open.
      delete[] buf;
      result->data = Slice(data, n);
      result->heap_allocated = false;
      result->cachable = false;  // Do not double-cache
    } else {
      result->data = Slice(buf, n);
      result->heap_allocated = true;
      result->cachable = true;
    }
    result->compression_type = (rocksdb::CompressionType)data[n];
    s = Status::OK();
  } else {
    // Uncompress allocates its own output buffer; the raw read buffer can go.
    s = UncompressBlockContents(data, n, result);
    delete[] buf;
  }
  BumpPerfTime(&perf_context.block_decompress_time, &timer);
  return s;
}
+// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This +// buffer is returned via 'result' and it is upto the caller to +// free this buffer. +Status UncompressBlockContents(const char* data, size_t n, + BlockContents* result) { + char* ubuf = nullptr; + int decompress_size = 0; + assert(data[n] != kNoCompression); + switch (data[n]) { + case kSnappyCompression: { + size_t ulength = 0; + static char snappy_corrupt_msg[] = + "Snappy not supported or corrupted Snappy compressed block contents"; + if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { + return Status::Corruption(snappy_corrupt_msg); + } + ubuf = new char[ulength]; + if (!port::Snappy_Uncompress(data, n, ubuf)) { + delete[] ubuf; + return Status::Corruption(snappy_corrupt_msg); + } + result->data = Slice(ubuf, ulength); + result->heap_allocated = true; + result->cachable = true; + break; + } + case kZlibCompression: + ubuf = port::Zlib_Uncompress(data, n, &decompress_size); + static char zlib_corrupt_msg[] = + "Zlib not supported or corrupted Zlib compressed block contents"; + if (!ubuf) { + return Status::Corruption(zlib_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + case kBZip2Compression: + ubuf = port::BZip2_Uncompress(data, n, &decompress_size); + static char bzip2_corrupt_msg[] = + "Bzip2 not supported or corrupted Bzip2 compressed block contents"; + if (!ubuf) { + return Status::Corruption(bzip2_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + case kLZ4Compression: + ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + static char lz4_corrupt_msg[] = + "LZ4 not supported or corrupted LZ4 compressed block contents"; + if (!ubuf) { + return Status::Corruption(lz4_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = 
true; + result->cachable = true; + break; + case kLZ4HCCompression: + ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + static char lz4hc_corrupt_msg[] = + "LZ4HC not supported or corrupted LZ4HC compressed block contents"; + if (!ubuf) { + return Status::Corruption(lz4hc_corrupt_msg); + } + result->data = Slice(ubuf, decompress_size); + result->heap_allocated = true; + result->cachable = true; + break; + default: + return Status::Corruption("bad block type"); + } + result->compression_type = kNoCompression; // not compressed any more + return Status::OK(); +} + +} // namespace rocksdb diff --git a/table/format.h b/table/format.h new file mode 100644 index 00000000..f05fbf89 --- /dev/null +++ b/table/format.h @@ -0,0 +1,177 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class Block; +class RandomAccessFile; +struct ReadOptions; + +// the length of the magic number in bytes. +const int kMagicNumberLengthByte = 8; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. +class BlockHandle { + public: + BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); + + // The offset of the block in the file. 
+ uint64_t offset() const { return offset_; } + void set_offset(uint64_t offset) { offset_ = offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t size) { size_ = size; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + // if the block handle's offset and size are both "0", we will view it + // as a null block handle that points to no where. + bool IsNull() const { + return offset_ == 0 && size_ == 0; + } + + static const BlockHandle& NullBlockHandle() { + return kNullBlockHandle; + } + + // Maximum encoding length of a BlockHandle + enum { kMaxEncodedLength = 10 + 10 }; + + private: + uint64_t offset_ = 0; + uint64_t size_ = 0; + + static const BlockHandle kNullBlockHandle; +}; + +// Footer encapsulates the fixed information stored at the tail +// end of every table file. +class Footer { + public: + // Constructs a footer without specifying its table magic number. + // In such case, the table magic number of such footer should be + // initialized via @ReadFooterFromFile(). + Footer() : Footer(kInvalidTableMagicNumber) {} + + // @table_magic_number serves two purposes: + // 1. Identify different types of the tables. + // 2. Help us to identify if a given file is a valid sst. + explicit Footer(uint64_t table_magic_number) + : table_magic_number_(table_magic_number) {} + + // The block handle for the metaindex block of the table + const BlockHandle& metaindex_handle() const { return metaindex_handle_; } + void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } + + // The block handle for the index block of the table + const BlockHandle& index_handle() const { + return index_handle_; + } + + void set_index_handle(const BlockHandle& h) { + index_handle_ = h; + } + + uint64_t table_magic_number() const { return table_magic_number_; } + + void EncodeTo(std::string* dst) const; + + // Set the current footer based on the input slice. 
If table_magic_number_ + // is not set (i.e., HasInitializedTableMagicNumber() is true), then this + // function will also initialize table_magic_number_. Otherwise, this + // function will verify whether the magic number specified in the input + // slice matches table_magic_number_ and update the current footer only + // when the test passes. + Status DecodeFrom(Slice* input); + + // Encoded length of a Footer. Note that the serialization of a + // Footer will always occupy exactly this many bytes. It consists + // of two block handles and a magic number. + enum { + kEncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8 + }; + + static const uint64_t kInvalidTableMagicNumber = 0; + + private: + // REQUIRES: magic number wasn't initialized. + void set_table_magic_number(uint64_t magic_number) { + assert(!HasInitializedTableMagicNumber()); + table_magic_number_ = magic_number; + } + + // return true if @table_magic_number_ is set to a value different + // from @kInvalidTableMagicNumber. + bool HasInitializedTableMagicNumber() const { + return (table_magic_number_ != kInvalidTableMagicNumber); + } + + BlockHandle metaindex_handle_; + BlockHandle index_handle_; + uint64_t table_magic_number_ = 0; +}; + +// Read the footer from file +Status ReadFooterFromFile(RandomAccessFile* file, + uint64_t file_size, + Footer* footer); + +// 1-byte type + 32-bit crc +static const size_t kBlockTrailerSize = 5; + +struct BlockContents { + Slice data; // Actual contents of data + bool cachable; // True iff data can be cached + bool heap_allocated; // True iff caller should delete[] data.data() + CompressionType compression_type; +}; + +// Read the block identified by "handle" from "file". On failure +// return non-OK. On success fill *result and return OK. 
+extern Status ReadBlockContents(RandomAccessFile* file, + const ReadOptions& options, + const BlockHandle& handle, + BlockContents* result, + Env* env, + bool do_uncompress); + +// The 'data' points to the raw block contents read in from file. +// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This buffer is +// returned via 'result' and it is upto the caller to +// free this buffer. +extern Status UncompressBlockContents(const char* data, + size_t n, + BlockContents* result); + +// Implementation details follow. Clients should ignore, + +inline BlockHandle::BlockHandle() + : BlockHandle(~static_cast(0), + ~static_cast(0)) { +} + +inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size) + : offset_(offset), + size_(size) { +} + +} // namespace rocksdb diff --git a/table/iter_heap.h b/table/iter_heap.h new file mode 100644 index 00000000..af8834e3 --- /dev/null +++ b/table/iter_heap.h @@ -0,0 +1,64 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include + +#include "rocksdb/comparator.h" +#include "table/iterator_wrapper.h" + +namespace rocksdb { + +// Return the max of two keys. +class MaxIteratorComparator { + public: + MaxIteratorComparator(const Comparator* comparator) : + comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) { + return comparator_->Compare(a->key(), b->key()) <= 0; + } + private: + const Comparator* comparator_; +}; + +// Return the max of two keys. +class MinIteratorComparator { + public: + // if maxHeap is set comparator returns the max value. + // else returns the min Value. + // Can use to create a minHeap or a maxHeap. 
+ MinIteratorComparator(const Comparator* comparator) : + comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) { + return comparator_->Compare(a->key(), b->key()) > 0; + } + private: + const Comparator* comparator_; +}; + +typedef std::priority_queue< + IteratorWrapper*, + std::vector, + MaxIteratorComparator> MaxIterHeap; + +typedef std::priority_queue< + IteratorWrapper*, + std::vector, + MinIteratorComparator> MinIterHeap; + +// Return's a new MaxHeap of IteratorWrapper's using the provided Comparator. +MaxIterHeap NewMaxIterHeap(const Comparator* comparator) { + return MaxIterHeap(MaxIteratorComparator(comparator)); +} + +// Return's a new MinHeap of IteratorWrapper's using the provided Comparator. +MinIterHeap NewMinIterHeap(const Comparator* comparator) { + return MinIterHeap(MinIteratorComparator(comparator)); +} + +} // namespace rocksdb diff --git a/table/iterator.cc b/table/iterator.cc new file mode 100644 index 00000000..a3d4f638 --- /dev/null +++ b/table/iterator.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/iterator.h" + +namespace rocksdb { + +Iterator::Iterator() { + cleanup_.function = nullptr; + cleanup_.next = nullptr; +} + +Iterator::~Iterator() { + if (cleanup_.function != nullptr) { + (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2); + for (Cleanup* c = cleanup_.next; c != nullptr; ) { + (*c->function)(c->arg1, c->arg2); + Cleanup* next = c->next; + delete c; + c = next; + } + } +} + +void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != nullptr); + Cleanup* c; + if (cleanup_.function == nullptr) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +namespace { +class EmptyIterator : public Iterator { + public: + explicit EmptyIterator(const Status& s) : status_(s) { } + virtual bool Valid() const { return false; } + virtual void Seek(const Slice& target) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + virtual void Next() { assert(false); } + virtual void Prev() { assert(false); } + Slice key() const { assert(false); return Slice(); } + Slice value() const { assert(false); return Slice(); } + virtual Status status() const { return status_; } + private: + Status status_; +}; +} // namespace + +Iterator* NewEmptyIterator() { + return new EmptyIterator(Status::OK()); +} + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +} // namespace rocksdb diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h new file mode 100644 index 00000000..cb8520be --- /dev/null +++ b/table/iterator_wrapper.h @@ -0,0 +1,64 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +namespace rocksdb { + +// A internal wrapper class with an interface similar to Iterator that +// caches the valid() and key() results for an underlying iterator. +// This can help avoid virtual function calls and also gives better +// cache locality. +class IteratorWrapper { + public: + IteratorWrapper(): iter_(nullptr), valid_(false) { } + explicit IteratorWrapper(Iterator* iter): iter_(nullptr) { + Set(iter); + } + ~IteratorWrapper() { delete iter_; } + Iterator* iter() const { return iter_; } + + // Takes ownership of "iter" and will delete it when destroyed, or + // when Set() is invoked again. + void Set(Iterator* iter) { + delete iter_; + iter_ = iter; + if (iter_ == nullptr) { + valid_ = false; + } else { + Update(); + } + } + + + // Iterator interface methods + bool Valid() const { return valid_; } + Slice key() const { assert(Valid()); return key_; } + Slice value() const { assert(Valid()); return iter_->value(); } + // Methods below require iter() != nullptr + Status status() const { assert(iter_); return iter_->status(); } + void Next() { assert(iter_); iter_->Next(); Update(); } + void Prev() { assert(iter_); iter_->Prev(); Update(); } + void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } + void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + + private: + void Update() { + valid_ = iter_->Valid(); + if (valid_) { + key_ = iter_->key(); + } + } + + Iterator* iter_; + bool valid_; + Slice key_; +}; + +} // namespace rocksdb diff --git a/table/merger.cc b/table/merger.cc new file mode 100644 index 00000000..c154e6e6 --- /dev/null +++ b/table/merger.cc @@ -0,0 +1,287 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/merger.h" + +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "table/iter_heap.h" +#include "table/iterator_wrapper.h" +#include "util/stop_watch.h" +#include "util/perf_context_imp.h" + +#include + +namespace rocksdb { + +namespace { + +class MergingIterator : public Iterator { + public: + MergingIterator(Env* const env, const Comparator* comparator, + Iterator** children, int n) + : comparator_(comparator), + children_(n), + current_(nullptr), + use_heap_(true), + env_(env), + direction_(kForward), + maxHeap_(NewMaxIterHeap(comparator_)), + minHeap_ (NewMinIterHeap(comparator_)) { + for (int i = 0; i < n; i++) { + children_[i].Set(children[i]); + } + for (auto& child : children_) { + if (child.Valid()) { + minHeap_.push(&child); + } + } + } + + virtual ~MergingIterator() { } + + virtual bool Valid() const { + return (current_ != nullptr); + } + + virtual void SeekToFirst() { + ClearHeaps(); + for (auto& child : children_) { + child.SeekToFirst(); + if (child.Valid()) { + minHeap_.push(&child); + } + } + FindSmallest(); + direction_ = kForward; + } + + virtual void SeekToLast() { + ClearHeaps(); + for (auto& child : children_) { + child.SeekToLast(); + if (child.Valid()) { + maxHeap_.push(&child); + } + } + FindLargest(); + direction_ = kReverse; + } + + virtual void Seek(const Slice& target) { + // Invalidate the heap. 
+ use_heap_ = false; + IteratorWrapper* first_child = nullptr; + StopWatchNano child_seek_timer(env_, false); + StopWatchNano min_heap_timer(env_, false); + for (auto& child : children_) { + StartPerfTimer(&child_seek_timer); + child.Seek(target); + BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); + BumpPerfCount(&perf_context.seek_child_seek_count); + + if (child.Valid()) { + // This child has valid key + if (!use_heap_) { + if (first_child == nullptr) { + // It's the first child has valid key. Only put it int + // current_. Now the values in the heap should be invalid. + first_child = &child; + } else { + // We have more than one children with valid keys. Initialize + // the heap and put the first child into the heap. + StartPerfTimer(&min_heap_timer); + ClearHeaps(); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + StartPerfTimer(&min_heap_timer); + minHeap_.push(first_child); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + } + } + if (use_heap_) { + StartPerfTimer(&min_heap_timer); + minHeap_.push(&child); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + } + } + } + if (use_heap_) { + // If heap is valid, need to put the smallest key to curent_. + StartPerfTimer(&min_heap_timer); + FindSmallest(); + BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + } else { + // The heap is not valid, then the current_ iterator is the first + // one, or null if there is no first child. + current_ = first_child; + } + direction_ = kForward; + } + + virtual void Next() { + assert(Valid()); + + // Ensure that all children are positioned after key(). + // If we are moving in the forward direction, it is already + // true for all of the non-current_ children since current_ is + // the smallest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. 
+ if (direction_ != kForward) { + ClearHeaps(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(key()); + if (child.Valid() && + comparator_->Compare(key(), child.key()) == 0) { + child.Next(); + } + if (child.Valid()) { + minHeap_.push(&child); + } + } + } + direction_ = kForward; + } + + // as the current points to the current record. move the iterator forward. + // and if it is valid add it to the heap. + current_->Next(); + if (use_heap_) { + if (current_->Valid()) { + minHeap_.push(current_); + } + FindSmallest(); + } else if (!current_->Valid()) { + current_ = nullptr; + } + } + + virtual void Prev() { + assert(Valid()); + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current_ children since current_ is + // the largest child and key() == current_->key(). Otherwise, + // we explicitly position the non-current_ children. + if (direction_ != kReverse) { + ClearHeaps(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(key()); + if (child.Valid()) { + // Child is at first entry >= key(). Step back one to be < key() + child.Prev(); + } else { + // Child has no entries >= key(). Position at last entry. 
+ child.SeekToLast(); + } + if (child.Valid()) { + maxHeap_.push(&child); + } + } + } + direction_ = kReverse; + } + + current_->Prev(); + if (current_->Valid()) { + maxHeap_.push(current_); + } + FindLargest(); + } + + virtual Slice key() const { + assert(Valid()); + return current_->key(); + } + + virtual Slice value() const { + assert(Valid()); + return current_->value(); + } + + virtual Status status() const { + Status status; + for (auto& child : children_) { + status = child.status(); + if (!status.ok()) { + break; + } + } + return status; + } + + private: + void FindSmallest(); + void FindLargest(); + void ClearHeaps(); + + const Comparator* comparator_; + std::vector children_; + IteratorWrapper* current_; + // If the value is true, both of iterators in the heap and current_ + // contain valid rows. If it is false, only current_ can possibly contain + // valid rows. + // This flag is always true for reverse direction, as we always use heap for + // the reverse iterating case. + bool use_heap_; + Env* const env_; + // Which direction is the iterator moving? 
+ enum Direction { + kForward, + kReverse + }; + Direction direction_; + MaxIterHeap maxHeap_; + MinIterHeap minHeap_; +}; + +void MergingIterator::FindSmallest() { + assert(use_heap_); + if (minHeap_.empty()) { + current_ = nullptr; + } else { + current_ = minHeap_.top(); + assert(current_->Valid()); + minHeap_.pop(); + } +} + +void MergingIterator::FindLargest() { + assert(use_heap_); + if (maxHeap_.empty()) { + current_ = nullptr; + } else { + current_ = maxHeap_.top(); + assert(current_->Valid()); + maxHeap_.pop(); + } +} + +void MergingIterator::ClearHeaps() { + use_heap_ = true; + maxHeap_ = NewMaxIterHeap(comparator_); + minHeap_ = NewMinIterHeap(comparator_); +} +} // namespace + +Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, + Iterator** list, int n) { + assert(n >= 0); + if (n == 0) { + return NewEmptyIterator(); + } else if (n == 1) { + return list[0]; + } else { + return new MergingIterator(env, cmp, list, n); + } +} + +} // namespace rocksdb diff --git a/table/merger.h b/table/merger.h new file mode 100644 index 00000000..ea8daa77 --- /dev/null +++ b/table/merger.h @@ -0,0 +1,30 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Comparator; +class Iterator; +class Env; + +// Return an iterator that provided the union of the data in +// children[0,n-1]. Takes ownership of the child iterators and +// will delete them when the result iterator is deleted. +// +// The result does no duplicate suppression. 
I.e., if a particular +// key is present in K child iterators, it will be yielded K times. +// +// REQUIRES: n >= 0 +extern Iterator* NewMergingIterator(Env* const env, + const Comparator* comparator, + Iterator** children, int n); + +} // namespace rocksdb diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc new file mode 100644 index 00000000..4e940b97 --- /dev/null +++ b/table/meta_blocks.cc @@ -0,0 +1,297 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#include "table/meta_blocks.h" + +#include +#include + +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block.h" +#include "table/format.h" +#include "util/coding.h" + +namespace rocksdb { + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void MetaIndexBuilder::Add(const std::string& key, + const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { +} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const 
UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); + Add(TablePropertiesNames::kFilterSize, props.filter_size); + Add(TablePropertiesNames::kFormatVersion, props.format_version); + Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); + + if (!props.filter_policy_name.empty()) { + Add(TablePropertiesNames::kFilterPolicy, + props.filter_policy_name); + } +} + +Slice PropertyBlockBuilder::Finish() { + for (const auto& prop : props_) { + properties_block_->Add(prop.first, prop.second); + } + + return properties_block_->Finish(); +} + +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "[Warning] encountered error when calling TablePropertiesCollector::" + + method + "() with collector name: " + name; + Log(info_log, "%s", msg.c_str()); +} + +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, + const Slice& value, + const Options::TablePropertiesCollectors& collectors, + Logger* info_log) { + bool all_succeeded = true; + for (auto collector : collectors) { + Status s = collector->Add(key, value); + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError(info_log, "Add" /* method */, + collector->Name()); + } + } + return all_succeeded; +} + +bool NotifyCollectTableCollectorsOnFinish( + const Options::TablePropertiesCollectors& collectors, + Logger* 
info_log, + PropertyBlockBuilder* builder) { + bool all_succeeded = true; + for (auto collector : collectors) { + UserCollectedProperties user_collected_properties; + Status s = collector->Finish(&user_collected_properties); + + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError(info_log, "Finish" /* method */, + collector->Name()); + } else { + builder->Add(user_collected_properties); + } + } + + return all_succeeded; +} + +Status ReadProperties(const Slice& handle_value, RandomAccessFile* file, + Env* env, Logger* logger, + TableProperties** table_properties) { + assert(table_properties); + + Slice v = handle_value; + BlockHandle handle; + if (!handle.DecodeFrom(&v).ok()) { + return Status::InvalidArgument("Failed to decode properties block handle"); + } + + BlockContents block_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + Status s = ReadBlockContents(file, read_options, handle, &block_contents, env, + false); + + if (!s.ok()) { + return s; + } + + Block properties_block(block_contents); + std::unique_ptr iter( + properties_block.NewIterator(BytewiseComparator())); + + auto new_table_properties = new TableProperties(); + // All pre-defined properties of type uint64_t + std::unordered_map predefined_uint64_properties = { + {TablePropertiesNames::kDataSize, &new_table_properties->data_size}, + {TablePropertiesNames::kIndexSize, &new_table_properties->index_size}, + {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, + {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, + {TablePropertiesNames::kRawValueSize, + &new_table_properties->raw_value_size}, + {TablePropertiesNames::kNumDataBlocks, + &new_table_properties->num_data_blocks}, + {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kFormatVersion, + &new_table_properties->format_version}, + {TablePropertiesNames::kFixedKeyLen, + 
&new_table_properties->fixed_key_len}, }; + + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block is strictly sorted with no duplicate key. + assert(last_key.empty() || + BytewiseComparator()->Compare(key, last_key) > 0); + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (pos != predefined_uint64_properties.end()) { + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "[Warning] detect malformed value in properties meta-block:" + "\tkey: " + key + "\tval: " + raw_val.ToString(); + Log(logger, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kFilterPolicy) { + new_table_properties->filter_policy_name = raw_val.ToString(); + } else { + // handle user-collected properties + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + } + if (s.ok()) { + *table_properties = new_table_properties; + } else { + delete new_table_properties; + } + + return s; +} + +Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, + uint64_t table_magic_number, Env* env, + Logger* info_log, TableProperties** properties) { + // -- Read metaindex block + Footer footer(table_magic_number); + auto s = ReadFooterFromFile(file, file_size, &footer); + if (!s.ok()) { + return s; + } + + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + s = ReadBlockContents(file, read_options, metaindex_handle, + &metaindex_contents, env, false); + if (!s.ok()) { + return s; + } + Block metaindex_block(metaindex_contents); + std::unique_ptr meta_iter( + metaindex_block.NewIterator(BytewiseComparator())); + + // 
-- Read property block + // This function is not used by BlockBasedTable, so we don't have to + // worry about old properties block name. + meta_iter->Seek(kPropertiesBlock); + TableProperties table_properties; + if (meta_iter->Valid() && + meta_iter->key() == kPropertiesBlock && + meta_iter->status().ok()) { + s = ReadProperties(meta_iter->value(), file, env, info_log, properties); + } else { + s = Status::Corruption( + "Unable to read the property block from the plain table"); + } + + return s; +} + +Status ReadTableMagicNumber(const std::string& file_path, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number) { + unique_ptr file; + Status s = options.env->NewRandomAccessFile(file_path, &file, env_options); + if (!s.ok()) { + return s; + } + + uint64_t file_size; + options.env->GetFileSize(file_path, &file_size); + return ReadTableMagicNumber(file.get(), file_size, options, env_options, + table_magic_number); +} + +Status ReadTableMagicNumber(RandomAccessFile* file, uint64_t file_size, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number) { + if (file_size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + Footer footer; + auto s = ReadFooterFromFile(file, file_size, &footer); + if (!s.ok()) { + return s; + } + + *table_magic_number = footer.table_magic_number(); + return Status::OK(); +} + +} // namespace rocksdb diff --git a/table/meta_blocks.h b/table/meta_blocks.h new file mode 100644 index 00000000..a355905a --- /dev/null +++ b/table/meta_blocks.h @@ -0,0 +1,132 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#pragma once + +#include +#include +#include + +#include "db/builder.h" +#include "rocksdb/comparator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table_properties.h" +#include "table/block_builder.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class Env; +class Logger; +class RandomAccessFile; +struct TableProperties; + +// An STL style comparator that does the bytewise comparator comparasion +// internally. +struct BytewiseLessThan { + bool operator()(const std::string& key1, const std::string& key2) const { + // smaller entries will be placed in front. + return comparator->Compare(key1, key2) <= 0; + } + + const Comparator* comparator = BytewiseComparator(); +}; + +// When writing to a block that requires entries to be sorted by +// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap` +// before writng to store. +typedef std::map BytewiseSortedMap; + +class MetaIndexBuilder { + public: + MetaIndexBuilder(const MetaIndexBuilder&) = delete; + MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete; + + MetaIndexBuilder(); + void Add(const std::string& key, const BlockHandle& handle); + + // Write all the added key/value pairs to the block and return the contents + // of the block. + Slice Finish(); + + private: + // store the sorted key/handle of the metablocks. 
+ BytewiseSortedMap meta_block_handles_; + std::unique_ptr meta_index_block_; +}; + +class PropertyBlockBuilder { + public: + PropertyBlockBuilder(const PropertyBlockBuilder&) = delete; + PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete; + + PropertyBlockBuilder(); + + void AddTableProperty(const TableProperties& props); + void Add(const std::string& key, uint64_t value); + void Add(const std::string& key, const std::string& value); + void Add(const UserCollectedProperties& user_collected_properties); + + // Write all the added entries to the block and return the block contents + Slice Finish(); + + private: + std::unique_ptr properties_block_; + BytewiseSortedMap props_; +}; + +// Were we encounter any error occurs during user-defined statistics collection, +// we'll write the warning message to info log. +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name); + +// Utility functions help table builder to trigger batch events for user +// defined property collectors. +// Return value indicates if there is any error occurred; if error occurred, +// the warning message will be logged. +// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all +// property collectors. +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, + const Slice& value, + const Options::TablePropertiesCollectors& collectors, + Logger* info_log); + +// NotifyCollectTableCollectorsOnAdd() triggers the `Finish` event for all +// property collectors. The collected properties will be added to `builder`. +bool NotifyCollectTableCollectorsOnFinish( + const Options::TablePropertiesCollectors& collectors, + Logger* info_log, + PropertyBlockBuilder* builder); + +// Read the properties from the table. +// @returns a status to indicate if the operation succeeded. On success, +// *table_properties will point to a heap-allocated TableProperties +// object, otherwise value of `table_properties` will not be modified. 
+Status ReadProperties(const Slice& handle_value, RandomAccessFile* file, + Env* env, Logger* logger, + TableProperties** table_properties); + +// Directly read the properties from the properties block of a plain table. +// @returns a status to indicate if the operation succeeded. On success, +// *table_properties will point to a heap-allocated TableProperties +// object, otherwise value of `table_properties` will not be modified. +Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, + uint64_t table_magic_number, Env* env, + Logger* info_log, TableProperties** properties); + +// Read the magic number of the specified file directly. The magic number +// of a valid sst table the last 8-byte of the file. +Status ReadTableMagicNumber(const std::string& file_path, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number); + +Status ReadTableMagicNumber(RandomAccessFile* file, uint64_t file_size, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number); +} // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc new file mode 100644 index 00000000..fad7b455 --- /dev/null +++ b/table/plain_table_builder.cc @@ -0,0 +1,207 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/plain_table_builder.h" + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "table/plain_table_factory.h" +#include "db/dbformat.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +namespace { + +// a utility that helps writing block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle this particular block. +Status WriteBlock( + const Slice& block_contents, + WritableFile* file, + uint64_t* offset, + BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + Status s = file->Append(block_contents); + + if (s.ok()) { + *offset += block_contents.size(); + } + return s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.plain.table | sha1sum +// and taking the leading 64 bits. +extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + +PlainTableBuilder::PlainTableBuilder(const Options& options, + WritableFile* file, + uint32_t user_key_len) : + options_(options), file_(file), user_key_len_(user_key_len) { + properties_.fixed_key_len = user_key_len; + + // for plain table, we put all the data in a big chuck. + properties_.num_data_blocks = 1; + // emphasize that currently plain table doesn't have persistent index or + // filter block. 
+ properties_.index_size = 0; + properties_.filter_size = 0; + properties_.format_version = 0; +} + +PlainTableBuilder::~PlainTableBuilder() { +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + size_t user_key_size = key.size() - 8; + assert(user_key_len_ == 0 || user_key_size == user_key_len_); + + if (!IsFixedLength()) { + // Write key length + char key_size_buf[5]; // tmp buffer for key size as varint32 + char* ptr = EncodeVarint32(key_size_buf, user_key_size); + assert(ptr <= key_size_buf + sizeof(key_size_buf)); + auto len = ptr - key_size_buf; + file_->Append(Slice(key_size_buf, len)); + offset_ += len; + } + + // Write key + ParsedInternalKey parsed_key; + if (!ParseInternalKey(key, &parsed_key)) { + status_ = Status::Corruption(Slice()); + return; + } + // For value size as varint32 (up to 5 bytes). + // If the row is of value type with seqId 0, flush the special flag together + // in this buffer to safe one file append call, which takes 1 byte. + char value_size_buf[6]; + size_t value_size_buf_size = 0; + if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { + file_->Append(Slice(key.data(), user_key_size)); + offset_ += user_key_size; + value_size_buf[0] = PlainTableFactory::kValueTypeSeqId0; + value_size_buf_size = 1; + } else { + file_->Append(key); + offset_ += key.size(); + } + + // Write value length + int value_size = value.size(); + char* end_ptr = + EncodeVarint32(value_size_buf + value_size_buf_size, value_size); + assert(end_ptr <= value_size_buf + sizeof(value_size_buf)); + value_size_buf_size = end_ptr - value_size_buf; + file_->Append(Slice(value_size_buf, value_size_buf_size)); + + // Write value + file_->Append(value); + offset_ += value_size + value_size_buf_size; + + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, + value, + 
options_.table_properties_collectors, + options_.info_log.get() + ); +} + +Status PlainTableBuilder::status() const { return status_; } + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + + properties_.data_size = offset_; + + // Write the following blocks + // 1. [meta block: properties] + // 2. [metaindex block] + // 3. [footer] + MetaIndexBuilder meta_index_builer; + + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish( + options_.table_properties_collectors, + options_.info_log.get(), + &property_block_builder + ); + + // -- Write property block + BlockHandle property_block_handle; + auto s = WriteBlock( + property_block_builder.Finish(), + file_, + &offset_, + &property_block_handle + ); + if (!s.ok()) { + return s; + } + meta_index_builer.Add(kPropertiesBlock, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + s = WriteBlock( + meta_index_builer.Finish(), + file_, + &offset_, + &metaindex_block_handle + ); + if (!s.ok()) { + return s; + } + + // Write Footer + Footer footer(kPlainTableMagicNumber); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(BlockHandle::NullBlockHandle()); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + s = file_->Append(footer_encoding); + if (s.ok()) { + offset_ += footer_encoding.size(); + } + + return s; +} + +void PlainTableBuilder::Abandon() { + closed_ = true; +} + +uint64_t PlainTableBuilder::NumEntries() const { + return properties_.num_entries; +} + +uint64_t PlainTableBuilder::FileSize() const { + return offset_; +} + +} // namespace rocksdb diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h new file mode 100644 index 00000000..4e0150cd --- /dev/null +++ b/table/plain_table_builder.h @@ -0,0 +1,82 @@ +// Copyright (c) 2011 The 
LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built +// as production quality. + +#pragma once +#include +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/table_builder.h" +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +class TableBuilder; + +class PlainTableBuilder: public TableBuilder { +public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + PlainTableBuilder(const Options& options, WritableFile* file, + uint32_t user_key_size); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~PlainTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. 
+ // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + +private: + Options options_; + WritableFile* file_; + uint64_t offset_ = 0; + Status status_; + TableProperties properties_; + + const size_t user_key_len_; + bool closed_ = false; // Either Finish() or Abandon() has been called. + + bool IsFixedLength() const { + return user_key_len_ > 0; + } + + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; +}; + +} // namespace rocksdb + diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc new file mode 100644 index 00000000..16ee24eb --- /dev/null +++ b/table/plain_table_factory.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/plain_table_factory.h" + +#include +#include +#include "db/dbformat.h" +#include "table/plain_table_builder.h" +#include "table/plain_table_reader.h" +#include "port/port.h" + +namespace rocksdb { + +Status PlainTableFactory::NewTableReader(const Options& options, + const EnvOptions& soptions, + const InternalKeyComparator& icomp, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const { + return PlainTableReader::Open(options, soptions, icomp, std::move(file), + file_size, table, bloom_bits_per_key_, + hash_table_ratio_, index_sparseness_); +} + +TableBuilder* PlainTableFactory::NewTableBuilder( + const Options& options, const InternalKeyComparator& internal_comparator, + WritableFile* file, CompressionType compression_type) const { + return new PlainTableBuilder(options, file, user_key_len_); +} + +extern TableFactory* NewPlainTableFactory(uint32_t user_key_len, + int bloom_bits_per_key, + double hash_table_ratio, + size_t index_sparseness) { + return new PlainTableFactory(user_key_len, bloom_bits_per_key, + hash_table_ratio, index_sparseness); +} + +extern TableFactory* NewTotalOrderPlainTableFactory(uint32_t user_key_len, + int bloom_bits_per_key, + size_t index_sparseness) { + return new PlainTableFactory(user_key_len, bloom_bits_per_key, 0, + index_sparseness); +} + +} // namespace rocksdb diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h new file mode 100644 index 00000000..a0a7fbe6 --- /dev/null +++ b/table/plain_table_factory.h @@ -0,0 +1,85 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct Options; +struct EnvOptions; + +using std::unique_ptr; +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// IndexedTable requires fixed length key, configured as a constructor +// parameter of the factory class. Output file format: +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------------------------+ <= key1 offset +// | [key_size] | key1 | value_size | | +// +------------+-------------+-------------+ | +// | value1 | +// | | +// +----------------------------------------+---+ <= key2 offset +// | [key_size] | key2 | value_size | | +// +------------+-------------+-------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ +// If user_key_length = kPlainTableVariableLength, it means the key is variable +// length, there will be an extra field for key size encoded before every key. +class PlainTableFactory : public TableFactory { + public: + ~PlainTableFactory() {} + // user_key_size is the length of the user key. If it is set to be + // kPlainTableVariableLength, then it means variable length. Otherwise, all + // the keys need to have the fix length of this value. bloom_bits_per_key is + // number of bits used for bloom filer per key. hash_table_ratio is + // the desired utilization of the hash table used for prefix hashing. + // hash_table_ratio = number of prefixes / #buckets in the hash table + // hash_table_ratio = 0 means skip hash table but only replying on binary + // search. + // index_sparseness determines index interval for keys + // inside the same prefix. It will be the maximum number of linear search + // required after hash and binary search. + // index_sparseness = 0 means index for every key. 
+ explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength, + int bloom_bits_per_key = 0, + double hash_table_ratio = 0.75, + size_t index_sparseness = 16) + : user_key_len_(user_key_len), + bloom_bits_per_key_(bloom_bits_per_key), + hash_table_ratio_(hash_table_ratio), + index_sparseness_(index_sparseness) {} + const char* Name() const override { return "PlainTable"; } + Status NewTableReader(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder(const Options& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + CompressionType compression_type) const + override; + + static const char kValueTypeSeqId0 = 0xFF; + + private: + uint32_t user_key_len_; + int bloom_bits_per_key_; + double hash_table_ratio_; + size_t index_sparseness_; +}; + +} // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc new file mode 100644 index 00000000..436d13bf --- /dev/null +++ b/table/plain_table_reader.cc @@ -0,0 +1,747 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/plain_table_reader.h" + +#include +#include + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" + +#include "table/block.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/two_level_iterator.h" +#include "table/plain_table_factory.h" + +#include "util/coding.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" +#include "util/histogram.h" +#include "util/murmurhash.h" +#include "util/perf_context_imp.h" +#include "util/stop_watch.h" + + +namespace rocksdb { + +namespace { + +inline uint32_t GetSliceHash(const Slice& s) { + return Hash(s.data(), s.size(), 397) ; +} + +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + return hash % num_buckets; +} + +// Safely getting a uint32_t element from a char array, where, starting from +// `base`, every 4 bytes are considered as an fixed 32 bit integer. 
+inline uint32_t GetFixed32Element(const char* base, size_t offset) { + return DecodeFixed32(base + offset * sizeof(uint32_t)); +} + +} // namespace + +// Iterator to iterate IndexedTable +class PlainTableIterator : public Iterator { + public: + explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); + ~PlainTableIterator(); + + bool Valid() const; + + void SeekToFirst(); + + void SeekToLast(); + + void Seek(const Slice& target); + + void Next(); + + void Prev(); + + Slice key() const; + + Slice value() const; + + Status status() const; + + private: + PlainTableReader* table_; + bool use_prefix_seek_; + uint32_t offset_; + uint32_t next_offset_; + IterKey key_; + Slice value_; + Status status_; + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +extern const uint64_t kPlainTableMagicNumber; +PlainTableReader::PlainTableReader( + const Options& options, unique_ptr&& file, + const EnvOptions& storage_options, const InternalKeyComparator& icomparator, + uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness, const TableProperties* table_properties) + : options_(options), + soptions_(storage_options), + file_(std::move(file)), + internal_comparator_(icomparator), + file_size_(file_size), + kHashTableRatio(hash_table_ratio), + kBloomBitsPerKey(bloom_bits_per_key), + kIndexIntervalForSamePrefixKeys(index_sparseness), + table_properties_(table_properties), + data_end_offset_(table_properties_->data_size), + user_key_len_(table_properties->fixed_key_len) { + assert(kHashTableRatio >= 0.0); +} + +PlainTableReader::~PlainTableReader() { +} + +Status PlainTableReader::Open( + const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness) { + 
assert(options.allow_mmap_reads); + + if (file_size > kMaxFileSize) { + return Status::NotSupported("File is too large for PlainTableReader!"); + } + + TableProperties* props = nullptr; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + options.env, options.info_log.get(), &props); + if (!s.ok()) { + return s; + } + + std::unique_ptr new_reader(new PlainTableReader( + options, std::move(file), soptions, internal_comparator, file_size, + bloom_bits_per_key, hash_table_ratio, index_sparseness, props)); + + // -- Populate Index + s = new_reader->PopulateIndex(); + if (!s.ok()) { + return s; + } + + *table_reader = std::move(new_reader); + return s; +} + +void PlainTableReader::SetupForCompaction() { +} + +bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) { + return true; +} + +Iterator* PlainTableReader::NewIterator(const ReadOptions& options) { + return new PlainTableIterator(this, options.prefix_seek); +} + +struct PlainTableReader::IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; +}; + +// Helper class to track all the index records +class PlainTableReader::IndexRecordList { + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + num_records_in_current_group_(num_records_per_group) {} + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(murmur_t hash, uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; + } + + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; + } + 
IndexRecord* At(size_t index) { + return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); + } + + private: + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; + groups_.push_back(result); + return result; + } + + // Each group in `groups_` contains fix-sized records (determined by + // kNumRecordsPerGroup). Which can help us minimize the cost if resizing + // occurs. + const size_t kNumRecordsPerGroup; + IndexRecord* current_group_; + // List of arrays allocated + std::vector groups_; + size_t num_records_in_current_group_; +}; + +Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, + int* num_prefixes) const { + Slice prev_key_prefix_slice; + uint32_t prev_key_prefix_hash = 0; + uint32_t pos = data_start_offset_; + int num_keys_per_prefix = 0; + bool is_first_record = true; + HistogramImpl keys_per_prefix_hist; + // Need map to be ordered to make sure sub indexes generated + // are in order. + + *num_prefixes = 0; + while (pos < data_end_offset_) { + uint32_t key_offset = pos; + ParsedInternalKey key; + Slice value_slice; + Status s = Next(&pos, &key, &value_slice); + if (!s.ok()) { + return s; + } + if (bloom_) { + // total order mode and bloom filter is enabled. 
+ bloom_->AddHash(GetSliceHash(key.user_key)); + } + Slice key_prefix_slice = GetPrefix(key); + + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + ++(*num_prefixes); + if (!is_first_record) { + keys_per_prefix_hist.Add(num_keys_per_prefix); + } + num_keys_per_prefix = 0; + prev_key_prefix_slice = key_prefix_slice; + prev_key_prefix_hash = GetSliceHash(key_prefix_slice); + } + + if (kIndexIntervalForSamePrefixKeys == 0 || + num_keys_per_prefix++ % kIndexIntervalForSamePrefixKeys == 0) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list->AddRecord(prev_key_prefix_hash, key_offset); + } + is_first_record = false; + } + + keys_per_prefix_hist.Add(num_keys_per_prefix); + Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist.ToString().c_str()); + + return Status::OK(); +} + +void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) { + index_.reset(); + + if (options_.prefix_extractor.get() != nullptr) { + uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey; + if (bloom_total_bits > 0) { + bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality)); + } + } + + if (options_.prefix_extractor.get() == nullptr || kHashTableRatio <= 0) { + // Fall back to pure binary search if the user fails to specify a prefix + // extractor. 
+ index_size_ = 1; + } else { + double hash_table_size_multipier = 1.0 / kHashTableRatio; + index_size_ = num_prefixes * hash_table_size_multipier + 1; + } + index_.reset(new uint32_t[index_size_]); +} + +size_t PlainTableReader::BucketizeIndexesAndFillBloom( + IndexRecordList* record_list, std::vector* hash_to_offsets, + std::vector* entries_per_bucket) { + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list->GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list->At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + if (bloom_ && !IsTotalOrderMode()) { + bloom_->AddHash(cur_hash); + } + } + uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; + index_record->next = prev_bucket_head; + (*hash_to_offsets)[bucket] = index_record; + (*entries_per_bucket)[bucket]++; + } + size_t sub_index_size = 0; + for (auto entry_count : *entries_per_bucket) { + if (entry_count <= 1) { + continue; + } + // Only buckets with more than 1 entry will have subindex. + sub_index_size += VarintLength(entry_count); + // total bytes needed to store these entries' in-file offsets. 
+ sub_index_size += entry_count * kOffsetLen; + } + return sub_index_size; +} + +void PlainTableReader::FillIndexes( + const size_t kSubIndexSize, + const std::vector& hash_to_offsets, + const std::vector& entries_per_bucket) { + Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", + kSubIndexSize); + sub_index_.reset(new char[kSubIndexSize]); + size_t sub_index_offset = 0; + for (int i = 0; i < index_size_; i++) { + uint32_t num_keys_for_bucket = entries_per_bucket[i]; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + index_[i] = data_end_offset_; + break; + case 1: + // point directly to the file offset + index_[i] = hash_to_offsets[i]->offset; + break; + default: + // point to second level indexes. + index_[i] = sub_index_offset | kSubIndexMask; + char* prev_ptr = &sub_index_[sub_index_offset]; + char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += (cur_ptr - prev_ptr); + char* sub_index_pos = &sub_index_[sub_index_offset]; + IndexRecord* record = hash_to_offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { + EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); + } + assert(j == -1 && record == nullptr); + sub_index_offset += kOffsetLen * num_keys_for_bucket; + assert(sub_index_offset <= kSubIndexSize); + break; + } + } + assert(sub_index_offset == kSubIndexSize); + + Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + index_size_, kSubIndexSize); +} + +Status PlainTableReader::PopulateIndex() { + // options.prefix_extractor is requried for a hash-based look-up. + if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) { + return Status::NotSupported( + "PlainTable requires a prefix extractor enable prefix hash mode."); + } + + // Get mmapped memory to file_data_. 
+ Status s = file_->Read(0, file_size_, &file_data_, nullptr); + if (!s.ok()) { + return s; + } + + IndexRecordList record_list(kRecordsPerGroup); + // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows + // for a prefix (starting from the first one), generate a record of (hash, + // offset) and append it to IndexRecordList, which is a data structure created + // to store them. + int num_prefixes; + + // Allocate bloom filter here for total order mode. + if (IsTotalOrderMode()) { + uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey; + if (num_bloom_bits > 0) { + bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality)); + } + } + + s = PopulateIndexRecordList(&record_list, &num_prefixes); + if (!s.ok()) { + return s; + } + // Calculated hash table and bloom filter size and allocate memory for indexes + // and bloom filter based on the number of prefixes. + AllocateIndexAndBloom(num_prefixes); + + // Bucketize all the index records to a temp data structure, in which for + // each bucket, we generate a linked list of IndexRecord, in reversed order. + std::vector hash_to_offsets(index_size_, nullptr); + std::vector entries_per_bucket(index_size_, 0); + size_t sub_index_size_needed = BucketizeIndexesAndFillBloom( + &record_list, &hash_to_offsets, &entries_per_bucket); + // From the temp data structure, populate indexes. 
+ FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket); + + return Status::OK(); +} + +Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t* offset) const { + prefix_matched = false; + int bucket = GetBucketIdFromHash(prefix_hash, index_size_); + uint32_t bucket_value = index_[bucket]; + if (bucket_value == data_end_offset_) { + *offset = data_end_offset_; + return Status::OK(); + } else if ((bucket_value & kSubIndexMask) == 0) { + // point directly to the file + *offset = bucket_value; + return Status::OK(); + } + + // point to sub-index, need to do a binary search + uint32_t low = 0; + uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask; + + const char* index_ptr = &sub_index_[prefix_index_offset]; + uint32_t upper_bound = 0; + const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound); + uint32_t high = upper_bound; + ParsedInternalKey mid_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + // The key is between [low, high). Do a binary search between it. + while (high - low > 1) { + uint32_t mid = (high + low) / 2; + uint32_t file_offset = GetFixed32Element(base_ptr, mid); + size_t tmp; + Status s = ReadKey(file_data_.data() + file_offset, &mid_key, &tmp); + if (!s.ok()) { + return s; + } + int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); + if (cmp_result < 0) { + low = mid; + } else { + if (cmp_result == 0) { + // Happen to have found the exact key or target is smaller than the + // first key after base_offset. + prefix_matched = true; + *offset = file_offset; + return Status::OK(); + } else { + high = mid; + } + } + } + // Both of the key at the position low or low+1 could share the same + // prefix as target. We need to rule out one of them to avoid to go + // to the wrong prefix. 
+ ParsedInternalKey low_key; + size_t tmp; + uint32_t low_key_offset = GetFixed32Element(base_ptr, low); + Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, &tmp); + if (GetPrefix(low_key) == prefix) { + prefix_matched = true; + *offset = low_key_offset; + } else if (low + 1 < upper_bound) { + // There is possible a next prefix, return it + prefix_matched = false; + *offset = GetFixed32Element(base_ptr, low + 1); + } else { + // target is larger than a key of the last prefix in this bucket + // but with a different prefix. Key does not exist. + *offset = data_end_offset_; + } + return Status::OK(); +} + +bool PlainTableReader::MatchBloom(uint32_t hash) const { + return bloom_.get() == nullptr || bloom_->MayContainHash(hash); +} + +Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); +} + +Status PlainTableReader::ReadKey(const char* start, ParsedInternalKey* key, + size_t* bytes_read) const { + const char* key_ptr = nullptr; + *bytes_read = 0; + size_t user_key_size = 0; + if (IsFixedLength()) { + user_key_size = user_key_len_; + key_ptr = start; + } else { + uint32_t tmp_size = 0; + key_ptr = + GetVarint32Ptr(start, file_data_.data() + data_end_offset_, &tmp_size); + if (key_ptr == nullptr) { + return Status::Corruption( + "Unexpected EOF when reading the next key's size"); + } + user_key_size = (size_t)tmp_size; + *bytes_read = key_ptr - start; + } + if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) { + return Status::Corruption("Unexpected EOF when reading the next key"); + } + + if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) { + // Special encoding for the row with seqID=0 + key->user_key = Slice(key_ptr, user_key_size); + key->sequence = 0; + key->type = kTypeValue; + *bytes_read += user_key_size + 1; + } else { + if (start + user_key_size + 8 >= file_data_.data() + data_end_offset_) { + return Status::Corruption( + "Unexpected 
EOF when reading internal bytes of the next key"); + } + if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) { + return Status::Corruption( + Slice("Incorrect value type found when reading the next key")); + } + *bytes_read += user_key_size + 8; + } + + return Status::OK(); +} + +Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key, + Slice* value) const { + if (*offset == data_end_offset_) { + *offset = data_end_offset_; + return Status::OK(); + } + + if (*offset > data_end_offset_) { + return Status::Corruption("Offset is out of file size"); + } + + const char* start = file_data_.data() + *offset; + size_t bytes_for_key; + Status s = ReadKey(start, key, &bytes_for_key); + if (!s.ok()) { + return s; + } + uint32_t value_size; + const char* value_ptr = GetVarint32Ptr( + start + bytes_for_key, file_data_.data() + data_end_offset_, &value_size); + if (value_ptr == nullptr) { + return Status::Corruption( + "Unexpected EOF when reading the next value's size."); + } + *offset = *offset + (value_ptr - start) + value_size; + if (*offset > data_end_offset_) { + return Status::Corruption("Unexpected EOF when reading the next value. "); + } + *value = Slice(value_ptr, value_size); + + return Status::OK(); +} + +Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, + void* arg, + bool (*saver)(void*, const ParsedInternalKey&, + const Slice&, bool), + void (*mark_key_may_exist)(void*)) { + // Check bloom filter first. + Slice prefix_slice; + uint32_t prefix_hash; + if (IsTotalOrderMode()) { + // Match whole user key for bloom filter check. + if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { + return Status::OK(); + } + // in total order mode, there is only one bucket 0, and we always use empty + // prefix. 
+ prefix_slice = Slice(); + prefix_hash = 0; + } else { + prefix_slice = GetPrefix(target); + prefix_hash = GetSliceHash(prefix_slice); + if (!MatchBloom(prefix_hash)) { + return Status::OK(); + } + } + uint32_t offset; + bool prefix_match; + Status s = + GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset); + if (!s.ok()) { + return s; + } + ParsedInternalKey found_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + Slice found_value; + while (offset < data_end_offset_) { + Status s = Next(&offset, &found_key, &found_value); + if (!s.ok()) { + return s; + } + if (!prefix_match) { + // Need to verify prefix for the first key found if it is not yet + // checked. + if (GetPrefix(found_key) != prefix_slice) { + return Status::OK(); + } + prefix_match = true; + } + if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { + if (!(*saver)(arg, found_key, found_value, true)) { + break; + } + } + } + return Status::OK(); +} + +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { + return 0; +} + +PlainTableIterator::PlainTableIterator(PlainTableReader* table, + bool use_prefix_seek) + : table_(table), use_prefix_seek_(use_prefix_seek) { + next_offset_ = offset_ = table_->data_end_offset_; +} + +PlainTableIterator::~PlainTableIterator() { +} + +bool PlainTableIterator::Valid() const { + return offset_ < table_->data_end_offset_ + && offset_ >= table_->data_start_offset_; +} + +void PlainTableIterator::SeekToFirst() { + next_offset_ = table_->data_start_offset_; + if (next_offset_ >= table_->data_end_offset_) { + next_offset_ = offset_ = table_->data_end_offset_; + } else { + Next(); + } +} + +void PlainTableIterator::SeekToLast() { + assert(false); + status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); +} + +void PlainTableIterator::Seek(const Slice& target) { + // If the user doesn't set prefix seek option and we are not able 
to do a + // total Seek(). assert failure. + if (!use_prefix_seek_ && table_->index_size_ > 1) { + assert(false); + status_ = Status::NotSupported( + "PlainTable cannot issue non-prefix seek unless in total order mode."); + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash = 0; + // Bloom filter is ignored in total-order mode. + if (!table_->IsTotalOrderMode()) { + prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MatchBloom(prefix_hash)) { + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + } + bool prefix_match; + status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match, + &next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->data_end_offset_; + return; + } + + if (next_offset_ < table_-> data_end_offset_) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (table_->GetPrefix(key()) != prefix_slice) { + offset_ = next_offset_ = table_->data_end_offset_; + break; + } + prefix_match = true; + } + if (table_->internal_comparator_.Compare(key(), target) >= 0) { + break; + } + } + } else { + offset_ = table_->data_end_offset_; + } +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + if (offset_ < table_->data_end_offset_) { + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = table_->Next(&next_offset_, &parsed_key, &value_); + if (status_.ok()) { + // Make a copy in this case. TODO optimize. 
+ key_.SetInternalKey(parsed_key); + } else { + offset_ = next_offset_ = table_->data_end_offset_; + } + } +} + +void PlainTableIterator::Prev() { + assert(false); +} + +Slice PlainTableIterator::key() const { + assert(Valid()); + return key_.GetKey(); +} + +Slice PlainTableIterator::value() const { + assert(Valid()); + return value_; +} + +Status PlainTableIterator::status() const { + return status_; +} + +} // namespace rocksdb diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h new file mode 100644 index 00000000..ac2cb874 --- /dev/null +++ b/table/plain_table_reader.h @@ -0,0 +1,257 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_reader.h" +#include "table/plain_table_factory.h" + +namespace rocksdb { + +class Block; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; +class DynamicBloom; +class InternalKeyComparator; + +using std::unique_ptr; +using std::unordered_map; +extern const uint32_t kPlainTableVariableLength; + +// Based on following output file format shown in plain_table_factory.h +// When opening the output file, IndexedTableReader creates a hash table +// from key prefixes to offset of the output file. IndexedTable will decide +// whether it points to the data offset of the first key with the key prefix +// or the offset of it. If there are too many keys share this prefix, it will +// create a binary search-able index from the suffix to offset on disk. 
+// +// The implementation of IndexedTableReader requires output file is mmaped +class PlainTableReader: public TableReader { + public: + static Status Open(const Options& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table, + const int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness); + + bool PrefixMayMatch(const Slice& internal_prefix); + + Iterator* NewIterator(const ReadOptions&); + + Status Get(const ReadOptions&, const Slice& key, void* arg, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr); + + uint64_t ApproximateOffsetOf(const Slice& key); + + void SetupForCompaction(); + + std::shared_ptr GetTableProperties() const { + return table_properties_; + } + + PlainTableReader(const Options& options, unique_ptr&& file, + const EnvOptions& storage_options, + const InternalKeyComparator& internal_comparator, + uint64_t file_size, int bloom_num_bits, + double hash_table_ratio, size_t index_sparseness, + const TableProperties* table_properties); + virtual ~PlainTableReader(); + + protected: + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + virtual bool MatchBloom(uint32_t hash) const; + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. + // + // index_ contains buckets size of index_size_, each is a + // 32-bit integer. The lower 31 bits contain an offset value (explained below) + // and the first bit of the integer indicates type of the offset. 
+ //
+ // +--------------+------------------------------------------------------+
+ // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
+ // +--------------+------------------------------------------------------+
+ //
+ // Explanation for the "flag bit":
+ //
+ // 0 indicates that the bucket contains only one prefix (no conflict when
+ // hashing this prefix), whose first row starts from this offset of the
+ // file.
+ // 1 indicates that the bucket contains more than one prefix, or there
+ // are too many rows for one prefix so we need a binary search for it. In
+ // this case, the offset indicates the offset of sub_index_ holding the
+ // binary search indexes of keys for those rows. Those binary search indexes
+ // are organized in this way:
+ //
+ // The first 4 bytes, indicate how many indexes (N) are stored after it. After
+ // it, there are N 32-bit integers, each points to an offset of the file,
+ // which
+ // points to starting of a row. Those offsets need to be guaranteed to be in
+ // ascending order so the keys they are pointing to are also in ascending
+ // order
+ // to make sure we can use them to do binary searches. Below is visual
+ // presentation of a bucket.
+ //
+ //
+ // number_of_records: varint32
+ // record 1 file offset: fixedint32
+ // record 2 file offset: fixedint32
+ // ....
+ // record N file offset: fixedint32
+ //
+ Status PopulateIndex();
+
+ private:
+ struct IndexRecord;
+ class IndexRecordList;
+
+ // Plain table maintains an index and a sub index.
+ // index is implemented by a hash table.
+ // subindex is a big chunk of memory array.
+ // For more details about the in-memory index, please refer to: + // https://github.com/facebook/rocksdb/wiki/PlainTable-Format + // #wiki-in-memory-index-format + std::unique_ptr index_; + int index_size_ = 0; + std::unique_ptr sub_index_; + + Options options_; + const EnvOptions& soptions_; + unique_ptr file_; + + const InternalKeyComparator internal_comparator_; + // represents plain table's current status. + Status status_; + + Slice file_data_; + uint32_t file_size_; + + const double kHashTableRatio; + const int kBloomBitsPerKey; + // To speed up the search for keys with same prefix, we'll add index key for + // every N keys, where the "N" is determined by + // kIndexIntervalForSamePrefixKeys + const size_t kIndexIntervalForSamePrefixKeys = 16; + // Bloom filter is used to rule out non-existent key + unique_ptr bloom_; + + std::shared_ptr table_properties_; + // data_start_offset_ and data_end_offset_ defines the range of the + // sst file that stores data. + const uint32_t data_start_offset_ = 0; + const uint32_t data_end_offset_; + const size_t user_key_len_; + + static const size_t kNumInternalBytes = 8; + static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); + static const uint64_t kMaxFileSize = 1u << 31; + static const size_t kRecordsPerGroup = 256; + + bool IsFixedLength() const { + return user_key_len_ != kPlainTableVariableLength; + } + + size_t GetFixedInternalKeyLength() const { + return user_key_len_ + kNumInternalBytes; + } + + friend class TableCache; + friend class PlainTableIterator; + + // Internal helper function to generate an IndexRecordList object from all + // the rows, which contains index records as a list. + // If bloom_ is not null, all the keys' full-key hash will be added to the + // bloom filter. 
+ Status PopulateIndexRecordList(IndexRecordList* record_list, + int* num_prefixes) const; + + // Internal helper function to allocate memory for indexes and bloom filters + void AllocateIndexAndBloom(int num_prefixes); + + // Internal helper function to bucket index record list to hash buckets. + // bucket_header is a vector of size hash_table_size_, with each entry + // containing a linklist of IndexRecord hashed to the same bucket, in reverse + // order. + // of offsets for the hash, in reversed order. + // entries_per_bucket is sized of index_size_. The value is how many index + // records are there in bucket_headers for the same bucket. + size_t BucketizeIndexesAndFillBloom( + IndexRecordList* record_list, std::vector* bucket_headers, + std::vector* entries_per_bucket); + + // Internal helper class to fill the indexes and bloom filters to internal + // data structures. bucket_headers and entries_per_bucket are bucketized + // indexes and counts generated by BucketizeIndexesAndFillBloom(). + void FillIndexes(const size_t kSubIndexSize, + const std::vector& bucket_headers, + const std::vector& entries_per_bucket); + + // Read a plain table key from the position `start`. The read content + // will be written to `key` and the size of read bytes will be populated + // in `bytes_read`. + Status ReadKey(const char* row_ptr, ParsedInternalKey* key, + size_t* bytes_read) const; + // Read the key and value at `offset` to parameters `key` and `value`. + // On success, `offset` will be updated as the offset for the next key. + Status Next(uint32_t* offset, ParsedInternalKey* key, Slice* value) const; + // Get file offset for key target. + // return value prefix_matched is set to true if the offset is confirmed + // for a key with the same prefix as target. 
+ Status GetOffset(const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t* offset) const; + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(GetUserKey(target)); + } + + inline Slice GetPrefix(const ParsedInternalKey& target) const; + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return options_.prefix_extractor->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. In that case, + // it falls back to pure binary search and total iterator seek is + // supported. + return Slice(); + } + } + + bool IsTotalOrderMode() const { + return (options_.prefix_extractor.get() == nullptr); + } + + // No copying allowed + explicit PlainTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; +} // namespace rocksdb diff --git a/table/table_builder.h b/table/table_builder.h new file mode 100644 index 00000000..ee32cff8 --- /dev/null +++ b/table/table_builder.h @@ -0,0 +1,55 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +namespace rocksdb { + +class Slice; +class Status; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). 
+// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. +class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Finish building the table. + // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + virtual uint64_t FileSize() const = 0; +}; + +} // namespace rocksdb diff --git a/table/table_properties.cc b/table/table_properties.cc new file mode 100644 index 00000000..de9eb947 --- /dev/null +++ b/table/table_properties.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +namespace { + void AppendProperty( + std::string& props, + const std::string& key, + const std::string& value, + const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); + } + + template + void AppendProperty( + std::string& props, + const std::string& key, + const TValue& value, + const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty( + props, key, std::to_string(value), prop_delim, kv_delim + ); + } +} + +std::string TableProperties::ToString( + const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, + kv_delim); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty(result, "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, kv_delim); + AppendProperty(result, "raw value size", raw_value_size, prop_delim, + kv_delim); + AppendProperty(result, "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, kv_delim); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); + AppendProperty(result, "filter block size", filter_size, prop_delim, + kv_delim); + AppendProperty(result, "(estimated) table size", + data_size + index_size + filter_size, prop_delim, kv_delim); + + AppendProperty( + result, "filter policy name", + filter_policy_name.empty() ? 
std::string("N/A") : filter_policy_name, + prop_delim, kv_delim); + + return result; +} + +const std::string TablePropertiesNames::kDataSize = + "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = + "rocksdb.index.size"; +const std::string TablePropertiesNames::kFilterSize = + "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = + "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = + "rocksdb.num.entries"; +const std::string TablePropertiesNames::kFilterPolicy = + "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; + +extern const std::string kPropertiesBlock = "rocksdb.properties"; +// Old property block name for backward compatibility +extern const std::string kPropertiesBlockOldName = "rocksdb.stats"; + +} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h new file mode 100644 index 00000000..3d2738c9 --- /dev/null +++ b/table/table_reader.h @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include + +namespace rocksdb { + +class Iterator; +struct ParsedInternalKey; +class Slice; +struct ReadOptions; +struct TableProperties; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. +class TableReader { + public: + virtual ~TableReader() {} + + // Determine whether there is a chance that the current table file + // contains the key a key starting with iternal_prefix. The specific + // table implementation can use bloom filter and/or other heuristic + // to filter out this table as a whole. + virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + virtual Iterator* NewIterator(const ReadOptions&) = 0; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual std::shared_ptr GetTableProperties() const = 0; + + // Calls (*result_handler)(handle_context, ...) repeatedly, starting with + // the entry found after a call to Seek(key), until result_handler returns + // false, where k is the actual internal key for a row found and v as the + // value of the key. didIO is true if I/O is involved in the operation. May + // not make such a call if filter policy says that key is not present. 
+ // + // mark_key_may_exist_handler needs to be called when it is configured to be + // memory only and the key is not found in the block cache, with + // the parameter to be handle_context. + // + // readOptions is the options for the read + // key is the key to search for + virtual Status Get( + const ReadOptions& readOptions, const Slice& key, void* handle_context, + bool (*result_handler)(void* arg, const ParsedInternalKey& k, + const Slice& v, bool didIO), + void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; +}; + +} // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc new file mode 100644 index 00000000..ab86521f --- /dev/null +++ b/table/table_reader_bench.cc @@ -0,0 +1,273 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "rocksdb/db.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "port/atomic_pointer.h" +#include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" +#include "table/table_builder.h" +#include "util/histogram.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { +// Make a key that i determines the first 4 characters and j determines the +// last 4 characters. +static std::string MakeKey(int i, int j, bool through_db) { + char buf[100]; + snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j); + if (through_db) { + return std::string(buf); + } + // If we directly query table, which operates on internal keys + // instead of user keys, we need to add 8 bytes of internal + // information (row type etc) to user key to make an internal + // key. 
+ InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+ return key.Encode().ToString();
+}
+
+static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey,
+ const Slice& v, bool didIO) {
+ return false;
+}
+
+uint64_t Now(Env* env, bool measured_by_nanosecond) {
+ return measured_by_nanosecond ? env->NowNanos() : env->NowMicros();
+}
+
+// A very simple benchmark that:
+// Create a table with roughly numKey1 * numKey2 keys,
+// where there are numKey1 prefixes of the key, each has numKey2 number of
+// distinguished keys, differing in the suffix part.
+// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2
+// times randomly.
+// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys.
+// Print out the total time.
+// If through_db=true, a full DB will be created and queries will be against
+// it. Otherwise, operations will be directly through table level.
+//
+// If for_iterator=true, instead of just query one key each time, it queries
+// a range sharing the same prefix.
+void TableReaderBenchmark(Options& opts, EnvOptions& env_options, + ReadOptions& read_options, int num_keys1, + int num_keys2, int num_iter, int prefix_len, + bool if_query_empty_keys, bool for_iterator, + bool through_db, bool measured_by_nanosecond) { + rocksdb::InternalKeyComparator ikc(opts.comparator); + + Slice prefix = Slice(); + + std::string file_name = test::TmpDir() + + "/rocksdb_table_reader_benchmark"; + std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db"; + WriteOptions wo; + unique_ptr file; + Env* env = Env::Default(); + TableBuilder* tb = nullptr; + DB* db = nullptr; + Status s; + if (!through_db) { + env->NewWritableFile(file_name, &file, env_options); + tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), + CompressionType::kNoCompression); + } else { + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + } + // Populate slightly more than 1M keys + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + std::string key = MakeKey(i * 2, j, through_db); + if (!through_db) { + tb->Add(key, key); + } else { + db->Put(wo, key, key); + } + } + } + if (!through_db) { + tb->Finish(); + file->Close(); + } else { + db->Flush(FlushOptions()); + } + + unique_ptr table_reader; + unique_ptr raf; + if (!through_db) { + Status s = env->NewRandomAccessFile(file_name, &raf, env_options); + uint64_t file_size; + env->GetFileSize(file_name, &file_size); + s = opts.table_factory->NewTableReader( + opts, env_options, ikc, std::move(raf), file_size, &table_reader); + } + + Random rnd(301); + std::string result; + HistogramImpl hist; + + void* arg = nullptr; + for (int it = 0; it < num_iter; it++) { + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + int r1 = rnd.Uniform(num_keys1) * 2; + int r2 = rnd.Uniform(num_keys2); + if (if_query_empty_keys) { + r1++; + r2 = num_keys2 * 2 - r2; + } + + if (!for_iterator) { + // Query one existing key; + std::string key 
= MakeKey(r1, r2, through_db); + uint64_t start_time = Now(env, measured_by_nanosecond); + port::MemoryBarrier(); + if (!through_db) { + s = table_reader->Get(read_options, key, arg, DummySaveValue, + nullptr); + } else { + s = db->Get(read_options, key, &result); + } + port::MemoryBarrier(); + hist.Add(Now(env, measured_by_nanosecond) - start_time); + } else { + int r2_len; + if (if_query_empty_keys) { + r2_len = 0; + } else { + r2_len = rnd.Uniform(num_keys2) + 1; + if (r2_len + r2 > num_keys2) { + r2_len = num_keys2 - r2; + } + } + std::string start_key = MakeKey(r1, r2, through_db); + std::string end_key = MakeKey(r1, r2 + r2_len, through_db); + if (prefix_len < 16) { + prefix = Slice(start_key.data(), prefix_len); + read_options.prefix = &prefix; + } + uint64_t total_time = 0; + uint64_t start_time = Now(env, measured_by_nanosecond); + port::MemoryBarrier(); + Iterator* iter; + if (!through_db) { + iter = table_reader->NewIterator(read_options); + } else { + iter = db->NewIterator(read_options); + } + int count = 0; + for(iter->Seek(start_key); iter->Valid(); iter->Next()) { + if (if_query_empty_keys) { + break; + } + // verify key; + port::MemoryBarrier(); + total_time += Now(env, measured_by_nanosecond) - start_time; + assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key()); + start_time = Now(env, measured_by_nanosecond); + if (++count >= r2_len) { + break; + } + } + if (count != r2_len) { + fprintf( + stderr, "Iterator cannot iterate expected number of entries. 
" + "Expected %d but got %d\n", r2_len, count); + assert(false); + } + delete iter; + port::MemoryBarrier(); + total_time += Now(env, measured_by_nanosecond) - start_time; + hist.Add(total_time); + } + } + } + } + + fprintf( + stderr, + "===================================================" + "====================================================\n" + "InMemoryTableSimpleBenchmark: %20s num_key1: %5d " + "num_key2: %5d %10s\n" + "===================================================" + "====================================================" + "\nHistogram (unit: %s): \n%s", + opts.table_factory->Name(), num_keys1, num_keys2, + for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"), + measured_by_nanosecond ? "nanosecond" : "microsecond", + hist.ToString().c_str()); + if (!through_db) { + env->DeleteFile(file_name); + } else { + delete db; + db = nullptr; + DestroyDB(dbname, opts); + } +} +} // namespace rocksdb + +DEFINE_bool(query_empty, false, "query non-existing keys instead of existing " + "ones."); +DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys"); +DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix"); +DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones"); +DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes"); +DEFINE_bool(iterator, false, "For test iterator"); +DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " + "the query will be against DB. Otherwise, will be directly against " + "a table reader."); +DEFINE_bool(plain_table, false, "Use PlainTable"); +DEFINE_string(time_unit, "microsecond", + "The time unit used for measuring performance. 
User can specify " + "`microsecond` (default) or `nanosecond`"); + +int main(int argc, char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + google::ParseCommandLineFlags(&argc, &argv, true); + + rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory(); + rocksdb::Options options; + if (FLAGS_prefix_len < 16) { + options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len)); + } + rocksdb::ReadOptions ro; + rocksdb::EnvOptions env_options; + options.create_if_missing = true; + options.compression = rocksdb::CompressionType::kNoCompression; + + if (FLAGS_plain_table) { + ro.prefix_seek = true; + options.allow_mmap_reads = true; + env_options.use_mmap_reads = true; + tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8, + 0.75); + options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( + FLAGS_prefix_len)); + } else { + tf = new rocksdb::BlockBasedTableFactory(); + } + // if user provides invalid options, just fall back to microsecond. + bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond"; + + options.table_factory = + std::shared_ptr(tf); + TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1, + FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len, + FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db, + measured_by_nanosecond); + delete tf; + return 0; +} diff --git a/table/table_test.cc b/table/table_test.cc new file mode 100644 index 00000000..c7ddf3f8 --- /dev/null +++ b/table/table_test.cc @@ -0,0 +1,1729 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" + +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" + +#include "table/block.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_factory.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" + +#include "util/random.h" +#include "util/statistics.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +namespace { + +// Return reverse of "key". +// Used to test non-lexicographic comparators. 
+std::string Reverse(const Slice& key) { + auto rev = key.ToString(); + std::reverse(rev.begin(), rev.end()); + return rev; +} + +class ReverseKeyComparator : public Comparator { + public: + virtual const char* Name() const { + return "rocksdb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + virtual void FindShortSuccessor(std::string* key) const { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; + +ReverseKeyComparator reverse_key_comparator; + +void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +// An STL comparator that uses a Comparator +struct STLLessThan { + const Comparator* cmp; + + STLLessThan() : cmp(BytewiseComparator()) { } + explicit STLLessThan(const Comparator* c) : cmp(c) { } + bool operator()(const std::string& a, const std::string& b) const { + return cmp->Compare(Slice(a), Slice(b)) < 0; + } +}; + +} // namespace + +class StringSink: public WritableFile { + public: + ~StringSink() { } + + const std::string& contents() const { return contents_; } + + virtual Status Close() { return Status::OK(); } + virtual Status Flush() { return Status::OK(); } + virtual Status Sync() { return Status::OK(); } + + virtual Status Append(const Slice& data) { + contents_.append(data.data(), data.size()); + return Status::OK(); + } + + private: + std::string contents_; +}; + + +class StringSource: public RandomAccessFile { + public: + 
StringSource(const Slice& contents, uint64_t uniq_id, bool mmap) + : contents_(contents.data(), contents.size()), uniq_id_(uniq_id), + mmap_(mmap) { + } + + virtual ~StringSource() { } + + uint64_t Size() const { return contents_.size(); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (offset > contents_.size()) { + return Status::InvalidArgument("invalid Read offset"); + } + if (offset + n > contents_.size()) { + n = contents_.size() - offset; + } + if (!mmap_) { + memcpy(scratch, &contents_[offset], n); + *result = Slice(scratch, n); + } else { + *result = Slice(&contents_[offset], n); + } + return Status::OK(); + } + + virtual size_t GetUniqueId(char* id, size_t max_size) const { + if (max_size < 20) { + return 0; + } + + char* rid = id; + rid = EncodeVarint64(rid, uniq_id_); + rid = EncodeVarint64(rid, 0); + return static_cast(rid-id); + } + + private: + std::string contents_; + uint64_t uniq_id_; + bool mmap_; +}; + +typedef std::map KVMap; + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) {} + virtual ~Constructor() { } + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. 
Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, + const InternalKeyComparator& internal_comparator, + std::vector* keys, KVMap* kvmap) { + last_internal_key_ = &internal_comparator; + *kvmap = data_; + keys->clear(); + for (KVMap::const_iterator it = data_.begin(); + it != data_.end(); + ++it) { + keys->push_back(it->first); + } + data_.clear(); + Status s = FinishImpl(options, internal_comparator, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) = 0; + + virtual Iterator* NewIterator() const = 0; + + virtual const KVMap& data() { return data_; } + + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + + protected: + const InternalKeyComparator* last_internal_key_; + + private: + KVMap data_; +}; + +class BlockConstructor: public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp), + block_(nullptr) { } + ~BlockConstructor() { + delete block_; + } + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + delete block_; + block_ = nullptr; + BlockBuilder builder(options, &internal_comparator); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + builder.Add(it->first, it->second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + contents.cachable = false; + contents.heap_allocated = false; + block_ = new Block(contents); + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return block_->NewIterator(comparator_); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +// A 
helper class that converts internal format keys into user keys +class KeyConvertingIterator: public Iterator { + public: + explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } + virtual ~KeyConvertingIterator() { delete iter_; } + virtual bool Valid() const { return iter_->Valid(); } + virtual void Seek(const Slice& target) { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + virtual void SeekToFirst() { iter_->SeekToFirst(); } + virtual void SeekToLast() { iter_->SeekToLast(); } + virtual void Next() { iter_->Next(); } + virtual void Prev() { iter_->Prev(); } + + virtual Slice key() const { + assert(Valid()); + ParsedInternalKey key; + if (!ParseInternalKey(iter_->key(), &key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return key.user_key; + } + + virtual Slice value() const { return iter_->value(); } + virtual Status status() const { + return status_.ok() ? 
iter_->status() : status_; + } + + private: + mutable Status status_; + Iterator* iter_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class TableConstructor: public Constructor { + public: + explicit TableConstructor(const Comparator* cmp, + bool convert_to_internal_key = false, + bool prefix_seek = false) + : Constructor(cmp), + convert_to_internal_key_(convert_to_internal_key), + prefix_seek_(prefix_seek) {} + ~TableConstructor() { Reset(); } + + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + Reset(); + sink_.reset(new StringSink()); + unique_ptr builder; + builder.reset(options.table_factory->NewTableBuilder( + options, internal_comparator, sink_.get(), options.compression)); + + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + if (convert_to_internal_key_) { + ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder->Add(encoded, it->second); + } else { + builder->Add(it->first, it->second); + } + ASSERT_TRUE(builder->status().ok()); + } + Status s = builder->Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + + ASSERT_EQ(sink_->contents().size(), builder->FileSize()); + + // Open the table + uniq_id_ = cur_uniq_id_++; + source_.reset(new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, internal_comparator, std::move(source_), + sink_->contents().size(), &table_reader_); + } + + virtual Iterator* NewIterator() const { + ReadOptions ro; + if (prefix_seek_) { + ro.prefix_seek = true; + } + Iterator* iter = table_reader_->NewIterator(ro); + if (convert_to_internal_key_) { + return new KeyConvertingIterator(iter); + } else { + return iter; + } + } + + uint64_t ApproximateOffsetOf(const Slice& key) 
const { + return table_reader_->ApproximateOffsetOf(key); + } + + virtual Status Reopen(const Options& options) { + source_.reset( + new StringSource(sink_->contents(), uniq_id_, + options.allow_mmap_reads)); + return options.table_factory->NewTableReader( + options, soptions, *last_internal_key_, std::move(source_), + sink_->contents().size(), &table_reader_); + } + + virtual TableReader* table_reader() { + return table_reader_.get(); + } + + private: + void Reset() { + uniq_id_ = 0; + table_reader_.reset(); + sink_.reset(); + source_.reset(); + } + bool convert_to_internal_key_; + bool prefix_seek_; + + uint64_t uniq_id_; + unique_ptr sink_; + unique_ptr source_; + unique_ptr table_reader_; + + TableConstructor(); + + static uint64_t cur_uniq_id_; + const EnvOptions soptions; +}; +uint64_t TableConstructor::cur_uniq_id_ = 1; + +class MemTableConstructor: public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp) + : Constructor(cmp), + internal_comparator_(cmp), + table_factory_(new SkipListFactory) { + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); + memtable_->Ref(); + } + ~MemTableConstructor() { + delete memtable_->Unref(); + } + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + delete memtable_->Unref(); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); + memtable_->Ref(); + int seq = 1; + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + memtable_->Add(seq, kTypeValue, it->first, it->second); + seq++; + } + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return new KeyConvertingIterator(memtable_->NewIterator()); + } + + private: + InternalKeyComparator internal_comparator_; + MemTable* memtable_; + std::shared_ptr table_factory_; +}; + 
+class DBConstructor: public Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp) { + db_ = nullptr; + NewDB(); + } + ~DBConstructor() { + delete db_; + } + virtual Status FinishImpl(const Options& options, + const InternalKeyComparator& internal_comparator, + const KVMap& data) { + delete db_; + db_ = nullptr; + NewDB(); + for (KVMap::const_iterator it = data.begin(); + it != data.end(); + ++it) { + WriteBatch batch; + batch.Put(it->first, it->second); + ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + virtual Iterator* NewIterator() const { + return db_->NewIterator(ReadOptions()); + } + + virtual DB* db() const { return db_; } + + private: + void NewDB() { + std::string name = test::TmpDir() + "/table_testdb"; + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +static bool SnappyCompressionSupported() { +#ifdef SNAPPY + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Snappy_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +#else + return false; +#endif +} + +static bool ZlibCompressionSupported() { +#ifdef ZLIB + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::Zlib_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +#else + return false; +#endif +} + +static bool BZip2CompressionSupported() { +#ifdef BZIP2 + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::BZip2_Compress(Options().compression_opts, + in.data(), in.size(), + &out); +#else + return 
false; +#endif +} + +static bool LZ4CompressionSupported() { +#ifdef LZ4 + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::LZ4_Compress(Options().compression_opts, in.data(), in.size(), + &out); +#else + return false; +#endif +} + +static bool LZ4HCCompressionSupported() { +#ifdef LZ4 + std::string out; + Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + return port::LZ4HC_Compress(Options().compression_opts, in.data(), in.size(), + &out); +#else + return false; +#endif +} + +enum TestType { + BLOCK_BASED_TABLE_TEST, + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; + CompressionType compression; +}; + +static std::vector GenerateArgList() { + std::vector test_args; + std::vector test_types = { + BLOCK_BASED_TABLE_TEST, PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER, + BLOCK_TEST, MEMTABLE_TEST, + DB_TEST}; + std::vector reverse_compare_types = {false, true}; + std::vector restart_intervals = {16, 1, 1024}; + + // Only add compression if it is supported + std::vector compression_types; + compression_types.push_back(kNoCompression); + if (SnappyCompressionSupported()) { + compression_types.push_back(kSnappyCompression); + } + if (ZlibCompressionSupported()) { + compression_types.push_back(kZlibCompression); + } + if (BZip2CompressionSupported()) { + compression_types.push_back(kBZip2Compression); + } + if (LZ4CompressionSupported()) { + compression_types.push_back(kLZ4Compression); + } + if (LZ4HCCompressionSupported()) { + compression_types.push_back(kLZ4HCCompression); + } + + for (auto test_type : test_types) { + for (auto reverse_compare : reverse_compare_types) { + if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || + test_type == PLAIN_TABLE_FULL_STR_PREFIX) { + // Plain table doesn't use restart index or compression. 
+ TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_intervals[0]; + one_arg.compression = compression_types[0]; + test_args.push_back(one_arg); + continue; + } + + for (auto restart_interval : restart_intervals) { + for (auto compression_type : compression_types) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type; + test_args.push_back(one_arg); + } + } + } + } + return test_args; +} + +// In order to make all tests run for plain table format, including +// those operating on empty keys, create a new prefix transformer which +// return fixed prefix if the slice is not shorter than the prefix length, +// and the full slice if it is shorter. +class FixedOrLessPrefixTransform : public SliceTransform { + private: + const size_t prefix_len_; + + public: + explicit FixedOrLessPrefixTransform(size_t prefix_len) : + prefix_len_(prefix_len) { + } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + if (src.size() < prefix_len_) { + return src; + } + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= prefix_len_); + } +}; + +class Harness { + public: + Harness() : constructor_(nullptr) { } + + void Init(const TestArgs& args) { + delete constructor_; + constructor_ = nullptr; + options_ = Options(); + + options_.block_restart_interval = args.restart_interval; + options_.compression = args.compression; + // Use shorter block size for tests to exercise block boundary + // conditions more. 
+ options_.block_size = 256; + if (args.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + + internal_comparator_.reset( + new test::PlainInternalKeyComparator(options_.comparator)); + + support_prev_ = true; + only_support_prefix_seek_ = false; + BlockBasedTableOptions table_options; + switch (args.type) { + case BLOCK_BASED_TABLE_TEST: + table_options.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + options_.table_factory.reset(new BlockBasedTableFactory(table_options)); + constructor_ = new TableConstructor(options_.comparator); + break; + case PLAIN_TABLE_SEMI_FIXED_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2)); + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_FULL_STR_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(NewNoopTransform()); + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, true); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_TOTAL_ORDER: + support_prev_ = false; + only_support_prefix_seek_ = false; + options_.prefix_extractor = nullptr; + options_.allow_mmap_reads = true; + options_.table_factory.reset(NewTotalOrderPlainTableFactory()); + constructor_ = new TableConstructor(options_.comparator, true, false); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case BLOCK_TEST: + constructor_ = new BlockConstructor(options_.comparator); + break; + case MEMTABLE_TEST: + constructor_ = new MemTableConstructor(options_.comparator); + 
break; + case DB_TEST: + constructor_ = new DBConstructor(options_.comparator); + break; + } + } + + ~Harness() { + delete constructor_; + } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector keys; + KVMap data; + constructor_->Finish(options_, *internal_comparator_, &keys, &data); + + TestForwardScan(keys, data); + if (support_prev_) { + TestBackwardScan(keys, data); + } + TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + for (KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestBackwardScan(const std::vector& keys, + const KVMap& data) { + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + for (KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); + ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + } + ASSERT_TRUE(!iter->Valid()); + delete iter; + } + + void TestRandomAccess(Random* rnd, + const std::vector& keys, + const KVMap& data) { + static const bool kVerbose = false; + Iterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + KVMap::const_iterator model_iter = data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 0; i < 200; i++) { + const int toss = rnd->Uniform(support_prev_ ? 
5 : 3); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) fprintf(stderr, "Seek '%s'\n", + EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + if (model_iter == data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + delete iter; + } + + std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const KVMap& data, + const KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const Iterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector& keys) { + if (keys.empty()) { + return "foo"; + } else { + const int 
index = rnd->Uniform(keys.size()); + std::string result = keys[index]; + switch (rnd->Uniform(support_prev_ ? 3 : 1)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size() - 1] > '\0' + && (!only_support_prefix_seek_ + || options_.prefix_extractor->Transform(result).size() + < result.size())) { + result[result.size() - 1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns nullptr if not running against a DB + DB* db() const { return constructor_->db(); } + + private: + Options options_ = Options(); + Constructor* constructor_; + bool support_prev_; + bool only_support_prefix_seek_; + shared_ptr internal_comparator_; +}; + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +// Tests against all kinds of tables +class TableTest { + public: + const InternalKeyComparator& GetPlainInternalComparator( + const Comparator* comp) { + if (!plain_internal_comparator) { + plain_internal_comparator.reset( + new test::PlainInternalKeyComparator(comp)); + } + return *plain_internal_comparator; + } + + private: + std::unique_ptr plain_internal_comparator; +}; + +class GeneralTableTest : public TableTest {}; +class BlockBasedTableTest : public TableTest {}; +class PlainTableTest : public TableTest {}; +class TablePropertyTest {}; + +// This test serves as the living tutorial for the prefix scan of user collected +// properties. 
+TEST(TablePropertyTest, PrefixScanTest) { + UserCollectedProperties props{{"num.111.1", "1"}, + {"num.111.2", "2"}, + {"num.111.3", "3"}, + {"num.333.1", "1"}, + {"num.333.2", "2"}, + {"num.333.3", "3"}, + {"num.555.1", "1"}, + {"num.555.2", "2"}, + {"num.555.3", "3"}, }; + + // prefixes that exist + for (const std::string& prefix : {"num.111", "num.333", "num.555"}) { + int num = 0; + for (auto pos = props.lower_bound(prefix); + pos != props.end() && + pos->first.compare(0, prefix.size(), prefix) == 0; + ++pos) { + ++num; + auto key = prefix + "." + std::to_string(num); + ASSERT_EQ(key, pos->first); + ASSERT_EQ(std::to_string(num), pos->second); + } + ASSERT_EQ(3, num); + } + + // prefixes that don't exist + for (const std::string& prefix : + {"num.000", "num.222", "num.444", "num.666"}) { + auto pos = props.lower_bound(prefix); + ASSERT_TRUE(pos == props.end() || + pos->first.compare(0, prefix.size(), prefix) != 0); + } +} + +// This test include all the basic checks except those for index size and block +// size, which will be conducted in separated unit tests. 
+TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { + TableConstructor c(BytewiseComparator()); + + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + + std::vector keys; + KVMap kvmap; + Options options; + options.compression = kNoCompression; + options.block_restart_interval = 1; + + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); + + auto& props = *c.table_reader()->GetTableProperties(); + ASSERT_EQ(kvmap.size(), props.num_entries); + + auto raw_key_size = kvmap.size() * 2ul; + auto raw_value_size = kvmap.size() * 4ul; + + ASSERT_EQ(raw_key_size, props.raw_key_size); + ASSERT_EQ(raw_value_size, props.raw_value_size); + ASSERT_EQ(1ul, props.num_data_blocks); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + + // Verify data size. + BlockBuilder block_builder(options, options.comparator); + for (const auto& item : kvmap) { + block_builder.Add(item.first, item.second); + } + Slice content = block_builder.Finish(); + ASSERT_EQ(content.size() + kBlockTrailerSize, props.data_size); +} + +TEST(BlockBasedTableTest, FilterPolicyNameProperties) { + TableConstructor c(BytewiseComparator()); + c.Add("a1", "val1"); + std::vector keys; + KVMap kvmap; + Options options; + std::unique_ptr filter_policy(NewBloomFilterPolicy(10)); + options.filter_policy = filter_policy.get(); + + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); + auto& props = *c.table_reader()->GetTableProperties(); + ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +void AddInternalKey(TableConstructor* c, const std::string prefix, + int suffix_len = 800) { + static Random rnd(1023); + InternalKey k(prefix + 
RandomString(&rnd, 800), 0, kTypeValue); + c->Add(k.Encode().ToString(), "v"); +} + +TEST(TableTest, HashIndexTest) { + TableConstructor c(BytewiseComparator()); + + // keys with prefix length 3, make sure the key/value is big enough to fill + // one block + AddInternalKey(&c, "0015"); + AddInternalKey(&c, "0035"); + + AddInternalKey(&c, "0054"); + AddInternalKey(&c, "0055"); + + AddInternalKey(&c, "0056"); + AddInternalKey(&c, "0057"); + + AddInternalKey(&c, "0058"); + AddInternalKey(&c, "0075"); + + AddInternalKey(&c, "0076"); + AddInternalKey(&c, "0095"); + + std::vector keys; + KVMap kvmap; + Options options; + BlockBasedTableOptions table_options; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.block_cache = NewLRUCache(1024); + options.block_size = 1700; + + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + c.Finish(options, *comparator, &keys, &kvmap); + auto reader = c.table_reader(); + + auto props = c.table_reader()->GetTableProperties(); + ASSERT_EQ(5u, props->num_data_blocks); + + std::unique_ptr hash_iter(reader->NewIterator(ReadOptions())); + + // -- Find keys do not exist, but have common prefix. 
+ std::vector prefixes = {"001", "003", "005", "007", "009"}; + std::vector lower_bound = {keys[0], keys[1], keys[2], + keys[7], keys[9], }; + + // find the lower bound of the prefix + for (size_t i = 0; i < prefixes.size(); ++i) { + hash_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode()); + ASSERT_OK(hash_iter->status()); + ASSERT_TRUE(hash_iter->Valid()); + + // seek the first element in the block + ASSERT_EQ(lower_bound[i], hash_iter->key().ToString()); + ASSERT_EQ("v", hash_iter->value().ToString()); + } + + // find the upper bound of prefixes + std::vector upper_bound = {keys[1], keys[2], keys[7], keys[9], }; + + // find existing keys + for (const auto& item : kvmap) { + auto ukey = ExtractUserKey(item.first).ToString(); + hash_iter->Seek(ukey); + + // ASSERT_OK(regular_iter->status()); + ASSERT_OK(hash_iter->status()); + + // ASSERT_TRUE(regular_iter->Valid()); + ASSERT_TRUE(hash_iter->Valid()); + + ASSERT_EQ(item.first, hash_iter->key().ToString()); + ASSERT_EQ(item.second, hash_iter->value().ToString()); + } + + for (size_t i = 0; i < prefixes.size(); ++i) { + // the key is greater than any existing keys. + auto key = prefixes[i] + "9"; + hash_iter->Seek(InternalKey(key, 0, kTypeValue).Encode()); + + ASSERT_OK(hash_iter->status()); + if (i == prefixes.size() - 1) { + // last key + ASSERT_TRUE(!hash_iter->Valid()); + } else { + ASSERT_TRUE(hash_iter->Valid()); + // seek the first element in the block + ASSERT_EQ(upper_bound[i], hash_iter->key().ToString()); + ASSERT_EQ("v", hash_iter->value().ToString()); + } + } + + // find keys with prefix that don't match any of the existing prefixes. 
+ std::vector non_exist_prefixes = {"002", "004", "006", "008"}; + for (const auto& prefix : non_exist_prefixes) { + hash_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode()); + // regular_iter->Seek(prefix); + + ASSERT_OK(hash_iter->status()); + ASSERT_TRUE(!hash_iter->Valid()); + } +} + +// It's very hard to figure out the index block size of a block accurately. +// To make sure we get the index size, we just make sure as key number +// grows, the filter block size also grows. +TEST(BlockBasedTableTest, IndexSizeStat) { + uint64_t last_index_size = 0; + + // we need to use random keys since the pure human readable texts + // may be well compressed, resulting insignifcant change of index + // block size. + Random rnd(test::RandomSeed()); + std::vector keys; + + for (int i = 0; i < 100; ++i) { + keys.push_back(RandomString(&rnd, 10000)); + } + + // Each time we load one more key to the table. the table index block + // size is expected to be larger than last time's. + for (size_t i = 1; i < keys.size(); ++i) { + TableConstructor c(BytewiseComparator()); + for (size_t j = 0; j < i; ++j) { + c.Add(keys[j], "val"); + } + + std::vector ks; + KVMap kvmap; + Options options; + options.compression = kNoCompression; + options.block_restart_interval = 1; + + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); + auto index_size = c.table_reader()->GetTableProperties()->index_size; + ASSERT_GT(index_size, last_index_size); + last_index_size = index_size; + } +} + +TEST(BlockBasedTableTest, NumBlockStat) { + Random rnd(test::RandomSeed()); + TableConstructor c(BytewiseComparator()); + Options options; + options.compression = kNoCompression; + options.block_restart_interval = 1; + options.block_size = 1000; + + for (int i = 0; i < 10; ++i) { + // the key/val are slightly smaller than block size, so that each block + // holds roughly one key/value pair. 
+ c.Add(RandomString(&rnd, 900), "val"); + } + + std::vector ks; + KVMap kvmap; + c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, + &kvmap); + ASSERT_EQ(kvmap.size(), + c.table_reader()->GetTableProperties()->num_data_blocks); +} + +// A simple tool that takes the snapshot of block cache statistics. +class BlockCachePropertiesSnapshot { + public: + explicit BlockCachePropertiesSnapshot(Statistics* statistics) { + block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + filter_block_cache_miss = + statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS); + filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT); + } + + void AssertIndexBlockStat(int64_t index_block_cache_miss, + int64_t index_block_cache_hit) { + ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); + ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); + } + + void AssertFilterBlockStat(int64_t filter_block_cache_miss, + int64_t filter_block_cache_hit) { + ASSERT_EQ(filter_block_cache_miss, this->filter_block_cache_miss); + ASSERT_EQ(filter_block_cache_hit, this->filter_block_cache_hit); + } + + // Check if the fetched props matches the expected ones. + // TODO(kailiu) Use this only when you disabled filter policy! 
+ void AssertEqual(int64_t index_block_cache_miss, + int64_t index_block_cache_hit, int64_t data_block_cache_miss, + int64_t data_block_cache_hit) const { + ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); + ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); + ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss); + ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit); + ASSERT_EQ(index_block_cache_miss + data_block_cache_miss, + this->block_cache_miss); + ASSERT_EQ(index_block_cache_hit + data_block_cache_hit, + this->block_cache_hit); + } + + private: + int64_t block_cache_miss = 0; + int64_t block_cache_hit = 0; + int64_t index_block_cache_miss = 0; + int64_t index_block_cache_hit = 0; + int64_t data_block_cache_miss = 0; + int64_t data_block_cache_hit = 0; + int64_t filter_block_cache_miss = 0; + int64_t filter_block_cache_hit = 0; +}; + +// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't +// use block cache to store them). +TEST(BlockBasedTableTest, BlockCacheDisabledTest) { + Options options; + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + options.block_cache = NewLRUCache(1024); + std::unique_ptr filter_policy(NewBloomFilterPolicy(10)); + options.filter_policy = filter_policy.get(); + BlockBasedTableOptions table_options; + // Intentionally commented out: table_options.cache_index_and_filter_blocks = + // true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + std::vector keys; + KVMap kvmap; + + TableConstructor c(BytewiseComparator()); + c.Add("key", "value"); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); + + // preloading filter/index blocks is enabled. 
+ auto reader = dynamic_cast(c.table_reader()); + ASSERT_TRUE(reader->TEST_filter_block_preloaded()); + ASSERT_TRUE(reader->TEST_index_reader_preloaded()); + + { + // nothing happens in the beginning + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertIndexBlockStat(0, 0); + props.AssertFilterBlockStat(0, 0); + } + + { + // a hack that just to trigger BlockBasedTable::GetFilter. + reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertIndexBlockStat(0, 0); + props.AssertFilterBlockStat(0, 0); + } +} + +// Due to the difficulities of the intersaction between statistics, this test +// only tests the case when "index block is put to block cache" +TEST(BlockBasedTableTest, FilterBlockInBlockCache) { + // -- Table construction + Options options; + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + options.block_cache = NewLRUCache(1024); + + // Enable the cache for index/filter blocks + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + std::vector keys; + KVMap kvmap; + + TableConstructor c(BytewiseComparator()); + c.Add("key", "value"); + c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, + &kvmap); + // preloading filter/index blocks is prohibited. + auto reader = dynamic_cast(c.table_reader()); + ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_TRUE(!reader->TEST_index_reader_preloaded()); + + // -- PART 1: Open with regular block cache. + // Since block_cache is disabled, no cache activities will be involved. + unique_ptr iter; + + // At first, no block will be accessed. + { + BlockCachePropertiesSnapshot props(options.statistics.get()); + // index will be added to block cache. 
+ props.AssertEqual(1, // index block miss + 0, 0, 0); + } + + // Only index block will be accessed + { + iter.reset(c.NewIterator()); + BlockCachePropertiesSnapshot props(options.statistics.get()); + // NOTE: to help better highlight the "detla" of each ticker, I use + // + to indicate the increment of changed + // value; other numbers remain the same. + props.AssertEqual(1, 0 + 1, // index block hit + 0, 0); + } + + // Only data block will be accessed + { + iter->SeekToFirst(); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, 1, 0 + 1, // data block miss + 0); + } + + // Data block will be in cache + { + iter.reset(c.NewIterator()); + iter->SeekToFirst(); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, 1 + 1, /* index block hit */ + 1, 0 + 1 /* data block hit */); + } + // release the iterator so that the block cache can reset correctly. + iter.reset(); + + // -- PART 2: Open without block cache + options.block_cache.reset(); + options.statistics = CreateDBStatistics(); // reset the stats + c.Reopen(options); + + { + iter.reset(c.NewIterator()); + iter->SeekToFirst(); + ASSERT_EQ("key", iter->key().ToString()); + BlockCachePropertiesSnapshot props(options.statistics.get()); + // Nothing is affected at all + props.AssertEqual(0, 0, 0, 0); + } + + // -- PART 3: Open with very small block cache + // In this test, no block will ever get hit since the block cache is + // too small to fit even one entry. + options.block_cache = NewLRUCache(1); + c.Reopen(options); + { + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, // index block miss + 0, 0, 0); + } + + + { + // Both index and data block get accessed. + // It first cache index block then data block. But since the cache size + // is only 1, index block will be purged after data block is inserted. 
+ iter.reset(c.NewIterator()); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1 + 1, // index block miss + 0, 0, // data block miss + 0); + } + + { + // SeekToFirst() accesses data block. With similar reason, we expect data + // block's cache miss. + iter->SeekToFirst(); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(2, 0, 0 + 1, // data block miss + 0); + } +} + +TEST(BlockBasedTableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. + + Options opt; + unique_ptr ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.block_size = 1024; + opt.compression = kNoCompression; + opt.block_cache = + NewLRUCache(16 * 1024 * 1024); // big enough so we don't ever + // lose cached values. + + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + c.Finish(opt, *ikc, &keys, &kvmap); + + unique_ptr iter(c.NewIterator()); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + + ASSERT_OK(c.Reopen(opt)); + auto table_reader = dynamic_cast(c.table_reader()); + for (const std::string& key : keys) { + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + } +} + +extern const uint64_t kPlainTableMagicNumber; +TEST(PlainTableTest, BasicPlainTableProperties) { + PlainTableFactory factory(8, 8, 0); + StringSink sink; + Options options; + InternalKeyComparator ikc(options.comparator); + std::unique_ptr builder( + factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + + for 
(char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + key.append("\1 "); // PlainTable expects internal key structure + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + + StringSource source(sink.contents(), 72242, true); + + TableProperties* props = nullptr; + auto s = ReadTableProperties(&source, sink.contents().size(), + kPlainTableMagicNumber, Env::Default(), nullptr, + &props); + std::unique_ptr props_guard(props); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props->index_size); + ASSERT_EQ(0ul, props->filter_size); + ASSERT_EQ(16ul * 26, props->raw_key_size); + ASSERT_EQ(28ul * 26, props->raw_value_size); + ASSERT_EQ(26ul, props->num_entries); + ASSERT_EQ(1ul, props->num_data_blocks); +} + +TEST(GeneralTableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator()); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + KVMap kvmap; + Options options; + test::PlainInternalKeyComparator internal_comparator(options.comparator); + options.block_size = 1024; + options.compression = kNoCompression; + c.Finish(options, internal_comparator, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); +} + +static void DoCompressionTest(CompressionType comp) { + Random rnd(301); + TableConstructor c(BytewiseComparator()); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector keys; + KVMap kvmap; + Options options; + test::PlainInternalKeyComparator ikc(options.comparator); + options.block_size = 1024; + options.compression = comp; + c.Finish(options, ikc, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); +} + +TEST(GeneralTableTest, ApproximateOffsetOfCompressed) { + std::vector compression_state; + if (!SnappyCompressionSupported()) { + fprintf(stderr, "skipping snappy compression tests\n"); + } else { + compression_state.push_back(kSnappyCompression); + } + + if (!ZlibCompressionSupported()) { + fprintf(stderr, "skipping zlib compression tests\n"); + } else { + compression_state.push_back(kZlibCompression); + } + + // TODO(kailiu) DoCompressionTest() doesn't work with BZip2. 
+ /* + if (!BZip2CompressionSupported()) { + fprintf(stderr, "skipping bzip2 compression tests\n"); + } else { + compression_state.push_back(kBZip2Compression); + } + */ + + if (!LZ4CompressionSupported()) { + fprintf(stderr, "skipping lz4 compression tests\n"); + } else { + compression_state.push_back(kLZ4Compression); + } + + if (!LZ4HCCompressionSupported()) { + fprintf(stderr, "skipping lz4hc compression tests\n"); + } else { + compression_state.push_back(kLZ4HCCompression); + } + + for (auto state : compression_state) { + DoCompressionTest(state); + } +} + +TEST(Harness, Randomized) { + std::vector args = GenerateArgList(); + for (unsigned int i = 0; i < args.size(); i++) { + Init(args[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 1 : 200)) { + if ((num_entries % 10) == 0) { + fprintf(stderr, "case %d of %d: num_entries = %d\n", (i + 1), + static_cast(args.size()), num_entries); + } + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } +} + +TEST(Harness, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = { DB_TEST, false, 16, kNoCompression }; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + int files = 0; + for (int level = 0; level < db()->NumberLevels(); level++) { + std::string value; + char name[100]; + snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level); + ASSERT_TRUE(db()->GetProperty(name, &value)); + files += atoi(value.c_str()); + } + ASSERT_GT(files, 0); +} + +class MemTableTest { }; + +TEST(MemTableTest, Simple) { + InternalKeyComparator cmp(BytewiseComparator()); + auto 
table_factory = std::make_shared(); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); + memtable->Ref(); + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + batch.Put(std::string("k1"), std::string("v1")); + batch.Put(std::string("k2"), std::string("v2")); + batch.Put(std::string("k3"), std::string("v3")); + batch.Put(std::string("largekey"), std::string("vlarge")); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok()); + + Iterator* iter = memtable->NewIterator(); + iter->SeekToFirst(); + while (iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", + iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + + delete iter; + delete memtable->Unref(); +} + +// Test the empty key +TEST(Harness, SimpleEmptyKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSingle) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST(Harness, SimpleMulti) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST(Harness, SimpleSpecialKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc new file mode 100644 index 00000000..65a58ad9 --- /dev/null +++ b/table/two_level_iterator.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/two_level_iterator.h" + +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/format.h" +#include "table/iterator_wrapper.h" + +namespace rocksdb { + +namespace { + +typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, + const EnvOptions& soptions, + const InternalKeyComparator& icomparator, + const Slice&, bool for_compaction); + +class TwoLevelIterator: public Iterator { + public: + TwoLevelIterator(Iterator* index_iter, BlockFunction block_function, + void* arg, const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction); + + virtual ~TwoLevelIterator(); + + virtual void Seek(const Slice& target); + virtual void SeekToFirst(); + virtual void SeekToLast(); + virtual void Next(); + virtual void Prev(); + + virtual bool Valid() const { + return data_iter_.Valid(); + } + virtual Slice key() const { + assert(Valid()); + return data_iter_.key(); + } + virtual Slice value() const { + assert(Valid()); + return data_iter_.value(); + } + virtual Status status() const { + // It'd be nice if status() returned a const Status& instead of a Status + if (!index_iter_.status().ok()) { + return index_iter_.status(); + } else if (data_iter_.iter() != nullptr && !data_iter_.status().ok()) { + return data_iter_.status(); + } else { + return status_; + } + } + + private: + void SaveError(const Status& s) { + if (status_.ok() && !s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + 
void SkipEmptyDataBlocksBackward(); + void SetDataIterator(Iterator* data_iter); + void InitDataBlock(); + + BlockFunction block_function_; + void* arg_; + const ReadOptions options_; + const EnvOptions& soptions_; + const InternalKeyComparator& internal_comparator_; + Status status_; + IteratorWrapper index_iter_; + IteratorWrapper data_iter_; // May be nullptr + // If data_iter_ is non-nullptr, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the data_iter_. + std::string data_block_handle_; + bool for_compaction_; +}; + +TwoLevelIterator::TwoLevelIterator( + Iterator* index_iter, BlockFunction block_function, void* arg, + const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, bool for_compaction) + : block_function_(block_function), + arg_(arg), + options_(options), + soptions_(soptions), + internal_comparator_(internal_comparator), + index_iter_(index_iter), + data_iter_(nullptr), + for_compaction_(for_compaction) {} + +TwoLevelIterator::~TwoLevelIterator() { +} + +void TwoLevelIterator::Seek(const Slice& target) { + index_iter_.Seek(target); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.Seek(target); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToFirst() { + index_iter_.SeekToFirst(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::SeekToLast() { + index_iter_.SeekToLast(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.SeekToLast(); + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIterator::Next() { + assert(Valid()); + data_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIterator::Prev() { + assert(Valid()); + data_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + + +void TwoLevelIterator::SkipEmptyDataBlocksForward() { + while (data_iter_.iter() == nullptr || (!data_iter_.Valid() && + 
!data_iter_.status().IsIncomplete())) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(nullptr); + return; + } + index_iter_.Next(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.SeekToFirst(); + } +} + +void TwoLevelIterator::SkipEmptyDataBlocksBackward() { + while (data_iter_.iter() == nullptr || (!data_iter_.Valid() && + !data_iter_.status().IsIncomplete())) { + // Move to next block + if (!index_iter_.Valid()) { + SetDataIterator(nullptr); + return; + } + index_iter_.Prev(); + InitDataBlock(); + if (data_iter_.iter() != nullptr) data_iter_.SeekToLast(); + } +} + +void TwoLevelIterator::SetDataIterator(Iterator* data_iter) { + if (data_iter_.iter() != nullptr) SaveError(data_iter_.status()); + data_iter_.Set(data_iter); +} + +void TwoLevelIterator::InitDataBlock() { + if (!index_iter_.Valid()) { + SetDataIterator(nullptr); + } else { + Slice handle = index_iter_.value(); + if (data_iter_.iter() != nullptr + && handle.compare(data_block_handle_) == 0) { + // data_iter_ is already constructed with this iterator, so + // no need to change anything + } else { + Iterator* iter = + (*block_function_)(arg_, options_, soptions_, internal_comparator_, + handle, for_compaction_); + data_block_handle_.assign(handle.data(), handle.size()); + SetDataIterator(iter); + } + } +} + +} // namespace + +Iterator* NewTwoLevelIterator(Iterator* index_iter, + BlockFunction block_function, void* arg, + const ReadOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction) { + return new TwoLevelIterator(index_iter, block_function, arg, options, + soptions, internal_comparator, for_compaction); +} + +} // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h new file mode 100644 index 00000000..d313dcb1 --- /dev/null +++ b/table/two_level_iterator.h @@ -0,0 +1,38 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/iterator.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +struct ReadOptions; +class InternalKeyComparator; + +// Return a new two level iterator. A two-level iterator contains an +// index iterator whose values point to a sequence of blocks where +// each block is itself a sequence of key,value pairs. The returned +// two-level iterator yields the concatenation of all key/value pairs +// in the sequence of blocks. Takes ownership of "index_iter" and +// will delete it when no longer needed. +// +// Uses a supplied function to convert an index_iter value into +// an iterator over the contents of the corresponding block. +extern Iterator* NewTwoLevelIterator( + Iterator* index_iter, + Iterator* (*block_function)( + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + const Slice& index_value, bool for_compaction), + void* arg, const ReadOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + bool for_compaction = false); + +} // namespace rocksdb diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh new file mode 100755 index 00000000..2d63c0a8 --- /dev/null +++ b/tools/auto_sanity_test.sh @@ -0,0 +1,71 @@ +TMP_DIR="/tmp/rocksdb-sanity-test" + +if [ "$#" -lt 2 ]; then + echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]" + echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits." 
+ recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'` + commit_new=`echo "$recent_commits" | head -n1` + commit_old=`echo "$recent_commits" | tail -n1` + echo "the most recent commits are:" + echo "$recent_commits" +else + commit_new=$1 + commit_old=$2 +fi + +if [ ! -d $TMP_DIR ]; then + mkdir $TMP_DIR +fi +dir_new="${TMP_DIR}/${commit_new}" +dir_old="${TMP_DIR}/${commit_old}" + +function makestuff() { + echo "make clean" + make clean > /dev/null + echo "make db_sanity_test -j32" + make db_sanity_test -j32 > /dev/null + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to perform 'make db_sanity_test'" + exit 1 + fi +} + +rm -r -f $dir_new +rm -r -f $dir_old + +echo "Running db sanity check with commits $commit_new and $commit_old." + +echo "=============================================================" +echo "Making build $commit_new" +makestuff +mv db_sanity_test new_db_sanity_test +echo "Creating db based on the new commit --- $commit_new" +./new_db_sanity_test $dir_new create + +echo "=============================================================" +echo "Making build $commit_old" +makestuff +mv db_sanity_test old_db_sanity_test +echo "Creating db based on the old commit --- $commit_old" +./old_db_sanity_test $dir_old create + +echo "=============================================================" +echo "Verifying new db $dir_new using the old commit --- $commit_old" +./old_db_sanity_test $dir_new verify +if [ $? -ne 0 ]; then + echo "[ERROR] Verification of $dir_new using commit $commit_old failed." + exit 2 +fi + +echo "=============================================================" +echo "Verifying old db $dir_old using the new commit --- $commit_new" +./new_db_sanity_test $dir_old verify +if [ $? -ne 0 ]; then + echo "[ERROR] Verification of $dir_old using commit $commit_new failed." + exit 2 +fi + +rm old_db_sanity_test +rm new_db_sanity_test + +echo "Auto sanity test passed!" 
diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc new file mode 100644 index 00000000..70ece2c5 --- /dev/null +++ b/tools/blob_store_bench.cc @@ -0,0 +1,269 @@ +#include +#include +#include + +#include "rocksdb/env.h" +#include "util/blob_store.h" +#include "util/testutil.h" + +#define KB 1024LL +#define MB 1024*1024LL +// BlobStore does costly asserts to make sure it's running correctly, which +// significantly impacts benchmark runtime. +// NDEBUG will compile out those asserts. +#ifndef NDEBUG +#define NDEBUG +#endif + +using namespace rocksdb; +using namespace std; + +// used by all threads +uint64_t timeout_sec; +Env *env; +BlobStore* bs; + +static std::string RandomString(Random* rnd, uint64_t len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +struct Result { + uint32_t writes; + uint32_t reads; + uint32_t deletes; + uint64_t data_written; + uint64_t data_read; + + void print() { + printf("Total writes = %u\n", writes); + printf("Total reads = %u\n", reads); + printf("Total deletes = %u\n", deletes); + printf("Write throughput = %lf MB/s\n", + (double)data_written / (1024*1024.0) / timeout_sec); + printf("Read throughput = %lf MB/s\n", + (double)data_read / (1024*1024.0) / timeout_sec); + printf("Total throughput = %lf MB/s\n", + (double)(data_read + data_written) / (1024*1024.0) / timeout_sec); + } + + Result() { + writes = reads = deletes = data_read = data_written = 0; + } + + Result (uint32_t writes, uint32_t reads, uint32_t deletes, + uint64_t data_written, uint64_t data_read) : + writes(writes), reads(reads), deletes(deletes), + data_written(data_written), data_read(data_read) {} + +}; + +Result operator + (const Result &a, const Result &b) { + return Result(a.writes + b.writes, a.reads + b.reads, + a.deletes + b.deletes, a.data_written + b.data_written, + a.data_read + b.data_read); +} + +struct WorkerThread { + uint64_t data_size_from, data_size_to; + double read_ratio; + uint64_t working_set_size; // start 
deleting once you reach this + Result result; + atomic stopped; + + WorkerThread(uint64_t data_size_from, uint64_t data_size_to, + double read_ratio, uint64_t working_set_size) : + data_size_from(data_size_from), data_size_to(data_size_to), + read_ratio(read_ratio), working_set_size(working_set_size), + stopped(false) {} + + WorkerThread(const WorkerThread& wt) : + data_size_from(wt.data_size_from), data_size_to(wt.data_size_to), + read_ratio(wt.read_ratio), working_set_size(wt.working_set_size), + stopped(false) {} +}; + +static void WorkerThreadBody(void* arg) { + WorkerThread* t = reinterpret_cast(arg); + Random rnd(5); + string buf; + vector> blobs; + vector random_strings; + + for (int i = 0; i < 10; ++i) { + random_strings.push_back(RandomString(&rnd, t->data_size_to)); + } + + uint64_t total_size = 0; + + uint64_t start_micros = env->NowMicros(); + while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) { + if (blobs.size() && rand() < RAND_MAX * t->read_ratio) { + // read + int bi = rand() % blobs.size(); + Status s = bs->Get(blobs[bi].first, &buf); + assert(s.ok()); + t->result.data_read += buf.size(); + t->result.reads++; + } else { + // write + uint64_t size = rand() % (t->data_size_to - t->data_size_from) + + t->data_size_from; + total_size += size; + string put_str = random_strings[rand() % random_strings.size()]; + blobs.push_back(make_pair(Blob(), size)); + Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first); + assert(s.ok()); + t->result.data_written += size; + t->result.writes++; + } + + while (total_size >= t->working_set_size) { + // delete random + int bi = rand() % blobs.size(); + total_size -= blobs[bi].second; + bs->Delete(blobs[bi].first); + blobs.erase(blobs.begin() + bi); + t->result.deletes++; + } + } + t->stopped.store(true); +} + +Result StartBenchmark(vector& config) { + for (auto w : config) { + env->StartThread(WorkerThreadBody, w); + } + + Result result; + + for (auto w : config) { + while 
(!w->stopped.load()); + result = result + w->result; + } + + for (auto w : config) { + delete w; + } + + delete bs; + + return result; +} + +vector SetupBenchmarkBalanced() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.5; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +vector SetupBenchmarkWriteHeavy() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.1; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +vector SetupBenchmarkReadHeavy() { + string test_path; + env->GetTestDirectory(&test_path); + test_path.append("/blob_store"); + + // config start + uint32_t block_size = 16*KB; + uint32_t file_size = 1*MB; + double read_write_ratio = 0.9; + uint64_t data_read_from = 16*KB; + uint64_t data_read_to = 32*KB; + int number_of_threads = 10; + uint64_t working_set_size = 5*MB; + timeout_sec = 5; + // config end + + bs = new BlobStore(test_path, block_size, 
file_size / block_size, 10000, env); + + vector config; + + for (int i = 0; i < number_of_threads; ++i) { + config.push_back(new WorkerThread(data_read_from, + data_read_to, + read_write_ratio, + working_set_size)); + }; + + return config; +} + +int main(int argc, const char** argv) { + srand(33); + env = Env::Default(); + + { + printf("--- Balanced read/write benchmark ---\n"); + vector config = SetupBenchmarkBalanced(); + Result r = StartBenchmark(config); + r.print(); + } + { + printf("--- Write heavy benchmark ---\n"); + vector config = SetupBenchmarkWriteHeavy(); + Result r = StartBenchmark(config); + r.print(); + } + { + printf("--- Read heavy benchmark ---\n"); + vector config = SetupBenchmarkReadHeavy(); + Result r = StartBenchmark(config); + r.print(); + } + + return 0; +} diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py new file mode 100644 index 00000000..d81fd088 --- /dev/null +++ b/tools/db_crashtest.py @@ -0,0 +1,149 @@ +#! /usr/bin/env python +import os +import re +import sys +import time +import random +import getopt +import logging +import tempfile +import subprocess +import shutil + +# This script runs and kills db_stress multiple times. It checks consistency +# in case of unsafe crashes in RocksDB. 
+ +def main(argv): + try: + opts, args = getopt.getopt(argv, "hd:t:i:o:b:") + except getopt.GetoptError: + print("db_crashtest.py -d -t <#threads> " + "-i -o " + "-b \n") + sys.exit(2) + + # default values, will be overridden by cmdline args + interval = 120 # time for one db_stress instance to run + duration = 6000 # total time for this script to test db_stress + threads = 32 + # since we will be killing anyway, use large value for ops_per_thread + ops_per_thread = 100000000 + write_buf_size = 4 * 1024 * 1024 + + for opt, arg in opts: + if opt == '-h': + print("db_crashtest.py -d " + " -t <#threads> -i " + " -o -b \n") + sys.exit() + elif opt == "-d": + duration = int(arg) + elif opt == "-t": + threads = int(arg) + elif opt == "-i": + interval = int(arg) + elif opt == "-o": + ops_per_thread = int(arg) + elif opt == "-b": + write_buf_size = int(arg) + else: + print("db_crashtest.py -d " + " -t <#threads> -i " + " -o -b \n") + sys.exit(2) + + exit_time = time.time() + duration + + print("Running blackbox-crash-test with \ninterval_between_crash=" + + str(interval) + "\ntotal-duration=" + str(duration) + + "\nthreads=" + str(threads) + "\nops_per_thread=" + + str(ops_per_thread) + "\nwrite_buffer_size=" + + str(write_buf_size) + "\n") + + dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_') + + while time.time() < exit_time: + run_had_errors = False + killtime = time.time() + interval + + cmd = re.sub('\s+', ' ', """ + ./db_stress + --test_batches_snapshots=1 + --ops_per_thread=%s + --threads=%s + --write_buffer_size=%s + --destroy_db_initially=0 + --reopen=20 + --readpercent=45 + --prefixpercent=5 + --writepercent=35 + --delpercent=5 + --iterpercent=10 + --db=%s + --max_key=100000000 + --disable_seek_compaction=%s + --mmap_read=%s + --block_size=16384 + --cache_size=1048576 + --open_files=500000 + --verify_checksum=1 + --sync=0 + --disable_wal=0 + --disable_data_sync=1 + --target_file_size_base=2097152 + --target_file_size_multiplier=2 + 
--max_write_buffer_number=3 + --max_background_compactions=20 + --max_bytes_for_level_base=10485760 + --filter_deletes=%s + --memtablerep=prefix_hash + --prefix_size=7 + """ % (ops_per_thread, + threads, + write_buf_size, + dbname, + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1))) + + child = subprocess.Popen([cmd], + stderr=subprocess.PIPE, shell=True) + print("Running db_stress with pid=%d: %s\n\n" + % (child.pid, cmd)) + + stop_early = False + while time.time() < killtime: + if child.poll() is not None: + print("WARNING: db_stress ended before kill: exitcode=%d\n" + % child.returncode) + stop_early = True + break + time.sleep(1) + + if not stop_early: + if child.poll() is not None: + print("WARNING: db_stress ended before kill: exitcode=%d\n" + % child.returncode) + else: + child.kill() + print("KILLED %d\n" % child.pid) + time.sleep(1) # time to stabilize after a kill + + while True: + line = child.stderr.readline().strip() + if line != '': + run_had_errors = True + print('***' + line + '^') + else: + break + + if run_had_errors: + sys.exit(2) + + time.sleep(1) # time to stabilize before the next run + + # we need to clean up after ourselves -- only do this on test success + shutil.rmtree(dbname, True) + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py new file mode 100644 index 00000000..274fcde4 --- /dev/null +++ b/tools/db_crashtest2.py @@ -0,0 +1,167 @@ +#! /usr/bin/env python +import os +import re +import sys +import time +import random +import getopt +import logging +import tempfile +import subprocess +import shutil + +# This python script runs db_stress multiple times. Some runs with +# kill_random_test that causes rocksdb to crash at various points in code. 
+ +def main(argv): + try: + opts, args = getopt.getopt(argv, "hd:t:k:o:b:") + except getopt.GetoptError: + print str(getopt.GetoptError) + print "db_crashtest2.py -d -t <#threads> " \ + "-k -o "\ + "-b \n" + sys.exit(2) + + # default values, will be overridden by cmdline args + kill_random_test = 97 # kill with probability 1/97 by default + duration = 10000 # total time for this script to test db_stress + threads = 32 + ops_per_thread = 200000 + write_buf_size = 4 * 1024 * 1024 + + for opt, arg in opts: + if opt == '-h': + print "db_crashtest2.py -d -t <#threads> " \ + "-k -o " \ + "-b \n" + sys.exit() + elif opt == "-d": + duration = int(arg) + elif opt == "-t": + threads = int(arg) + elif opt == "-k": + kill_random_test = int(arg) + elif opt == "-o": + ops_per_thread = int(arg) + elif opt == "-b": + write_buf_size = int(arg) + else: + print "unrecognized option " + str(opt) + "\n" + print "db_crashtest2.py -d -t <#threads> " \ + "-k -o " \ + "-b \n" + sys.exit(2) + + exit_time = time.time() + duration + + print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \ + + "\nthreads=" + str(threads) + "\nops_per_thread=" \ + + str(ops_per_thread) + "\nwrite_buffer_size=" \ + + str(write_buf_size) + "\n" + + total_check_mode = 3 + check_mode = 0 + + while time.time() < exit_time: + killoption = "" + if check_mode == 0: + # run with kill_random_test + killoption = " --kill_random_test=" + str(kill_random_test) + # use large ops per thread since we will kill it anyway + additional_opts = "--ops_per_thread=" + \ + str(100 * ops_per_thread) + killoption + elif check_mode == 1: + # normal run with universal compaction mode + additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \ + " --compaction_style=1" + else: + # nomral run + additional_opts = "--ops_per_thread=" + str(ops_per_thread) + + dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_') + cmd = re.sub('\s+', ' ', """ + ./db_stress + --test_batches_snapshots=%s + --threads=%s + 
--write_buffer_size=%s + --destroy_db_initially=0 + --reopen=20 + --readpercent=45 + --prefixpercent=5 + --writepercent=35 + --delpercent=5 + --iterpercent=10 + --db=%s + --max_key=100000000 + --disable_seek_compaction=%s + --mmap_read=%s + --block_size=16384 + --cache_size=1048576 + --open_files=500000 + --verify_checksum=1 + --sync=0 + --disable_wal=0 + --disable_data_sync=1 + --target_file_size_base=2097152 + --target_file_size_multiplier=2 + --max_write_buffer_number=3 + --max_background_compactions=20 + --max_bytes_for_level_base=10485760 + --filter_deletes=%s + --memtablerep=prefix_hash + --prefix_size=7 + %s + """ % (random.randint(0, 1), + threads, + write_buf_size, + dbname, + random.randint(0, 1), + random.randint(0, 1), + random.randint(0, 1), + additional_opts)) + + print "Running:" + cmd + "\n" + + popen = subprocess.Popen([cmd], stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True) + stdoutdata, stderrdata = popen.communicate() + retncode = popen.returncode + msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format( + check_mode, killoption, retncode)) + print msg + print stdoutdata + + expected = False + if (killoption == '') and (retncode == 0): + # we expect zero retncode if no kill option + expected = True + elif killoption != '' and retncode < 0: + # we expect negative retncode if kill option was given + expected = True + + if not expected: + print "TEST FAILED. See kill option and exit code above!!!\n" + sys.exit(1) + + stdoutdata = stdoutdata.lower() + errorcount = (stdoutdata.count('error') - + stdoutdata.count('got errors 0 times')) + print "#times error occurred in output is " + str(errorcount) + "\n" + + if (errorcount > 0): + print "TEST FAILED. Output has 'error'!!!\n" + sys.exit(2) + if (stdoutdata.find('fail') >= 0): + print "TEST FAILED. 
Output has 'fail'!!!\n" + sys.exit(2) + # we need to clean up after ourselves -- only do this on test success + shutil.rmtree(dbname, True) + + check_mode = (check_mode + 1) % total_check_mode + + time.sleep(1) # time to stabilize after a kill + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc new file mode 100644 index 00000000..27cb6d5a --- /dev/null +++ b/tools/db_repl_stress.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include + +#include + +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/types.h" +#include "port/atomic_pointer.h" +#include "util/testutil.h" + + +// Run a thread to perform Put's. +// Another thread uses GetUpdatesSince API to keep getting the updates. +// options : +// --num_inserts = the num of inserts the first thread should perform. +// --wal_ttl = the wal ttl for the run. + +using namespace rocksdb; + +struct DataPumpThread { + size_t no_records; + DB* db; // Assumption DB is Open'ed already. 
+}; + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +static void DataPumpThreadBody(void* arg) { + DataPumpThread* t = reinterpret_cast(arg); + DB* db = t->db; + Random rnd(301); + size_t i = 0; + while(i++ < t->no_records) { + if(!db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)), + Slice(RandomString(&rnd, 500))).ok()) { + fprintf(stderr, "Error in put\n"); + exit(1); + } + } +} + +struct ReplicationThread { + port::AtomicPointer stop; + DB* db; + volatile size_t no_read; +}; + +static void ReplicationThreadBody(void* arg) { + ReplicationThread* t = reinterpret_cast(arg); + DB* db = t->db; + unique_ptr iter; + SequenceNumber currentSeqNum = 1; + while (t->stop.Acquire_Load() != nullptr) { + iter.reset(); + Status s; + while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) { + if (t->stop.Acquire_Load() == nullptr) { + return; + } + } + fprintf(stderr, "Refreshing iterator\n"); + for(;iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) { + BatchResult res = iter->GetBatch(); + if (res.sequence != currentSeqNum) { + fprintf(stderr, + "Missed a seq no. 
b/w %ld and %ld\n", + (long)currentSeqNum, + (long)res.sequence); + exit(1); + } + } + } +} + +DEFINE_uint64(num_inserts, 1000, "the num of inserts the first thread should" + " perform."); +DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)"); +DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run" + "(in MB)"); + +int main(int argc, const char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " --num_inserts= --wal_ttl_seconds=" + + " --wal_size_limit_MB="); + google::ParseCommandLineFlags(&argc, const_cast(&argv), true); + + Env* env = Env::Default(); + std::string default_db_path; + env->GetTestDirectory(&default_db_path); + default_db_path += "db_repl_stress"; + Options options; + options.create_if_missing = true; + options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; + options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + DB* db; + DestroyDB(default_db_path, options); + + Status s = DB::Open(options, default_db_path, &db); + + if (!s.ok()) { + fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str()); + exit(1); + } + + DataPumpThread dataPump; + dataPump.no_records = FLAGS_num_inserts; + dataPump.db = db; + env->StartThread(DataPumpThreadBody, &dataPump); + + ReplicationThread replThread; + replThread.db = db; + replThread.no_read = 0; + replThread.stop.Release_Store(env); // store something to make it non-null. + + env->StartThread(ReplicationThreadBody, &replThread); + while(replThread.no_read < FLAGS_num_inserts); + replThread.stop.Release_Store(nullptr); + if (replThread.no_read < dataPump.no_records) { + // no. read should be => than inserted. + fprintf(stderr, "No. 
of Record's written and read not same\nRead : %zu" + " Written : %zu\n", replThread.no_read, dataPump.no_records); + exit(1); + } + fprintf(stderr, "Successful!\n"); + exit(0); +} diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc new file mode 100644 index 00000000..9642e9e5 --- /dev/null +++ b/tools/db_sanity_test.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include "include/rocksdb/db.h" +#include "include/rocksdb/options.h" +#include "include/rocksdb/env.h" +#include "include/rocksdb/slice.h" +#include "include/rocksdb/status.h" +#include "include/rocksdb/comparator.h" +#include "include/rocksdb/table.h" +#include "include/rocksdb/slice_transform.h" + +namespace rocksdb { + +class SanityTest { + public: + explicit SanityTest(const std::string& path) + : env_(Env::Default()), path_(path) { + env_->CreateDirIfMissing(path); + } + virtual ~SanityTest() {} + + virtual std::string Name() const = 0; + virtual Options GetOptions() const = 0; + + Status Create() { + Options options = GetOptions(); + options.create_if_missing = true; + std::string dbname = path_ + Name(); + DestroyDB(dbname, options); + DB* db; + Status s = DB::Open(options, dbname, &db); + std::unique_ptr db_guard(db); + if (!s.ok()) { + return s; + } + for (int i = 0; i < 1000000; ++i) { + std::string k = "key" + std::to_string(i); + std::string v = "value" + std::to_string(i); + s = db->Put(WriteOptions(), Slice(k), Slice(v)); + if (!s.ok()) { + return s; + } + } + return Status::OK(); + } + Status Verify() { + DB* db; + std::string dbname = path_ + Name(); + Status s = DB::Open(GetOptions(), dbname, &db); + std::unique_ptr db_guard(db); + if (!s.ok()) { + return s; + } + for (int i = 0; i < 
1000000; ++i) { + std::string k = "key" + std::to_string(i); + std::string v = "value" + std::to_string(i); + std::string result; + s = db->Get(ReadOptions(), Slice(k), &result); + if (!s.ok()) { + return s; + } + if (result != v) { + return Status::Corruption("Unexpected value for key " + k); + } + } + return Status::OK(); + } + + private: + Env* env_; + std::string const path_; +}; + +class SanityTestBasic : public SanityTest { + public: + explicit SanityTestBasic(const std::string& path) : SanityTest(path) {} + virtual Options GetOptions() const { + Options options; + options.create_if_missing = true; + return options; + } + virtual std::string Name() const { return "Basic"; } +}; + +class SanityTestSpecialComparator : public SanityTest { + public: + explicit SanityTestSpecialComparator(const std::string& path) + : SanityTest(path) { + options_.comparator = new NewComparator(); + } + ~SanityTestSpecialComparator() { delete options_.comparator; } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "SpecialComparator"; } + + private: + class NewComparator : public Comparator { + public: + virtual const char* Name() const { return "rocksdb.NewComparator"; } + virtual int Compare(const Slice& a, const Slice& b) const { + return BytewiseComparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* s, const Slice& l) const { + BytewiseComparator()->FindShortestSeparator(s, l); + } + virtual void FindShortSuccessor(std::string* key) const { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + Options options_; +}; + +class SanityTestZlibCompression : public SanityTest { + public: + explicit SanityTestZlibCompression(const std::string& path) + : SanityTest(path) { + options_.compression = kZlibCompression; + } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "ZlibCompression"; } + + private: + Options options_; +}; + +class 
SanityTestPlainTableFactory : public SanityTest { + public: + explicit SanityTestPlainTableFactory(const std::string& path) + : SanityTest(path) { + options_.table_factory.reset(NewPlainTableFactory()); + options_.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options_.allow_mmap_reads = true; + } + ~SanityTestPlainTableFactory() {} + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "PlainTable"; } + + private: + Options options_; +}; + +bool RunSanityTests(const std::string& command, const std::string& path) { + std::vector sanity_tests = { + new SanityTestBasic(path), + new SanityTestSpecialComparator(path), + new SanityTestZlibCompression(path), + new SanityTestPlainTableFactory(path)}; + + if (command == "create") { + fprintf(stderr, "Creating...\n"); + } else { + fprintf(stderr, "Verifying...\n"); + } + for (auto sanity_test : sanity_tests) { + Status s; + fprintf(stderr, "%s -- ", sanity_test->Name().c_str()); + if (command == "create") { + s = sanity_test->Create(); + } else { + assert(command == "verify"); + s = sanity_test->Verify(); + } + fprintf(stderr, "%s\n", s.ToString().c_str()); + if (!s.ok()) { + fprintf(stderr, "FAIL\n"); + return false; + } + + delete sanity_test; + } + return true; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + std::string path, command; + bool ok = (argc == 3); + if (ok) { + path = std::string(argv[1]); + command = std::string(argv[2]); + ok = (command == "create" || command == "verify"); + } + if (!ok) { + fprintf(stderr, "Usage: %s [create|verify] \n", argv[0]); + exit(1); + } + if (path.back() != '/') { + path += "/"; + } + + bool sanity_ok = rocksdb::RunSanityTests(command, path); + + return sanity_ok ? 0 : 1; +} diff --git a/tools/db_stress.cc b/tools/db_stress.cc new file mode 100644 index 00000000..32404c65 --- /dev/null +++ b/tools/db_stress.cc @@ -0,0 +1,1565 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// The test uses an array to compare against values written to the database. +// Keys written to the array are in 1:1 correspondence to the actual values in +// the database according to the formula in the function GenerateValue. + +// Space is reserved in the array from 0 to FLAGS_max_key and values are +// randomly written/deleted/read from those positions. During verification we +// compare all the positions in the array. To shorten/elongate the running +// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread, +// (sometimes also FLAGS_threads). +// +// NOTE that if FLAGS_test_batches_snapshots is set, the test will have +// different behavior. See comment of the flag for details. 
+ +#include +#include +#include +#include +#include "db/db_impl.h" +#include "db/version_set.h" +#include "rocksdb/statistics.h" +#include "rocksdb/cache.h" +#include "utilities/utility_db.h" +#include "rocksdb/env.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/histogram.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/testutil.h" +#include "util/logging.h" +#include "utilities/ttl/db_ttl.h" +#include "hdfs/env_hdfs.h" +#include "utilities/merge_operators.h" + +static const long KB = 1024; + + +static bool ValidateUint32Range(const char* flagname, uint64_t value) { + if (value > std::numeric_limits::max()) { + fprintf(stderr, + "Invalid value for --%s: %lu, overflow\n", + flagname, + (unsigned long)value); + return false; + } + return true; +} +DEFINE_uint64(seed, 2341234, "Seed for PRNG"); +static const bool FLAGS_seed_dummy = + google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); + +DEFINE_int64(max_key, 1 * KB * KB * KB, + "Max number of key/values to place in database"); + +DEFINE_bool(test_batches_snapshots, false, + "If set, the test uses MultiGet(), MultiPut() and MultiDelete()" + " which read/write/delete multiple keys in a batch. In this mode," + " we do not verify db content by comparing the content with the " + "pre-allocated array. Instead, we do partial verification inside" + " MultiGet() by checking various values in a batch. Benefit of" + " this mode:\n" + "\t(a) No need to acquire mutexes during writes (less cache " + "flushes in multi-core leading to speed up)\n" + "\t(b) No long validation at the end (more speed up)\n" + "\t(c) Test snapshot and atomicity of batch writes"); + +DEFINE_int32(threads, 32, "Number of concurrent threads to run."); + +DEFINE_int32(ttl, -1, + "Opens the db with this ttl value if this is not -1. 
" + "Carefully specify a large value such that verifications on " + "deleted values don't fail"); + +DEFINE_int32(value_size_mult, 8, + "Size of value will be this number times rand_int(1,3) bytes"); + +DEFINE_bool(verify_before_write, false, "Verify before write"); + +DEFINE_bool(histogram, false, "Print histogram of operation timings"); + +DEFINE_bool(destroy_db_initially, true, + "Destroys the database dir before start if this is true"); + +DEFINE_bool (verbose, false, "Verbose"); + +DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, + "Number of bytes to buffer in memtable before compacting"); + +DEFINE_int32(max_write_buffer_number, + rocksdb::Options().max_write_buffer_number, + "The number of in-memory memtables. " + "Each memtable is of size FLAGS_write_buffer_size."); + +DEFINE_int32(min_write_buffer_number_to_merge, + rocksdb::Options().min_write_buffer_number_to_merge, + "The minimum number of write buffers that will be merged together " + "before writing to storage. This is cheap because it is an " + "in-memory merge. If this feature is not enabled, then all these " + "write buffers are flushed to L0 as separate files and this " + "increases read amplification because a get request has to check " + "in all of these files. Also, an in-memory merge may result in " + "writing less data to storage if there are duplicate records in" + " each of these individual write buffers."); + +DEFINE_int32(open_files, rocksdb::Options().max_open_files, + "Maximum number of files to keep open at the same time " + "(use default if == 0)"); + +DEFINE_int64(compressed_cache_size, -1, + "Number of bytes to use as a cache of compressed data." 
+ " Negative means use default settings."); + +DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, ""); + +DEFINE_int32(level0_file_num_compaction_trigger, + rocksdb::Options().level0_file_num_compaction_trigger, + "Level0 compaction start trigger"); + +DEFINE_int32(level0_slowdown_writes_trigger, + rocksdb::Options().level0_slowdown_writes_trigger, + "Number of files in level-0 that will slow down writes"); + +DEFINE_int32(level0_stop_writes_trigger, + rocksdb::Options().level0_stop_writes_trigger, + "Number of files in level-0 that will trigger put stop."); + +DEFINE_int32(block_size, rocksdb::Options().block_size, + "Number of bytes in a block."); + +DEFINE_int32(max_background_compactions, + rocksdb::Options().max_background_compactions, + "The maximum number of concurrent background compactions " + "that can occur in parallel."); + +DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger" + " compaction in universal style"); + +DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to " + "compact in universal style compaction"); + +DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact" + " in universal style compaction"); + +DEFINE_int32(universal_max_size_amplification_percent, 0, + "The max size amplification for universal style compaction"); + +DEFINE_int64(cache_size, 2 * KB * KB * KB, + "Number of bytes to use as a cache of uncompressed data."); + +static bool ValidateInt32Positive(const char* flagname, int32_t value) { + if (value < 0) { + fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(reopen, 10, "Number of times database reopens"); +static const bool FLAGS_reopen_dummy = + google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive); + +DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. 
" + "Negative means use default settings."); + +DEFINE_string(db, "", "Use the db with the following name."); + +DEFINE_bool(verify_checksum, false, + "Verify checksum for every block read from storage"); + +DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads, + "Allow reads to occur via mmap-ing files"); + +// Database statistics +static std::shared_ptr dbstats; +DEFINE_bool(statistics, false, "Create database statistics"); + +DEFINE_bool(sync, false, "Sync all writes to disk"); + +DEFINE_bool(disable_data_sync, false, + "If true, do not wait until data is synced to disk."); + +DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync"); + +DEFINE_int32(kill_random_test, 0, + "If non-zero, kill at various points in source code with " + "probability 1/this"); +static const bool FLAGS_kill_random_test_dummy = + google::RegisterFlagValidator(&FLAGS_kill_random_test, + &ValidateInt32Positive); +extern int rocksdb_kill_odds; + +DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); + +DEFINE_int32(target_file_size_base, 64 * KB, + "Target level-1 file size for compaction"); + +DEFINE_int32(target_file_size_multiplier, 1, + "A multiplier to compute targe level-N file size (N >= 2)"); + +DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1"); + +DEFINE_int32(max_bytes_for_level_multiplier, 2, + "A multiplier to compute max bytes for level-N (N >= 2)"); + +static bool ValidateInt32Percent(const char* flagname, int32_t value) { + if (value < 0 || value>100) { + fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(readpercent, 10, + "Ratio of reads to total workload (expressed as a percentage)"); +static const bool FLAGS_readpercent_dummy = + google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent); + +DEFINE_int32(prefixpercent, 20, + "Ratio of prefix iterators to total workload (expressed as a" + " percentage)"); 
+static const bool FLAGS_prefixpercent_dummy = + google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent); + +DEFINE_int32(writepercent, 45, + " Ratio of deletes to total workload (expressed as a percentage)"); +static const bool FLAGS_writepercent_dummy = + google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent); + +DEFINE_int32(delpercent, 15, + "Ratio of deletes to total workload (expressed as a percentage)"); +static const bool FLAGS_delpercent_dummy = + google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent); + +DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload" + " (expressed as a percentage)"); +static const bool FLAGS_iterpercent_dummy = + google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent); + +DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); +static const bool FLAGS_num_iterations_dummy = + google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); + +DEFINE_bool(disable_seek_compaction, false, + "Option to disable compation triggered by read."); + +DEFINE_uint64(delete_obsolete_files_period_micros, 0, + "Option to delete obsolete files periodically" + "0 means that obsolete files are " + " deleted after every compaction run."); + +enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "none")) + return rocksdb::kNoCompression; + else if (!strcasecmp(ctype, "snappy")) + return rocksdb::kSnappyCompression; + else if (!strcasecmp(ctype, "zlib")) + return rocksdb::kZlibCompression; + else if (!strcasecmp(ctype, "bzip2")) + return rocksdb::kBZip2Compression; + else if (!strcasecmp(ctype, "lz4")) + return rocksdb::kLZ4Compression; + else if (!strcasecmp(ctype, "lz4hc")) + return rocksdb::kLZ4HCCompression; + + fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); + return rocksdb::kSnappyCompression; //default value +} +DEFINE_string(compression_type, 
"snappy", + "Algorithm to use to compress the database"); +static enum rocksdb::CompressionType FLAGS_compression_type_e = + rocksdb::kSnappyCompression; + +DEFINE_string(hdfs, "", "Name of hdfs environment"); +// posix or hdfs environment +static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); + +DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread."); +static const bool FLAGS_ops_per_thread_dummy = + google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range); + +DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock"); +static const bool FLAGS_log2_keys_per_lock_dummy = + google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock, + &ValidateUint32Range); + +DEFINE_int32(purge_redundant_percent, 50, + "Percentage of times we want to purge redundant keys in memory " + "before flushing"); +static const bool FLAGS_purge_redundant_percent_dummy = + google::RegisterFlagValidator(&FLAGS_purge_redundant_percent, + &ValidateInt32Percent); + +DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop" + " the delete if key not present"); + +enum RepFactory { + kSkipList, + kHashSkipList, + kVectorRep +}; +enum RepFactory StringToRepFactory(const char* ctype) { + assert(ctype); + + if (!strcasecmp(ctype, "skip_list")) + return kSkipList; + else if (!strcasecmp(ctype, "prefix_hash")) + return kHashSkipList; + else if (!strcasecmp(ctype, "vector")) + return kVectorRep; + + fprintf(stdout, "Cannot parse memreptable %s\n", ctype); + return kSkipList; +} +static enum RepFactory FLAGS_rep_factory; +DEFINE_string(memtablerep, "prefix_hash", ""); + +static bool ValidatePrefixSize(const char* flagname, int32_t value) { + if (value < 0 || value > 8) { + fprintf(stderr, "Invalid value for --%s: %d. 
0 <= PrefixSize <= 8\n", + flagname, value); + return false; + } + return true; +} +DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep"); +static const bool FLAGS_prefix_size_dummy = + google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); + +DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge " + "that behaves like a Put"); + + +namespace rocksdb { + +// convert long to a big-endian slice key +static std::string Key(long val) { + std::string little_endian_key; + std::string big_endian_key; + PutFixed64(&little_endian_key, val); + assert(little_endian_key.size() == sizeof(val)); + big_endian_key.resize(sizeof(val)); + for (int i=0; i<(int)sizeof(val); i++) { + big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i]; + } + return big_endian_key; +} + +class StressTest; +namespace { + +class Stats { + private: + double start_; + double finish_; + double seconds_; + long done_; + long gets_; + long prefixes_; + long writes_; + long deletes_; + long iterator_size_sums_; + long founds_; + long iterations_; + long errors_; + int next_report_; + size_t bytes_; + double last_op_finish_; + HistogramImpl hist_; + + public: + Stats() { } + + void Start() { + next_report_ = 100; + hist_.Clear(); + done_ = 0; + gets_ = 0; + prefixes_ = 0; + writes_ = 0; + deletes_ = 0; + iterator_size_sums_ = 0; + founds_ = 0; + iterations_ = 0; + errors_ = 0; + bytes_ = 0; + seconds_ = 0; + start_ = FLAGS_env->NowMicros(); + last_op_finish_ = start_; + finish_ = start_; + } + + void Merge(const Stats& other) { + hist_.Merge(other.hist_); + done_ += other.done_; + gets_ += other.gets_; + prefixes_ += other.prefixes_; + writes_ += other.writes_; + deletes_ += other.deletes_; + iterator_size_sums_ += other.iterator_size_sums_; + founds_ += other.founds_; + iterations_ += other.iterations_; + errors_ += other.errors_; + bytes_ += other.bytes_; + seconds_ += other.seconds_; + if (other.start_ < start_) start_ = other.start_; + if 
(other.finish_ > finish_) finish_ = other.finish_; + } + + void Stop() { + finish_ = FLAGS_env->NowMicros(); + seconds_ = (finish_ - start_) * 1e-6; + } + + void FinishedSingleOp() { + if (FLAGS_histogram) { + double now = FLAGS_env->NowMicros(); + double micros = now - last_op_finish_; + hist_.Add(micros); + if (micros > 20000) { + fprintf(stdout, "long op: %.1f micros%30s\r", micros, ""); + } + last_op_finish_ = now; + } + + done_++; + if (done_ >= next_report_) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stdout, "... finished %ld ops%30s\r", done_, ""); + } + } + + void AddBytesForWrites(int nwrites, size_t nbytes) { + writes_ += nwrites; + bytes_ += nbytes; + } + + void AddGets(int ngets, int nfounds) { + founds_ += nfounds; + gets_ += ngets; + } + + void AddPrefixes(int nprefixes, int count) { + prefixes_ += nprefixes; + iterator_size_sums_ += count; + } + + void AddIterations(int n) { + iterations_ += n; + } + + void AddDeletes(int n) { + deletes_ += n; + } + + void AddErrors(int n) { + errors_ += n; + } + + void Report(const char* name) { + std::string extra; + if (bytes_ < 1 || done_ < 1) { + fprintf(stderr, "No writes or ops?\n"); + return; + } + + double elapsed = (finish_ - start_) * 1e-6; + double bytes_mb = bytes_ / 1048576.0; + double rate = bytes_mb / elapsed; + double throughput = (double)done_/elapsed; + + fprintf(stdout, "%-12s: ", name); + fprintf(stdout, "%.3f micros/op %ld ops/sec\n", + seconds_ * 1e6 / done_, (long)throughput); + fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n", + "", bytes_mb, rate, (100*writes_)/done_, done_); + fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_); + 
fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_); + fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "", + gets_, founds_); + fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_); + fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "", + iterator_size_sums_); + fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_); + fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_); + + if (FLAGS_histogram) { + fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); + } + fflush(stdout); + } +}; + +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + static const uint32_t SENTINEL = 0xffffffff; + + explicit SharedState(StressTest* stress_test) : + cv_(&mu_), + seed_(FLAGS_seed), + max_key_(FLAGS_max_key), + log2_keys_per_lock_(FLAGS_log2_keys_per_lock), + num_threads_(FLAGS_threads), + num_initialized_(0), + num_populated_(0), + vote_reopen_(0), + num_done_(0), + start_(false), + start_verify_(false), + stress_test_(stress_test) { + if (FLAGS_test_batches_snapshots) { + key_locks_ = nullptr; + values_ = nullptr; + fprintf(stdout, "No lock creation because test_batches_snapshots set\n"); + return; + } + values_ = new uint32_t[max_key_]; + for (long i = 0; i < max_key_; i++) { + values_[i] = SENTINEL; + } + + long num_locks = (max_key_ >> log2_keys_per_lock_); + if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) { + num_locks ++; + } + fprintf(stdout, "Creating %ld locks\n", num_locks); + key_locks_ = new port::Mutex[num_locks]; + } + + ~SharedState() { + delete[] values_; + delete[] key_locks_; + } + + port::Mutex* GetMutex() { + return &mu_; + } + + port::CondVar* GetCondVar() { + return &cv_; + } + + StressTest* GetStressTest() const { + return stress_test_; + } + + long GetMaxKey() const { + return max_key_; + } + + uint32_t GetNumThreads() const { + return num_threads_; + } + + void IncInitialized() { + num_initialized_++; + } + + void IncOperated() { + 
num_populated_++; + } + + void IncDone() { + num_done_++; + } + + void IncVotedReopen() { + vote_reopen_ = (vote_reopen_ + 1) % num_threads_; + } + + bool AllInitialized() const { + return num_initialized_ >= num_threads_; + } + + bool AllOperated() const { + return num_populated_ >= num_threads_; + } + + bool AllDone() const { + return num_done_ >= num_threads_; + } + + bool AllVotedReopen() { + return (vote_reopen_ == 0); + } + + void SetStart() { + start_ = true; + } + + void SetStartVerify() { + start_verify_ = true; + } + + bool Started() const { + return start_; + } + + bool VerifyStarted() const { + return start_verify_; + } + + port::Mutex* GetMutexForKey(long key) { + return &key_locks_[key >> log2_keys_per_lock_]; + } + + void Put(long key, uint32_t value_base) { + values_[key] = value_base; + } + + uint32_t Get(long key) const { + return values_[key]; + } + + void Delete(long key) const { + values_[key] = SENTINEL; + } + + uint32_t GetSeed() const { + return seed_; + } + + private: + port::Mutex mu_; + port::CondVar cv_; + const uint32_t seed_; + const long max_key_; + const uint32_t log2_keys_per_lock_; + const int num_threads_; + long num_initialized_; + long num_populated_; + long vote_reopen_; + long num_done_; + bool start_; + bool start_verify_; + StressTest* stress_test_; + + uint32_t *values_; + port::Mutex *key_locks_; + +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + uint32_t tid; // 0..n-1 + Random rand; // Has different seeds for different threads + SharedState* shared; + Stats stats; + + ThreadState(uint32_t index, SharedState *shared) + : tid(index), + rand(1000 + index + shared->GetSeed()), + shared(shared) { + } +}; + +} // namespace + +class StressTest { + public: + StressTest() + : cache_(NewLRUCache(FLAGS_cache_size)), + compressed_cache_(FLAGS_compressed_cache_size >= 0 ? + NewLRUCache(FLAGS_compressed_cache_size) : + nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 + ? 
NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), + db_(nullptr), + num_times_reopened_(0) { + if (FLAGS_destroy_db_initially) { + std::vector files; + FLAGS_env->GetChildren(FLAGS_db, &files); + for (unsigned int i = 0; i < files.size(); i++) { + if (Slice(files[i]).starts_with("heap-")) { + FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]); + } + } + DestroyDB(FLAGS_db, Options()); + } + } + + ~StressTest() { + delete db_; + delete filter_policy_; + } + + void Run() { + PrintEnv(); + Open(); + SharedState shared(this); + uint32_t n = shared.GetNumThreads(); + + std::vector threads(n); + for (uint32_t i = 0; i < n; i++) { + threads[i] = new ThreadState(i, &shared); + FLAGS_env->StartThread(ThreadBody, threads[i]); + } + // Each thread goes through the following states: + // initializing -> wait for others to init -> read/populate/depopulate + // wait for others to operate -> verify -> done + + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + + double now = FLAGS_env->NowMicros(); + fprintf(stdout, "%s Starting database operations\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + while (!shared.AllOperated()) { + shared.GetCondVar()->Wait(); + } + + now = FLAGS_env->NowMicros(); + if (FLAGS_test_batches_snapshots) { + fprintf(stdout, "%s Limited verification already done during gets\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + } else { + fprintf(stdout, "%s Starting verification\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + } + + shared.SetStartVerify(); + shared.GetCondVar()->SignalAll(); + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + } + + for (unsigned int i = 1; i < n; i++) { + threads[0]->stats.Merge(threads[i]->stats); + } + threads[0]->stats.Report("Stress Test"); + + for (unsigned int i = 0; i < n; i++) { + delete threads[i]; + threads[i] = nullptr; + } + double 
now = FLAGS_env->NowMicros(); + if (!FLAGS_test_batches_snapshots) { + fprintf(stdout, "%s Verification successful\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str()); + } + PrintStatistics(); + } + + private: + + static void ThreadBody(void* v) { + ThreadState* thread = reinterpret_cast(v); + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetStressTest()->OperateDb(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncOperated(); + if (shared->AllOperated()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->VerifyStarted()) { + shared->GetCondVar()->Wait(); + } + } + + if (!FLAGS_test_batches_snapshots) { + thread->shared->GetStressTest()->VerifyDb(thread); + } + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + + } + + // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ... + // ("9"+K, "9"+V) in DB atomically i.e in a single batch. + // Also refer MultiGet. 
+ Status MultiPut(ThreadState* thread, + const WriteOptions& writeoptions, + const Slice& key, const Slice& value, size_t sz) { + std::string keys[10] = {"9", "8", "7", "6", "5", + "4", "3", "2", "1", "0"}; + std::string values[10] = {"9", "8", "7", "6", "5", + "4", "3", "2", "1", "0"}; + Slice value_slices[10]; + WriteBatch batch; + Status s; + for (int i = 0; i < 10; i++) { + keys[i] += key.ToString(); + values[i] += value.ToString(); + value_slices[i] = values[i]; + if (FLAGS_use_merge) { + batch.Merge(keys[i], value_slices[i]); + } else { + batch.Put(keys[i], value_slices[i]); + } + } + + s = db_->Write(writeoptions, &batch); + if (!s.ok()) { + fprintf(stderr, "multiput error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + // we did 10 writes each of size sz + 1 + thread->stats.AddBytesForWrites(10, (sz + 1) * 10); + } + + return s; + } + + // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K) + // in DB atomically i.e in a single batch. Also refer MultiGet. + Status MultiDelete(ThreadState* thread, + const WriteOptions& writeoptions, + const Slice& key) { + std::string keys[10] = {"9", "7", "5", "3", "1", + "8", "6", "4", "2", "0"}; + + WriteBatch batch; + Status s; + for (int i = 0; i < 10; i++) { + keys[i] += key.ToString(); + batch.Delete(keys[i]); + } + + s = db_->Write(writeoptions, &batch); + if (!s.ok()) { + fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + thread->stats.AddDeletes(10); + } + + return s; + } + + // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K + // in the same snapshot, and verifies that all the values are of the form + // "0"+V, "1"+V,..."9"+V. + // ASSUMES that MultiPut was used to put (K, V) into the DB. 
+ Status MultiGet(ThreadState* thread, + const ReadOptions& readoptions, + const Slice& key, std::string* value) { + std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + Slice key_slices[10]; + std::string values[10]; + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = db_->GetSnapshot(); + Status s; + for (int i = 0; i < 10; i++) { + keys[i] += key.ToString(); + key_slices[i] = keys[i]; + s = db_->Get(readoptionscopy, key_slices[i], value); + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + values[i] = ""; + thread->stats.AddErrors(1); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (s.IsNotFound()) { + values[i] = ""; + thread->stats.AddGets(1, 0); + } else { + values[i] = *value; + + char expected_prefix = (keys[i])[0]; + char actual_prefix = (values[i])[0]; + if (actual_prefix != expected_prefix) { + fprintf(stderr, "error expected prefix = %c actual = %c\n", + expected_prefix, actual_prefix); + } + (values[i])[0] = ' '; // blank out the differing character + thread->stats.AddGets(1, 1); + } + } + db_->ReleaseSnapshot(readoptionscopy.snapshot); + + // Now that we retrieved all values, check that they all match + for (int i = 1; i < 10; i++) { + if (values[i] != values[0]) { + fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", + key.ToString().c_str(), values[0].c_str(), + values[i].c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + } + + return s; + } + + // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P + // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes + // of the key. Each of these 10 scans returns a series of values; + // each series should be the same length, and it is verified for each + // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V. 
+ // ASSUMES that MultiPut was used to put (K, V) + Status MultiPrefixScan(ThreadState* thread, + const ReadOptions& readoptions, + const Slice& key) { + std::string prefixes[10] = {"0", "1", "2", "3", "4", + "5", "6", "7", "8", "9"}; + Slice prefix_slices[10]; + ReadOptions readoptionscopy[10]; + const Snapshot* snapshot = db_->GetSnapshot(); + Iterator* iters[10]; + Status s = Status::OK(); + for (int i = 0; i < 10; i++) { + prefixes[i] += key.ToString(); + prefixes[i].resize(FLAGS_prefix_size); + prefix_slices[i] = Slice(prefixes[i]); + readoptionscopy[i] = readoptions; + readoptionscopy[i].prefix_seek = true; + readoptionscopy[i].snapshot = snapshot; + iters[i] = db_->NewIterator(readoptionscopy[i]); + iters[i]->Seek(prefix_slices[i]); + } + + int count = 0; + while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) { + count++; + std::string values[10]; + // get list of all values for this iteration + for (int i = 0; i < 10; i++) { + // no iterator should finish before the first one + assert(iters[i]->Valid() && + iters[i]->key().starts_with(prefix_slices[i])); + values[i] = iters[i]->value().ToString(); + + char expected_first = (prefixes[i])[0]; + char actual_first = (values[i])[0]; + + if (actual_first != expected_first) { + fprintf(stderr, "error expected first = %c actual = %c\n", + expected_first, actual_first); + } + (values[i])[0] = ' '; // blank out the differing character + } + // make sure all values are equivalent + for (int i = 0; i < 10; i++) { + if (values[i] != values[0]) { + fprintf(stderr, "error : inconsistent values for prefix %s: %s, %s\n", + prefixes[i].c_str(), values[0].c_str(), + values[i].c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + iters[i]->Next(); + } + } + + // cleanup iterators and snapshot + for (int i = 0; i < 10; i++) { + // if the first iterator finished, they should have all finished + assert(!iters[i]->Valid() || + 
!iters[i]->key().starts_with(prefix_slices[i])); + assert(iters[i]->status().ok()); + delete iters[i]; + } + db_->ReleaseSnapshot(snapshot); + + if (s.ok()) { + thread->stats.AddPrefixes(1, count); + } else { + thread->stats.AddErrors(1); + } + + return s; + } + + // Given a key K, this creates an iterator which scans to K and then + // does a random sequence of Next/Prev operations. + Status MultiIterate(ThreadState* thread, + const ReadOptions& readoptions, + const Slice& key) { + Status s; + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = snapshot; + readoptionscopy.prefix_seek = FLAGS_prefix_size > 0; + unique_ptr iter(db_->NewIterator(readoptionscopy)); + + iter->Seek(key); + for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) { + if (thread->rand.OneIn(2)) { + iter->Next(); + } else { + iter->Prev(); + } + } + + if (s.ok()) { + thread->stats.AddIterations(1); + } else { + thread->stats.AddErrors(1); + } + + db_->ReleaseSnapshot(snapshot); + + return s; + } + + void OperateDb(ThreadState* thread) { + ReadOptions read_opts(FLAGS_verify_checksum, true); + WriteOptions write_opts; + char value[100]; + long max_key = thread->shared->GetMaxKey(); + std::string from_db; + if (FLAGS_sync) { + write_opts.sync = true; + } + write_opts.disableWAL = FLAGS_disable_wal; + const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent; + const int writeBound = prefixBound + (int)FLAGS_writepercent; + const int delBound = writeBound + (int)FLAGS_delpercent; + + thread->stats.Start(); + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + if(i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) { + { + thread->stats.FinishedSingleOp(); + MutexLock l(thread->shared->GetMutex()); + thread->shared->IncVotedReopen(); + if (thread->shared->AllVotedReopen()) { + thread->shared->GetStressTest()->Reopen(); + thread->shared->GetCondVar()->SignalAll(); + } + else { + 
thread->shared->GetCondVar()->Wait(); + } + // Commenting this out as we don't want to reset stats on each open. + // thread->stats.Start(); + } + } + + long rand_key = thread->rand.Next() % max_key; + std::string keystr = Key(rand_key); + Slice key = keystr; + int prob_op = thread->rand.Uniform(100); + + if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { + // OPERATION read + if (!FLAGS_test_batches_snapshots) { + Status s = db_->Get(read_opts, key, &from_db); + if (s.ok()) { + // found case + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + // not found case + thread->stats.AddGets(1, 0); + } else { + // errors case + thread->stats.AddErrors(1); + } + } else { + MultiGet(thread, read_opts, key, &from_db); + } + } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { + // OPERATION prefix scan + // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are + // (8 - FLAGS_prefix_size) bytes besides the prefix. So there will + // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same + // prefix + if (!FLAGS_test_batches_snapshots) { + Slice prefix = Slice(key.data(), FLAGS_prefix_size); + read_opts.prefix_seek = true; + Iterator* iter = db_->NewIterator(read_opts); + int64_t count = 0; + for (iter->Seek(prefix); + iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { + ++count; + } + assert(count <= + (static_cast(1) << ((8 - FLAGS_prefix_size) * 8))); + if (iter->status().ok()) { + thread->stats.AddPrefixes(1, count); + } else { + thread->stats.AddErrors(1); + } + delete iter; + } else { + MultiPrefixScan(thread, read_opts, key); + } + } else if (prefixBound <= prob_op && prob_op < writeBound) { + // OPERATION write + uint32_t value_base = thread->rand.Next(); + size_t sz = GenerateValue(value_base, value, sizeof(value)); + Slice v(value, sz); + if (!FLAGS_test_batches_snapshots) { + MutexLock l(thread->shared->GetMutexForKey(rand_key)); + if (FLAGS_verify_before_write) { + std::string keystr2 = 
Key(rand_key); + Slice k = keystr2; + Status s = db_->Get(read_opts, k, &from_db); + VerifyValue(rand_key, + read_opts, + *(thread->shared), + from_db, + s, + true); + } + thread->shared->Put(rand_key, value_base); + if (FLAGS_use_merge) { + db_->Merge(write_opts, key, v); + } else { + db_->Put(write_opts, key, v); + } + thread->stats.AddBytesForWrites(1, sz); + } else { + MultiPut(thread, write_opts, key, v, sz); + } + PrintKeyValue(rand_key, value, sz); + } else if (writeBound <= prob_op && prob_op < delBound) { + // OPERATION delete + if (!FLAGS_test_batches_snapshots) { + MutexLock l(thread->shared->GetMutexForKey(rand_key)); + thread->shared->Delete(rand_key); + db_->Delete(write_opts, key); + thread->stats.AddDeletes(1); + } else { + MultiDelete(thread, write_opts, key); + } + } else { + // OPERATION iterate + MultiIterate(thread, read_opts, key); + } + thread->stats.FinishedSingleOp(); + } + + thread->stats.Stop(); + } + + void VerifyDb(ThreadState* thread) const { + ReadOptions options(FLAGS_verify_checksum, true); + const SharedState& shared = *(thread->shared); + static const long max_key = shared.GetMaxKey(); + static const long keys_per_thread = max_key / shared.GetNumThreads(); + long start = keys_per_thread * thread->tid; + long end = start + keys_per_thread; + if (thread->tid == shared.GetNumThreads() - 1) { + end = max_key; + } + + if (!thread->rand.OneIn(2)) { + options.prefix_seek = FLAGS_prefix_size > 0; + // Use iterator to verify this range + unique_ptr iter(db_->NewIterator(options)); + iter->Seek(Key(start)); + for (long i = start; i < end; i++) { + // TODO(ljin): update "long" to uint64_t + // Reseek when the prefix changes + if (i % (static_cast(1) << 8 * (8 - FLAGS_prefix_size)) == 0) { + iter->Seek(Key(i)); + } + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = iter->status(); + if (iter->Valid()) { + if (iter->key().compare(k) > 0) { + s = Status::NotFound(Slice()); + } else if (iter->key().compare(k) 
== 0) { + from_db = iter->value().ToString(); + iter->Next(); + } else if (iter->key().compare(k) < 0) { + VerificationAbort("An out of range key was found", i); + } + } else { + // The iterator found no value for the key in question, so do not + // move to the next item in the iterator + s = Status::NotFound(Slice()); + } + VerifyValue(i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(i, from_db.data(), from_db.length()); + } + } + } else { + // Use Get to verify this range + for (long i = start; i < end; i++) { + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = db_->Get(options, k, &from_db); + VerifyValue(i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(i, from_db.data(), from_db.length()); + } + } + } + } + + void VerificationAbort(std::string msg, long key) const { + fprintf(stderr, "Verification failed for key %ld: %s\n", + key, msg.c_str()); + exit(1); + } + + void VerifyValue(long key, + const ReadOptions &opts, + const SharedState &shared, + const std::string &value_from_db, + Status s, + bool strict=false) const { + // compare value_from_db with the value in the shared state + char value[100]; + uint32_t value_base = shared.Get(key); + if (value_base == SharedState::SENTINEL && !strict) { + return; + } + + if (s.ok()) { + if (value_base == SharedState::SENTINEL) { + VerificationAbort("Unexpected value found", key); + } + size_t sz = GenerateValue(value_base, value, sizeof(value)); + if (value_from_db.length() != sz) { + VerificationAbort("Length of value read is not equal", key); + } + if (memcmp(value_from_db.data(), value, sz) != 0) { + VerificationAbort("Contents of value read don't match", key); + } + } else { + if (value_base != SharedState::SENTINEL) { + VerificationAbort("Value not found", key); + } + } + } + + static void PrintKeyValue(uint32_t key, const char *value, size_t sz) { + if (!FLAGS_verbose) return; + fprintf(stdout, "%u ==> (%u) ", key, 
(unsigned int)sz); + for (size_t i=0; i= sizeof(uint32_t)); + *((uint32_t*)v) = rand; + for (size_t i=sizeof(uint32_t); i < value_sz; i++) { + v[i] = (char)(rand ^ i); + } + v[value_sz] = '\0'; + return value_sz; // the size of the value set. + } + + void PrintEnv() const { + fprintf(stdout, "LevelDB version : %d.%d\n", + kMajorVersion, kMinorVersion); + fprintf(stdout, "Number of threads : %d\n", FLAGS_threads); + fprintf(stdout, + "Ops per thread : %lu\n", + (unsigned long)FLAGS_ops_per_thread); + std::string ttl_state("unused"); + if (FLAGS_ttl > 0) { + ttl_state = NumberToString(FLAGS_ttl); + } + fprintf(stdout, "Time to live(sec) : %s\n", ttl_state.c_str()); + fprintf(stdout, "Read percentage : %d%%\n", FLAGS_readpercent); + fprintf(stdout, "Prefix percentage : %d%%\n", FLAGS_prefixpercent); + fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent); + fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent); + fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); + fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); + fprintf(stdout, + "Iterations : %lu\n", + (unsigned long)FLAGS_num_iterations); + fprintf(stdout, + "Max key : %lu\n", + (unsigned long)FLAGS_max_key); + fprintf(stdout, "Ratio #ops/#keys : %f\n", + (1.0 * FLAGS_ops_per_thread * FLAGS_threads)/FLAGS_max_key); + fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen); + fprintf(stdout, "Batches/snapshots : %d\n", + FLAGS_test_batches_snapshots); + fprintf(stdout, "Purge redundant %% : %d\n", + FLAGS_purge_redundant_percent); + fprintf(stdout, "Deletes use filter : %d\n", + FLAGS_filter_deletes); + fprintf(stdout, "Num keys per lock : %d\n", + 1 << FLAGS_log2_keys_per_lock); + + const char* compression = ""; + switch (FLAGS_compression_type_e) { + case rocksdb::kNoCompression: + compression = "none"; + break; + case rocksdb::kSnappyCompression: + compression = "snappy"; + break; + case rocksdb::kZlibCompression: + compression = "zlib"; + break; + 
case rocksdb::kBZip2Compression: + compression = "bzip2"; + break; + case rocksdb::kLZ4Compression: + compression = "lz4"; + case rocksdb::kLZ4HCCompression: + compression = "lz4hc"; + break; + } + + fprintf(stdout, "Compression : %s\n", compression); + + const char* memtablerep = ""; + switch (FLAGS_rep_factory) { + case kSkipList: + memtablerep = "skip_list"; + break; + case kHashSkipList: + memtablerep = "prefix_hash"; + break; + case kVectorRep: + memtablerep = "vector"; + break; + } + + fprintf(stdout, "Memtablerep : %s\n", memtablerep); + + fprintf(stdout, "------------------------------------------------\n"); + } + + void Open() { + assert(db_ == nullptr); + Options options; + options.block_cache = cache_; + options.block_cache_compressed = compressed_cache_; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + options.max_background_compactions = FLAGS_max_background_compactions; + options.compaction_style = + static_cast(FLAGS_compaction_style); + options.block_size = FLAGS_block_size; + options.filter_policy = filter_policy_; + options.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size)); + options.max_open_files = FLAGS_open_files; + options.statistics = dbstats; + options.env = FLAGS_env; + options.disableDataSync = FLAGS_disable_data_sync; + options.use_fsync = FLAGS_use_fsync; + options.allow_mmap_reads = FLAGS_mmap_read; + rocksdb_kill_odds = FLAGS_kill_random_test; + options.target_file_size_base = FLAGS_target_file_size_base; + options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; + options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; + options.max_bytes_for_level_multiplier = + FLAGS_max_bytes_for_level_multiplier; + options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; + options.level0_slowdown_writes_trigger = + 
FLAGS_level0_slowdown_writes_trigger; + options.level0_file_num_compaction_trigger = + FLAGS_level0_file_num_compaction_trigger; + options.compression = FLAGS_compression_type_e; + options.create_if_missing = true; + options.disable_seek_compaction = FLAGS_disable_seek_compaction; + options.delete_obsolete_files_period_micros = + FLAGS_delete_obsolete_files_period_micros; + options.max_manifest_file_size = 1024; + options.filter_deletes = FLAGS_filter_deletes; + if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { + fprintf(stderr, + "prefix_size should be non-zero iff memtablerep == prefix_hash\n"); + exit(1); + } + switch (FLAGS_rep_factory) { + case kHashSkipList: + options.memtable_factory.reset(NewHashSkipListRepFactory()); + break; + case kSkipList: + // no need to do anything + break; + case kVectorRep: + options.memtable_factory.reset(new VectorRepFactory()); + break; + } + static Random purge_percent(1000); // no benefit from non-determinism here + if (static_cast(purge_percent.Uniform(100)) < + FLAGS_purge_redundant_percent - 1) { + options.purge_redundant_kvs_while_flush = false; + } + + if (FLAGS_use_merge) { + options.merge_operator = MergeOperators::CreatePutOperator(); + } + + // set universal style compaction configurations, if applicable + if (FLAGS_universal_size_ratio != 0) { + options.compaction_options_universal.size_ratio = + FLAGS_universal_size_ratio; + } + if (FLAGS_universal_min_merge_width != 0) { + options.compaction_options_universal.min_merge_width = + FLAGS_universal_min_merge_width; + } + if (FLAGS_universal_max_merge_width != 0) { + options.compaction_options_universal.max_merge_width = + FLAGS_universal_max_merge_width; + } + if (FLAGS_universal_max_size_amplification_percent != 0) { + options.compaction_options_universal.max_size_amplification_percent = + FLAGS_universal_max_size_amplification_percent; + } + + fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); + + Status s; + if (FLAGS_ttl == -1) { + s = 
DB::Open(options, FLAGS_db, &db_); + } else { + s = UtilityDB::OpenTtlDB(options, FLAGS_db, &sdb_, FLAGS_ttl); + db_ = sdb_; + } + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + void Reopen() { + // do not close the db. Just delete the lock file. This + // simulates a crash-recovery kind of situation. + if (FLAGS_ttl != -1) { + ((DBWithTTL*) db_)->TEST_Destroy_DBWithTtl(); + } else { + ((DBImpl*) db_)->TEST_Destroy_DBImpl(); + } + db_ = nullptr; + + num_times_reopened_++; + double now = FLAGS_env->NowMicros(); + fprintf(stdout, "%s Reopening database for the %dth time\n", + FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(), + num_times_reopened_); + Open(); + } + + void PrintStatistics() { + if (dbstats) { + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); + } + } + + private: + shared_ptr cache_; + shared_ptr compressed_cache_; + const FilterPolicy* filter_policy_; + DB* db_; + StackableDB* sdb_; + int num_times_reopened_; +}; + +} // namespace rocksdb + + + +int main(int argc, char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_statistics) { + dbstats = rocksdb::CreateDBStatistics(); + } + FLAGS_compression_type_e = + StringToCompressionType(FLAGS_compression_type.c_str()); + if (!FLAGS_hdfs.empty()) { + FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs); + } + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); + + // The number of background threads should be at least as much the + // max number of concurrent compactions. 
+ FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); + + if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) { + fprintf(stderr, + "Error: prefixpercent is non-zero while prefix_size is " + "not positive!\n"); + exit(1); + } + if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) { + fprintf(stderr, + "Error: please specify prefix_size for " + "test_batches_snapshots test!\n"); + exit(1); + } + if ((FLAGS_readpercent + FLAGS_prefixpercent + + FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) { + fprintf(stderr, + "Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n"); + exit(1); + } + if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) { + fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n"); + exit(1); + } + if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) { + fprintf(stderr, + "Error: #DB-reopens should be < ops_per_thread\n" + "Provided reopens = %d and ops_per_thread = %lu\n", + FLAGS_reopen, + (unsigned long)FLAGS_ops_per_thread); + exit(1); + } + + // Choose a location for the test database if none given with --db= + if (FLAGS_db.empty()) { + std::string default_db_path; + rocksdb::Env::Default()->GetTestDirectory(&default_db_path); + default_db_path += "/dbstress"; + FLAGS_db = default_db_path; + } + + rocksdb::StressTest stress; + stress.Run(); + return 0; +} diff --git a/tools/ldb.cc b/tools/ldb.cc new file mode 100644 index 00000000..4581b801 --- /dev/null +++ b/tools/ldb.cc @@ -0,0 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// + +#include "rocksdb/ldb_tool.h" + +int main(int argc, char** argv) { + rocksdb::LDBTool tool; + tool.Run(argc, argv); + return 0; +} diff --git a/tools/ldb_test.py b/tools/ldb_test.py new file mode 100644 index 00000000..b4ef5221 --- /dev/null +++ b/tools/ldb_test.py @@ -0,0 +1,383 @@ +import os +import os.path +import shutil +import subprocess +import time +import unittest +import tempfile + +def my_check_output(*popenargs, **kwargs): + """ + If we had python 2.7, we should simply use subprocess.check_output. + This is a stop-gap solution for python 2.6 + """ + if 'stdout' in kwargs: + raise ValueError('stdout argument not allowed, it will be overridden.') + process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE, + *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise Exception("Exit code is not 0. It is %d. Command: %s" % + (retcode, cmd)) + return output + +def run_err_null(cmd): + return os.system(cmd + " 2>/dev/null ") + +class LDBTestCase(unittest.TestCase): + def setUp(self): + self.TMP_DIR = tempfile.mkdtemp(prefix="ldb_test_") + self.DB_NAME = "testdb" + + def tearDown(self): + assert(self.TMP_DIR.strip() != "/" + and self.TMP_DIR.strip() != "/tmp" + and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia + + shutil.rmtree(self.TMP_DIR) + + def dbParam(self, dbName): + return "--db=%s" % os.path.join(self.TMP_DIR, dbName) + + def assertRunOKFull(self, params, expectedOutput, unexpected=False): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. 
+ + """ + + output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" % + params, shell=True) + if not unexpected: + self.assertEqual(output.strip(), expectedOutput.strip()) + else: + self.assertNotEqual(output.strip(), expectedOutput.strip()) + + def assertRunFAILFull(self, params): + """ + All command-line params must be specified. + Allows full flexibility in testing; for example: missing db param. + + """ + try: + + my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \ + thread\"" % params, shell=True) + except Exception, e: + return + self.fail( + "Exception should have been raised for command with params: %s" % + params) + + def assertRunOK(self, params, expectedOutput, unexpected=False): + """ + Uses the default test db. + + """ + self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params), + expectedOutput, unexpected) + + def assertRunFAIL(self, params): + """ + Uses the default test db. + """ + self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params)) + + def testSimpleStringPutGet(self): + print "Running testSimpleStringPutGet..." 
+ self.assertRunFAIL("put x1 y1") + self.assertRunOK("put --create_if_missing x1 y1", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunFAIL("get x2") + + self.assertRunOK("put x2 y2", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("get x2", "y2") + self.assertRunFAIL("get x3") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2") + self.assertRunOK("put x3 y3", "OK") + + self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("scan --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=2", + "x1 : y1\nx2 : y2") + + self.assertRunOK("scan --from=x1 --to=z --max_keys=3", + "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x1 --to=z --max_keys=4", + "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1") + self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3") + self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL + self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo") + + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + + self.assertRunOK("delete x1", "OK") + self.assertRunOK("scan", "x2 : y2\nx3 : y3") + + self.assertRunOK("delete NonExistentKey", "OK") + # It is weird that GET and SCAN raise exception for + # non-existent key, while delete does not + + self.assertRunOK("checkconsistency", "OK") + + def dumpDb(self, params, dumpFile): + return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile)) + + def loadDb(self, params, dumpFile): + return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params)) + + def testStringBatchPut(self): + print "Running testStringBatchPut..." 
+ self.assertRunOK("batchput x1 y1 --create_if_missing", "OK") + self.assertRunOK("scan", "x1 : y1") + self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz") + self.assertRunFAIL("batchput") + self.assertRunFAIL("batchput k1") + self.assertRunFAIL("batchput k1 v1 k2") + + def testCountDelimDump(self): + print "Running testCountDelimDump..." + self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8") + + def testCountDelimIDump(self): + print "Running testCountDelimIDump..." + self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK") + self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK") + self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8") + self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK") + self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8") + + def testInvalidCmdLines(self): + print "Running testInvalidCmdLines..." 
+ # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + + def testHexPutGet(self): + print "Running testHexPutGet..." + self.assertRunOK("put a1 b1 --create_if_missing", "OK") + self.assertRunOK("scan", "a1 : b1") + self.assertRunOK("scan --hex", "0x6131 : 0x6231") + self.assertRunFAIL("put --hex 6132 6232") + self.assertRunOK("put --hex 0x6132 0x6232", "OK") + self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan", "a1 : b1\na2 : b2") + self.assertRunOK("get a1", "b1") + self.assertRunOK("get --hex 0x6131", "0x6231") + self.assertRunOK("get a2", "b2") + self.assertRunOK("get --hex 0x6132", "0x6232") + self.assertRunOK("get --key_hex 0x6132", "b2") + self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232") + self.assertRunOK("get --value_hex a2", "0x6232") + self.assertRunOK("scan --key_hex --value_hex", + "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan --hex --from=0x6131 --to=0x6133", + "0x6131 : 0x6231\n0x6132 : 0x6232") + self.assertRunOK("scan --hex --from=0x6131 --to=0x6132", + "0x6131 : 0x6231") + self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2") + self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232") + self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4") + self.assertRunOK("delete --hex 0x6133", "OK") + self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4") + self.assertRunOK("checkconsistency", "OK") + + def testTtlPutGet(self): + print "Running testTtlPutGet..." 
+ self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK") + self.assertRunOK("scan --hex", "0x6131 : 0x6231", True) + self.assertRunOK("dump --ttl ", "a1 ==> b1", True) + self.assertRunOK("dump --hex --ttl ", + "0x6131 ==> 0x6231\nKeys in range: 1") + self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231") + self.assertRunOK("get --value_hex a1", "0x6231", True) + self.assertRunOK("get --ttl a1", "b1") + self.assertRunOK("put a3 b3 --create_if_missing", "OK") + # fails because timstamp's length is greater than value's + self.assertRunFAIL("get --ttl a3") + self.assertRunOK("checkconsistency", "OK") + + def testInvalidCmdLines(self): + print "Running testInvalidCmdLines..." + # db not specified + self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing") + # No param called he + self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing") + # max_keys is not applicable for put + self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing") + # hex has invalid boolean value + self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing") + + def testDumpLoad(self): + print "Running testDumpLoad..." 
+ self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", + "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + # Dump and load without any additional params specified + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load in hex + dumpFilePath = os.path.join(self.TMP_DIR, "dump2") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2") + self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump only a portion of the key range + dumpFilePath = os.path.join(self.TMP_DIR, "dump3") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3") + self.assertTrue(self.dumpDb( + "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2") + + # Dump upto max_keys rows + dumpFilePath = os.path.join(self.TMP_DIR, "dump4") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4") + self.assertTrue(self.dumpDb( + "--db=%s --max_keys=3" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3") + + # Load into an existing db, create_if_missing is not specified + self.assertTrue(self.dumpDb("--db=%s" % 
origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load with WAL disabled + dumpFilePath = os.path.join(self.TMP_DIR, "dump5") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5") + self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --disable_wal --create_if_missing" % loadedDbPath, + dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump and load with lots of extra params specified + extraParams = " ".join(["--bloom_bits=14", "--compression_type=bzip2", + "--block_size=1024", "--auto_compaction=true", + "--write_buffer_size=4194304", + "--file_size=2097152"]) + dumpFilePath = os.path.join(self.TMP_DIR, "dump6") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6") + self.assertTrue(self.dumpDb( + "--db=%s %s" % (origDbPath, extraParams), dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams), + dumpFilePath)) + self.assertRunOKFull("scan --db=%s" % loadedDbPath, + "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + # Dump with count_only + dumpFilePath = os.path.join(self.TMP_DIR, "dump7") + loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7") + self.assertTrue(self.dumpDb( + "--db=%s --count_only" % origDbPath, dumpFilePath)) + self.assertTrue(self.loadDb( + "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)) + # DB should have atleast one value for scan to work + self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK") + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1") + + # Dump command fails because of typo in params + dumpFilePath = os.path.join(self.TMP_DIR, "dump8") + self.assertFalse(self.dumpDb( + "--db=%s --create_if_missing" % origDbPath, dumpFilePath)) + + def 
testMiscAdminTask(self): + print "Running testMiscAdminTask..." + # These tests need to be improved; for example with asserts about + # whether compaction or level reduction actually took place. + self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", + "OK") + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + self.assertTrue(0 == run_err_null( + "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134" + % origDbPath)) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + #TODO(dilip): Not sure what should be passed to WAL.Currently corrupted. + self.assertTrue(0 == run_err_null( + "./ldb dump_wal --db=%s --walfile=%s --header" % ( + origDbPath, os.path.join(origDbPath, "LOG")))) + self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + + def testCheckConsistency(self): + print "Running testCheckConsistency..." 
+ + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + self.assertRunOK("put x2 y2", "OK") + self.assertRunOK("get x1", "y1") + self.assertRunOK("checkconsistency", "OK") + + sstFilePath = my_check_output("ls %s" % os.path.join(dbPath, "*.sst"), + shell=True) + + # Modify the file + my_check_output("echo 'evil' > %s" % sstFilePath, shell=True) + self.assertRunFAIL("checkconsistency") + + # Delete the file + my_check_output("rm -f %s" % sstFilePath, shell=True) + self.assertRunFAIL("checkconsistency") + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc new file mode 100644 index 00000000..b588b52d --- /dev/null +++ b/tools/reduce_levels_test.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "rocksdb/db.h" +#include "db/db_impl.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testutil.h" +#include "util/testharness.h" +#include "util/ldb_cmd.h" + +namespace rocksdb { + +class ReduceLevelTest { +public: + ReduceLevelTest() { + dbname_ = test::TmpDir() + "/db_reduce_levels_test"; + DestroyDB(dbname_, Options()); + db_ = nullptr; + } + + Status OpenDB(bool create_if_missing, int levels, + int mem_table_compact_level); + + Status Put(const std::string& k, const std::string& v) { + return db_->Put(WriteOptions(), k, v); + } + + std::string Get(const std::string& k) { + ReadOptions options; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + Status CompactMemTable() { + if (db_ == nullptr) { + return Status::InvalidArgument("DB not opened."); + } + DBImpl* db_impl = reinterpret_cast(db_); + return db_impl->TEST_FlushMemTable(); + } + + void CloseDB() { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + bool ReduceLevels(int target_level); + + int FilesOnLevel(int level) { + std::string property; + ASSERT_TRUE( + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), + &property)); + return atoi(property.c_str()); + } + +private: + std::string dbname_; + DB* db_; +}; + +Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels, + int mem_table_compact_level) { + rocksdb::Options opt; + opt.num_levels = num_levels; + opt.create_if_missing = create_if_missing; + opt.max_mem_compaction_level = mem_table_compact_level; + rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_); + if (!st.ok()) { + fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str()); + } + return st; +} + +bool ReduceLevelTest::ReduceLevels(int target_level) { + std::vector args = rocksdb::ReduceDBLevelsCommand::PrepareArgs( + dbname_, target_level, false); + 
LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(args); + level_reducer->Run(); + bool is_succeed = level_reducer->GetExecuteState().IsSucceed(); + delete level_reducer; + return is_succeed; +} + +TEST(ReduceLevelTest, Last_Level) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 4, 3)); + ASSERT_OK(Put("aaaa", "11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(3), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 1)); + ASSERT_EQ(FilesOnLevel(2), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 1)); + ASSERT_EQ(FilesOnLevel(1), 1); + CloseDB(); +} + +TEST(ReduceLevelTest, Top_Level) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 5, 0)); + ASSERT_OK(Put("aaaa", "11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(0), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + ASSERT_OK(OpenDB(true, 4, 0)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 0)); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 0)); + CloseDB(); +} + +TEST(ReduceLevelTest, All_Levels) { + // create files on all levels; + ASSERT_OK(OpenDB(true, 5, 1)); + ASSERT_OK(Put("a", "a11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 2)); + ASSERT_OK(Put("b", "b11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 3)); + ASSERT_OK(Put("c", "c11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + CloseDB(); + + ASSERT_OK(OpenDB(true, 5, 4)); + ASSERT_OK(Put("d", "d11111")); + ASSERT_OK(CompactMemTable()); + ASSERT_EQ(FilesOnLevel(1), 1); + ASSERT_EQ(FilesOnLevel(2), 1); + ASSERT_EQ(FilesOnLevel(3), 1); + ASSERT_EQ(FilesOnLevel(4), 1); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(4)); + 
ASSERT_OK(OpenDB(true, 4, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(3)); + ASSERT_OK(OpenDB(true, 3, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); + + ASSERT_TRUE(ReduceLevels(2)); + ASSERT_OK(OpenDB(true, 2, 0)); + ASSERT_EQ("a11111", Get("a")); + ASSERT_EQ("b11111", Get("b")); + ASSERT_EQ("c11111", Get("c")); + ASSERT_EQ("d11111", Get("d")); + CloseDB(); +} + +} + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/tools/shell/DBClientProxy.cpp b/tools/shell/DBClientProxy.cpp new file mode 100644 index 00000000..93277ac1 --- /dev/null +++ b/tools/shell/DBClientProxy.cpp @@ -0,0 +1,271 @@ + +#include + +#include "DBClientProxy.h" + + +#include "thrift/lib/cpp/protocol/TBinaryProtocol.h" +#include "thrift/lib/cpp/transport/TSocket.h" +#include "thrift/lib/cpp/transport/TTransportUtils.h" + + + +using namespace std; +using namespace boost; +using namespace Tleveldb; +using namespace apache::thrift::protocol; +using namespace apache::thrift::transport; + +namespace rocksdb { + +DBClientProxy::DBClientProxy(const string & host, int port) : + host_(host), + port_(port), + dbToHandle_(), + dbClient_() { +} + +DBClientProxy::~DBClientProxy() { + cleanUp(); +} + + +void DBClientProxy::connect(void) { + cleanUp(); + printf("Connecting to %s:%d\n", host_.c_str(), port_); + try { + boost::shared_ptr socket(new TSocket(host_, port_)); + boost::shared_ptr transport(new TBufferedTransport(socket)); + boost::shared_ptr protocol(new TBinaryProtocol(transport)); + dbClient_.reset(new DBClient(protocol)); + + transport->open(); + } catch (const std::exception & e) { + dbClient_.reset(); + throw; + } +} + +void DBClientProxy::cleanUp(void) { + if(dbClient_.get()) { + for(map::iterator itor = 
dbToHandle_.begin(); + itor != dbToHandle_.end(); + ++itor) { + dbClient_->Close(itor->second, itor->first); + } + dbClient_.reset(); + } + dbToHandle_.clear(); +} + +void DBClientProxy::open(const string & db) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return; + } + + // printf("opening database : %s\n", db.c_str()); + // we use default DBOptions here + DBOptions opt; + DBHandle handle; + try { + dbClient_->Open(handle, db, opt); + } catch (const LeveldbException & e) { + printf("%s\n", e.message.c_str()); + if(kIOError == e.errorCode) { + printf("no such database : %s\n", db.c_str()); + return; + }else { + printf("Unknown error : %d\n", e.errorCode); + return; + } + } + + dbToHandle_[db] = handle; +} + + +bool DBClientProxy::create(const string & db) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + printf("creating database : %s\n", db.c_str()); + DBOptions opt; + opt.create_if_missing = true; + opt.error_if_exists = true; + DBHandle handle; + try { + dbClient_->Open(handle, db, opt); + }catch (const LeveldbException & e) { + printf("%s\n", e.message.c_str()); + printf("error code : %d\n", e.errorCode); + if(kNotFound == e.errorCode) { + printf("no such database : %s\n", db.c_str()); + return false;; + } else { + printf("Unknown error : %d\n", e.errorCode); + return false; + } + } + + dbToHandle_[db] = handle; + return true; +} + + +map::iterator +DBClientProxy::getHandle(const string & db) { + map::iterator itor = dbToHandle_.find(db); + if(dbToHandle_.end() == itor) { + open(db); + itor = dbToHandle_.find(db); + } + + return itor; +} + + +bool DBClientProxy::get(const string & db, + const string & key, + string & value) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + map::iterator itor = getHandle(db); + if(dbToHandle_.end() == itor) { + return false; + } + + ResultItem ret; + Slice k; + k.data = key; + k.size = key.size(); + // we use default values of 
options here + ReadOptions opt; + dbClient_->Get(ret, + itor->second, + k, + opt); + if(kOk == ret.status) { + value = ret.value.data; + return true; + } else if(kNotFound == ret.status) { + printf("no such key : %s\n", key.c_str()); + return false; + } else { + printf("get data error : %d\n", ret.status); + return false; + } +} + + + +bool DBClientProxy::put(const string & db, + const string & key, + const string & value) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + map::iterator itor = getHandle(db); + if(dbToHandle_.end() == itor) { + return false; + } + + kv temp; + temp.key.data = key; + temp.key.size = key.size(); + temp.value.data = value; + temp.value.size = value.size(); + WriteOptions opt; + opt.sync = true; + Code code; + code = dbClient_->Put(itor->second, + temp, + opt); + + + if(kOk == code) { + // printf("set value finished\n"); + return true; + } else { + printf("put data error : %d\n", code); + return false; + } +} + +bool DBClientProxy::scan(const string & db, + const string & start_key, + const string & end_key, + const string & limit, + vector > & kvs) { + if(!dbClient_.get()) { + printf("please connect() first\n"); + return false; + } + + int limitInt = -1; + limitInt = atoi(limit.c_str()); + if(limitInt <= 0) { + printf("Error while parse limit : %s\n", limit.c_str()); + return false; + } + + if(start_key > end_key) { + printf("empty range.\n"); + return false; + } + + map::iterator itor = getHandle(db); + if(dbToHandle_.end() == itor) { + return false; + } + + ResultIterator ret; + // we use the default values of options here + ReadOptions opt; + Slice k; + k.data = start_key; + k.size = start_key.size(); + dbClient_->NewIterator(ret, + itor->second, + opt, + seekToKey, + k); + Iterator it; + if(kOk == ret.status) { + it = ret.iterator; + } else { + printf("get iterator error : %d\n", ret.status); + return false; + } + + int idx = 0; + string ck = start_key; + while(idx < limitInt && ck < end_key) { 
+ ResultPair retPair; + dbClient_->GetNext(retPair, itor->second, it); + if(kOk == retPair.status) { + ++idx; + ck = retPair.keyvalue.key.data; + if (ck < end_key) { + kvs.push_back(make_pair(retPair.keyvalue.key.data, + retPair.keyvalue.value.data)); + } + } else if(kEnd == retPair.status) { + printf("not enough values\n"); + return true; + } else { + printf("GetNext() error : %d\n", retPair.status); + return false; + } + } + return true; +} + +} // namespace diff --git a/tools/shell/DBClientProxy.h b/tools/shell/DBClientProxy.h new file mode 100644 index 00000000..fba228b9 --- /dev/null +++ b/tools/shell/DBClientProxy.h @@ -0,0 +1,64 @@ + +#ifndef TOOLS_SHELL_DBCLIENTPROXY +#define TOOLS_SHELL_DBCLIENTPROXY + +#include +#include +#include +#include +#include + +#include "DB.h" + +/* + * class DBClientProxy maintains: + * 1. a connection to rocksdb service + * 2. a map from db names to opened db handles + * + * it's client codes' responsibility to catch all possible exceptions. + */ + +namespace rocksdb { + +class DBClientProxy : private boost::noncopyable { + public: + // connect to host_:port_ + void connect(void); + + // return true on success, false otherwise + bool get(const std::string & db, + const std::string & key, + std::string & value); + + // return true on success, false otherwise + bool put(const std::string & db, + const std::string & key, + const std::string & value); + + // return true on success, false otherwise + bool scan(const std::string & db, + const std::string & start_key, + const std::string & end_key, + const std::string & limit, + std::vector > & kvs); + + // return true on success, false otherwise + bool create(const std::string & db); + + DBClientProxy(const std::string & host, int port); + ~DBClientProxy(); + + private: + // some internal help functions + void cleanUp(void); + void open(const std::string & db); + std::map::iterator getHandle(const std::string & db); + + const std::string host_; + const int port_; + std::map 
dbToHandle_; + boost::shared_ptr dbClient_; +}; + +} // namespace +#endif diff --git a/tools/shell/LeveldbShell.cpp b/tools/shell/LeveldbShell.cpp new file mode 100644 index 00000000..e6274d3b --- /dev/null +++ b/tools/shell/LeveldbShell.cpp @@ -0,0 +1,8 @@ + + +#include "ShellContext.h" + +int main(int argc, char ** argv) { + ShellContext c(argc, argv); + c.run(); +} diff --git a/tools/shell/ShellContext.cpp b/tools/shell/ShellContext.cpp new file mode 100644 index 00000000..05a9bb81 --- /dev/null +++ b/tools/shell/ShellContext.cpp @@ -0,0 +1,104 @@ + +#include +#include + +#include "ShellContext.h" +#include "ShellState.h" + + + +#include "thrift/lib/cpp/protocol/TBinaryProtocol.h" +#include "thrift/lib/cpp/transport/TSocket.h" +#include "thrift/lib/cpp/transport/TTransportUtils.h" + + + +using namespace std; +using namespace boost; +using namespace Tleveldb; +using namespace rocksdb; +using namespace apache::thrift::protocol; +using namespace apache::thrift::transport; + +void ShellContext::changeState(ShellState * pState) { + pShellState_ = pState; +} + +void ShellContext::stop(void) { + exit_ = true; +} + +bool ShellContext::ParseInput(void) { + if(argc_ != 3) { + printf("leveldb_shell host port\n"); + return false; + } + + port_ = atoi(argv_[2]); + if(port_ <= 0) { + printf("Error while parse port : %s\n", argv_[2]); + return false; + } + + clientProxy_.reset(new DBClientProxy(argv_[1], port_)); + if(!clientProxy_.get()) { + return false; + } else { + return true; + } +} + +void ShellContext::connect(void) { + clientProxy_->connect(); +} + +void ShellContext::create(const string & db) { + if (clientProxy_->create(db)) { + printf("%s created\n", db.c_str()); + } +} + +void ShellContext::get(const string & db, + const string & key) { + string v; + if (clientProxy_->get(db, key, v)) { + printf("%s\n", v.c_str()); + } +} + +void ShellContext::put(const string & db, + const string & key, + const string & value) { + if (clientProxy_->put(db, key, value)) { + 
printf("(%s, %s) has been set\n", key.c_str(), value.c_str()); + } +} + +void ShellContext::scan(const string & db, + const string & start_key, + const string & end_key, + const string & limit) { + vector > kvs; + if (clientProxy_->scan(db, start_key, end_key, limit, kvs)) { + for(unsigned int i = 0; i < kvs.size(); ++i) { + printf("%d (%s, %s)\n", i, kvs[i].first.c_str(), kvs[i].second.c_str()); + } + } +} + +void ShellContext::run(void) { + while(!exit_) { + pShellState_->run(this); + } +} + +ShellContext::ShellContext(int argc, char ** argv) : + pShellState_(ShellStateStart::getInstance()), + exit_(false), + argc_(argc), + argv_(argv), + port_(-1), + clientProxy_() { +} + + diff --git a/tools/shell/ShellContext.h b/tools/shell/ShellContext.h new file mode 100644 index 00000000..5c2b9448 --- /dev/null +++ b/tools/shell/ShellContext.h @@ -0,0 +1,51 @@ +#ifndef TOOLS_SHELL_SHELLCONTEXT +#define TOOLS_SHELL_SHELLCONTEXT + +#include +#include +#include +#include + +#include "DB.h" +#include "DBClientProxy.h" + +class ShellState; + +class ShellContext : private boost::noncopyable { + public: + void changeState(ShellState * pState); + + void stop(void); + + bool ParseInput(void); + + void connect(void); + + void get(const std::string & db, + const std::string & key); + + void put(const std::string & db, + const std::string & key, + const std::string & value); + + void scan(const std::string & db, + const std::string & start_key, + const std::string & end_key, + const std::string & limit); + + void create(const std::string & db); + + void run(void); + + ShellContext(int argc, char ** argv); + + private: + ShellState * pShellState_; + bool exit_; + int argc_; + char ** argv_; + int port_; + boost::shared_ptr clientProxy_; +}; + +#endif diff --git a/tools/shell/ShellState.cpp b/tools/shell/ShellState.cpp new file mode 100644 index 00000000..057a337a --- /dev/null +++ b/tools/shell/ShellState.cpp @@ -0,0 +1,139 @@ +#include +#include +#include +#include + +#include 
"ShellState.h" +#include "ShellContext.h" +#include "transport/TTransportException.h" + +using namespace std; + +using namespace apache::thrift::transport; + +const char * PMT = ">> "; + + +void ShellStateStart::run(ShellContext * c) { + if(!c->ParseInput()) { + c->changeState(ShellStateStop::getInstance()); + } else { + c->changeState(ShellStateConnecting::getInstance()); + } +} + + +void ShellStateStop::run(ShellContext * c) { + c->stop(); +} + +void ShellStateConnecting::run(ShellContext * c) { + try { + c->connect(); + } catch (const TTransportException & e) { + cout << e.what() << endl; + c->changeState(ShellStateStop::getInstance()); + return; + } + + c->changeState(ShellStateConnected::getInstance()); +} + +void ShellStateConnected::unknownCmd(void) { + cout << "Unknown command!" << endl; + cout << "Use help to list all available commands" << endl; +} + +void ShellStateConnected::helpMsg(void) { + cout << "Currently supported commands:" << endl; + cout << "create db" << endl; + cout << "get db key" << endl; + cout << "scan db start_key end_key limit" << endl; + cout << "put db key value" << endl; + cout << "exit/quit" << endl; +} + +void ShellStateConnected::handleConError(ShellContext * c) { + cout << "Connection down" << endl; + cout << "Reconnect ? (y/n) :" << endl; + string s; + while(getline(cin, s)) { + if("y" == s) { + c->changeState(ShellStateConnecting::getInstance()); + break; + } else if("n" == s) { + c->changeState(ShellStateStop::getInstance()); + break; + } else { + cout << "Reconnect ? 
(y/n) :" << endl; + } + } +} + +void ShellStateConnected::run(ShellContext * c) { + string line; + cout << PMT; + getline(cin, line); + istringstream is(line); + vector params; + string param; + while(is >> param) { + params.push_back(param); + } + + // empty input line + if(params.empty()) + return; + + if("quit" == params[0] || "exit" == params[0]) { + c->changeState(ShellStateStop::getInstance()); + } else if("get" == params[0]) { + if(params.size() == 3) { + try { + c->get(params[1], params[2]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + } else if("create" == params[0]) { + if(params.size() == 2) { + try { + c->create(params[1]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + }else if("put" == params[0]) { + if(params.size() == 4) { + try { + c->put(params[1], params[2], params[3]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + } else if("scan" == params[0]) { + if(params.size() == 5) { + try { + c->scan(params[1], params[2], params[3], params[4]); + } catch (const TTransportException & e) { + cout << e.what() << endl; + handleConError(c); + } + } else { + unknownCmd(); + } + } else if("help" == params[0]) { + helpMsg(); + } else { + unknownCmd(); + } +} diff --git a/tools/shell/ShellState.h b/tools/shell/ShellState.h new file mode 100644 index 00000000..4027af20 --- /dev/null +++ b/tools/shell/ShellState.h @@ -0,0 +1,87 @@ + +#ifndef TOOLS_SHELL_SHELLSTATE +#define TOOLS_SHELL_SHELLSTATE + +class ShellContext; + +/* + * Currently, there are four types of state in total + * 1. start state: the first state the program enters + * 2. connecting state: the program try to connect to a rocksdb server, whose + * previous states could be "start" or "connected" states + * 3. 
connected states: the program has already connected to a server, and is + * processing user commands + * 4. stop state: the last state the program enters, do some cleaning up things + */ + +class ShellState { + public: + virtual void run(ShellContext *) = 0; + virtual ~ShellState() {} +}; + + +class ShellStateStart : public ShellState { + public: + static ShellStateStart * getInstance(void) { + static ShellStateStart instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateStart() {} + virtual ~ShellStateStart() {} +}; + +class ShellStateStop : public ShellState { + public: + static ShellStateStop * getInstance(void) { + static ShellStateStop instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateStop() {} + virtual ~ShellStateStop() {} + +}; + +class ShellStateConnecting : public ShellState { + public: + static ShellStateConnecting * getInstance(void) { + static ShellStateConnecting instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateConnecting() {} + virtual ~ShellStateConnecting() {} + +}; + +class ShellStateConnected : public ShellState { + public: + static ShellStateConnected * getInstance(void) { + static ShellStateConnected instance; + return &instance; + } + + virtual void run(ShellContext *); + + private: + ShellStateConnected() {} + virtual ~ShellStateConnected() {} + + void unknownCmd(); + void handleConError(ShellContext *); + void helpMsg(); +}; + +#endif + diff --git a/tools/shell/test/DBClientProxyTest.cpp b/tools/shell/test/DBClientProxyTest.cpp new file mode 100644 index 00000000..3b64ffc5 --- /dev/null +++ b/tools/shell/test/DBClientProxyTest.cpp @@ -0,0 +1,182 @@ +/** + * Tests for DBClientProxy class for leveldb + * @author Bo Liu (newpoo.liu@gmail.com) + * Copyright 2012 Facebook + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "server_options.h" + + 
+#include "../DBClientProxy.h" +using namespace rocksdb; + + +using namespace apache::thrift; +using namespace apache::thrift::protocol; +using namespace apache::thrift::transport; +using boost::shared_ptr; +using namespace Tleveldb; +using namespace std; + + + +extern "C" void startServer(int argc, char**argv); +extern "C" void stopServer(int port); +extern ServerOptions server_options; + +static const string db1("db1"); + + +static void testDBClientProxy(DBClientProxy & dbcp) { + bool flag; + const int NOK = 100; + const int BUFSIZE = 16; + int testcase = 0; + + vector keys, values; + vector > kvs, correctKvs; + string k, v; + + for(int i = 0; i < NOK; ++i) { + char bufKey[BUFSIZE]; + char bufValue[BUFSIZE]; + snprintf(bufKey, BUFSIZE, "key%d", i); + snprintf(bufValue, BUFSIZE, "value%d", i); + keys.push_back(bufKey); + values.push_back(bufValue); + correctKvs.push_back((make_pair(string(bufKey), string(bufValue)))); + } + + sort(correctKvs.begin(), correctKvs.end()); + + + // can not do get(), put(), scan() or create() before connected. 
+ flag = dbcp.get(db1, keys[0], v); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + flag = dbcp.put(db1, keys[0], keys[1]); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + flag = dbcp.scan(db1, "a", "w", "100", kvs); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + flag = dbcp.create(db1); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + dbcp.connect(); + + // create a database + flag = dbcp.create(db1); + ASSERT_TRUE(true == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + // no such key + flag = dbcp.get(db1, keys[0], v); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // scan() success with empty returned key-value pairs + kvs.clear(); + flag = dbcp.scan(db1, "a", "w", "100", kvs); + ASSERT_TRUE(true == flag); + ASSERT_TRUE(kvs.empty()); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // put() + for(int i = 0; i < NOK; ++i) { + flag = dbcp.put(db1, keys[i], values[i]); + ASSERT_TRUE(true == flag); + } + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // scan all of key-value pairs + kvs.clear(); + flag = dbcp.scan(db1, "a", "w", "100", kvs); + ASSERT_TRUE(true == flag); + ASSERT_TRUE(kvs == correctKvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + + // scan the first 20 key-value pairs + { + kvs.clear(); + flag = dbcp.scan(db1, "a", "w", "20", kvs); + ASSERT_TRUE(true == flag); + vector > tkvs(correctKvs.begin(), correctKvs.begin() + 20); + ASSERT_TRUE(kvs == tkvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + } + + // scan key[10] to key[50] + { + kvs.clear(); + flag = dbcp.scan(db1, 
correctKvs[10].first, correctKvs[50].first, "100", kvs); + ASSERT_TRUE(true == flag); + + vector > tkvs(correctKvs.begin() + 10, correctKvs.begin() + 50); + ASSERT_TRUE(kvs == tkvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + } + + // scan "key10" to "key40" by limit constraint + { + kvs.clear(); + flag = dbcp.scan(db1, correctKvs[10].first.c_str(), "w", "30", kvs); + ASSERT_TRUE(true == flag); + vector > tkvs(correctKvs.begin() + 10, correctKvs.begin() + 40); + ASSERT_TRUE(kvs == tkvs); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + } + + + // get() + flag = dbcp.get(db1, "unknownKey", v); + ASSERT_TRUE(false == flag); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); + + flag = dbcp.get(db1, keys[0], v); + ASSERT_TRUE(true == flag); + ASSERT_TRUE(v == values[0]); + printf("\033[01;40;32mTEST CASE %d passed\033[01;40;37m\n", ++testcase); +} + + + +static void cleanupDir(std::string dir) { + // remove old data, if any + char* cleanup = new char[100]; + snprintf(cleanup, 100, "rm -rf %s", dir.c_str()); + system(cleanup); +} + +int main(int argc, char **argv) { + // create a server + startServer(argc, argv); + printf("Server thread created.\n"); + + // give some time to the server to initialize itself + while (server_options.getPort() == 0) { + sleep(1); + } + + cleanupDir(server_options.getDataDirectory(db1)); + + DBClientProxy dbcp("localhost", server_options.getPort()); + testDBClientProxy(dbcp); +} + diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc new file mode 100644 index 00000000..b34b7fa8 --- /dev/null +++ b/tools/sst_dump.cc @@ -0,0 +1,359 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/meta_blocks.h" +#include "table/format.h" +#include "util/ldb_cmd.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class SstFileReader { + public: + explicit SstFileReader(const std::string& file_name, + bool verify_checksum, + bool output_hex); + + Status ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key); + + Status ReadTableProperties( + std::shared_ptr* table_properties); + uint64_t GetReadNumber() { return read_num_; } + + private: + Status NewTableReader(const std::string& file_path); + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number, + RandomAccessFile* file, + uint64_t file_size); + + std::string file_name_; + uint64_t read_num_; + bool verify_checksum_; + bool output_hex_; + EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + InternalKeyComparator internal_comparator_; +}; + +SstFileReader::SstFileReader(const std::string& file_path, + bool verify_checksum, + bool output_hex) + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + + init_result_ = 
NewTableReader(file_name_); +} + +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; + +Status SstFileReader::NewTableReader(const std::string& file_path) { + uint64_t magic_number; + Status s = + ReadTableMagicNumber(file_path, options_, soptions_, &magic_number); + if (!s.ok()) { + return s; + } + if (magic_number == kPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + } + options_.comparator = &internal_comparator_; + + s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + if (!s.ok()) { + return s; + } + uint64_t file_size; + options_.env->GetFileSize(file_path, &file_size); + s = SetTableOptionsByMagicNumber(magic_number, file_.get(), file_size); + if (!s.ok()) { + return s; + } + + s = options_.table_factory->NewTableReader( + options_, soptions_, internal_comparator_, std::move(file_), file_size, + &table_reader_); + return s; +} + +Status SstFileReader::SetTableOptionsByMagicNumber(uint64_t table_magic_number, + RandomAccessFile* file, + uint64_t file_size) { + TableProperties* table_properties; + Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, + options_.env, options_.info_log.get(), + &table_properties); + std::unique_ptr props_guard(table_properties); + if (!s.ok()) { + return s; + } + + if (table_magic_number == kBlockBasedTableMagicNumber) { + options_.table_factory = std::make_shared(); + fprintf(stdout, "Sst file format: block-based\n"); + } else if (table_magic_number == kPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + options_.table_factory = std::make_shared( + table_properties->fixed_key_len, 2, 0.8); + options_.prefix_extractor.reset(NewNoopTransform()); + fprintf(stdout, "Sst file format: plain table\n"); + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %lx", + (long)table_magic_number); + return Status::InvalidArgument(error_msg_buffer); + } + + return 
Status::OK(); +} + +Status SstFileReader::ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key) { + if (!table_reader_) { + return init_result_; + } + + Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, + false)); + uint64_t i = 0; + if (has_from) { + InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) + break; + + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + std::cerr << "Internal Key [" + << key.ToString(true /* in hex*/) + << "] parse error!\n"; + continue; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } + } + + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} + +Status SstFileReader::ReadTableProperties( + std::shared_ptr* table_properties) { + if (!table_reader_) { + return init_result_; + } + + *table_properties = table_reader_->GetTableProperties(); + return init_result_; +} + +} // namespace rocksdb + +static void print_help() { + fprintf(stderr, + "sst_dump [--command=check|scan] [--verify_checksum] " + "--file=data_dir_OR_sst_file" + " [--output_hex]" + " [--input_key_hex]" + " [--from=]" + " [--to=]" + " [--read_num=NUM]" + " [--show_properties]\n"); +} + +string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. 
Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; +} + +int main(int argc, char** argv) { + const char* dir_or_file = nullptr; + uint64_t read_num = -1; + std::string command; + + char junk; + uint64_t n; + bool verify_checksum = false; + bool output_hex = false; + bool input_key_hex = false; + bool has_from = false; + bool has_to = false; + bool show_properties = false; + std::string from_key; + std::string to_key; + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "--file=", 7) == 0) { + dir_or_file = argv[i] + 7; + } else if (strcmp(argv[i], "--output_hex") == 0) { + output_hex = true; + } else if (strcmp(argv[i], "--input_key_hex") == 0) { + input_key_hex = true; + } else if (sscanf(argv[i], + "--read_num=%lu%c", + (unsigned long*)&n, &junk) == 1) { + read_num = n; + } else if (strcmp(argv[i], "--verify_checksum") == 0) { + verify_checksum = true; + } else if (strncmp(argv[i], "--command=", 10) == 0) { + command = argv[i] + 10; + } else if (strncmp(argv[i], "--from=", 7) == 0) { + from_key = argv[i] + 7; + has_from = true; + } else if (strncmp(argv[i], "--to=", 5) == 0) { + to_key = argv[i] + 5; + has_to = true; + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else { + print_help(); + exit(1); + } + } + + + if (input_key_hex) { + if (has_from) { + from_key = HexToString(from_key); + } + if (has_to) { + to_key = HexToString(to_key); + } + } + + if (dir_or_file == nullptr) { + print_help(); + exit(1); + } + + std::vector filenames; + rocksdb::Env* env = rocksdb::Env::Default(); + rocksdb::Status st = env->GetChildren(dir_or_file, &filenames); + bool dir = true; + if (!st.ok()) { + filenames.clear(); + filenames.push_back(dir_or_file); + dir = false; + } + + fprintf(stdout, "from [%s] to [%s]\n", + rocksdb::Slice(from_key).ToString(true).c_str(), + 
rocksdb::Slice(to_key).ToString(true).c_str()); + + uint64_t total_read = 0; + for (size_t i = 0; i < filenames.size(); i++) { + std::string filename = filenames.at(i); + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + // ignore + continue; + } + if (dir) { + filename = std::string(dir_or_file) + "/" + filename; + } + rocksdb::SstFileReader reader(filename, verify_checksum, + output_hex); + rocksdb::Status st; + // scan all files in give file path. + if (command == "" || command == "scan" || command == "check") { + st = reader.ReadSequential(command != "check", + read_num > 0 ? (read_num - total_read) : + read_num, + has_from, from_key, has_to, to_key); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + st.ToString().c_str()); + } + total_read += reader.GetReadNumber(); + if (read_num > 0 && total_read > read_num) { + break; + } + } + if (show_properties) { + std::shared_ptr table_properties; + st = reader.ReadTableProperties(&table_properties); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + } else { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", + table_properties->ToString("\n ", ": ").c_str()); + fprintf(stdout, "# deleted keys: %zd\n", + rocksdb::GetDeletedKeys( + table_properties->user_collected_properties)); + } + } + } +} diff --git a/util/arena.cc b/util/arena.cc new file mode 100644 index 00000000..9b2cb82d --- /dev/null +++ b/util/arena.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "util/arena.h" +#include + +namespace rocksdb { + +const size_t Arena::kMinBlockSize = 4096; +const size_t Arena::kMaxBlockSize = 2 << 30; +static const int kAlignUnit = sizeof(void*); + +size_t OptimizeBlockSize(size_t block_size) { + // Make sure block_size is in optimal range + block_size = std::max(Arena::kMinBlockSize, block_size); + block_size = std::min(Arena::kMaxBlockSize, block_size); + + // make sure block_size is the multiple of kAlignUnit + if (block_size % kAlignUnit != 0) { + block_size = (1 + block_size / kAlignUnit) * kAlignUnit; + } + + return block_size; +} + +Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) { + assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && + kBlockSize % kAlignUnit == 0); +} + +Arena::~Arena() { + for (const auto& block : blocks_) { + delete[] block; + } +} + +char* Arena::AllocateFallback(size_t bytes, bool aligned) { + if (bytes > kBlockSize / 4) { + ++irregular_block_num; + // Object is more than a quarter of our block size. Allocate it separately + // to avoid wasting too much space in leftover bytes. + return AllocateNewBlock(bytes); + } + + // We waste the remaining space in the current block. + auto block_head = AllocateNewBlock(kBlockSize); + alloc_bytes_remaining_ = kBlockSize - bytes; + + if (aligned) { + aligned_alloc_ptr_ = block_head + bytes; + unaligned_alloc_ptr_ = block_head + kBlockSize; + return block_head; + } else { + aligned_alloc_ptr_ = block_head; + unaligned_alloc_ptr_ = block_head + kBlockSize - bytes; + return unaligned_alloc_ptr_; + } +} + +char* Arena::AllocateAligned(size_t bytes) { + assert((kAlignUnit & (kAlignUnit - 1)) == + 0); // Pointer size should be a power of 2 + size_t current_mod = + reinterpret_cast(aligned_alloc_ptr_) & (kAlignUnit - 1); + size_t slop = (current_mod == 0 ? 
0 : kAlignUnit - current_mod); + size_t needed = bytes + slop; + char* result; + if (needed <= alloc_bytes_remaining_) { + result = aligned_alloc_ptr_ + slop; + aligned_alloc_ptr_ += needed; + alloc_bytes_remaining_ -= needed; + } else { + // AllocateFallback always returned aligned memory + result = AllocateFallback(bytes, true /* aligned */); + } + assert((reinterpret_cast(result) & (kAlignUnit - 1)) == 0); + return result; +} + +char* Arena::AllocateNewBlock(size_t block_bytes) { + char* block = new char[block_bytes]; + blocks_memory_ += block_bytes; + blocks_.push_back(block); + return block; +} + +} // namespace rocksdb diff --git a/util/arena.h b/util/arena.h new file mode 100644 index 00000000..6ce5a438 --- /dev/null +++ b/util/arena.h @@ -0,0 +1,100 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Arena is an implementation of Arena class. For a request of small size, +// it allocates a block with pre-defined block size. For a request of big +// size, it uses malloc to directly get the requested size. 

#pragma once
// NOTE(review): the dump stripped the bracketed include names and showed a
// fifth, quoted include that looked like a self-include of "util/arena.h" —
// TODO confirm the exact original list against upstream. The headers below
// cover every name this header uses (assert, uint/size types, std::vector).
#include <assert.h>
#include <stdint.h>
#include <cstddef>
#include <vector>

namespace rocksdb {

// Memory arena: small requests are carved out of pre-allocated blocks;
// requests larger than a quarter block get their own "irregular" block.
class Arena {
 public:
  // No copying allowed
  Arena(const Arena&) = delete;
  void operator=(const Arena&) = delete;

  static const size_t kMinBlockSize;
  static const size_t kMaxBlockSize;

  explicit Arena(size_t block_size = kMinBlockSize);
  ~Arena();

  char* Allocate(size_t bytes);

  char* AllocateAligned(size_t bytes);

  // Returns an estimate of the total memory usage of data allocated
  // by the arena (exclude the space allocated but not yet used for future
  // allocations).
  size_t ApproximateMemoryUsage() const {
    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
           alloc_bytes_remaining_;
  }

  size_t MemoryAllocatedBytes() const { return blocks_memory_; }

  size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; }

  // If an allocation is too big, we'll allocate an irregular block with the
  // same size of that allocation.
  // NOTE(review): this member is virtual while ~Arena is not — do not
  // delete derived objects through an Arena*.
  virtual size_t IrregularBlockNum() const { return irregular_block_num; }

  size_t BlockSize() const { return kBlockSize; }

 private:
  // Number of bytes allocated in one block
  const size_t kBlockSize;
  // Array of new[] allocated memory blocks
  // Template argument restored; the dump read "typedef std::vector Blocks;".
  typedef std::vector<char*> Blocks;
  Blocks blocks_;
  size_t irregular_block_num = 0;

  // Stats for current active block.
  // For each block, we allocate aligned memory chunks from one end and
  // allocate unaligned memory chunks from the other end. Otherwise the
  // memory waste for alignment will be higher if we allocate both types of
  // memory from one direction.  (typo "chucks" fixed)
  char* unaligned_alloc_ptr_ = nullptr;
  char* aligned_alloc_ptr_ = nullptr;
  // How many bytes left in currently active block?
  size_t alloc_bytes_remaining_ = 0;

  char* AllocateFallback(size_t bytes, bool aligned);
  char* AllocateNewBlock(size_t block_bytes);

  // Bytes of memory in blocks allocated so far
  size_t blocks_memory_ = 0;
};

inline char* Arena::Allocate(size_t bytes) {
  // The semantics of what to return are a bit messy if we allow
  // 0-byte allocations, so we disallow them here (we don't need
  // them for our internal use).
  assert(bytes > 0);
  if (bytes <= alloc_bytes_remaining_) {
    unaligned_alloc_ptr_ -= bytes;
    alloc_bytes_remaining_ -= bytes;
    return unaligned_alloc_ptr_;
  }
  return AllocateFallback(bytes, false /* unaligned */);
}

// check and adjust the block_size so that the return value is
//  1. in the range of [kMinBlockSize, kMaxBlockSize].
//  2. the multiple of align unit.
extern size_t OptimizeBlockSize(size_t block_size);

}  // namespace rocksdb
// --- dump residue (diff metadata preserved as comments) ---
// diff --git a/util/arena_test.cc b/util/arena_test.cc
// new file mode 100644  index 00000000..1b2b5317
// --- /dev/null  +++ b/util/arena_test.cc  @@ -0,0 +1,133 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "util/arena.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +class ArenaTest {}; + +TEST(ArenaTest, Empty) { Arena arena0; } + +TEST(ArenaTest, MemoryAllocatedBytes) { + const int N = 17; + size_t req_sz; // requested size + size_t bsz = 8192; // block size + size_t expected_memory_allocated; + + Arena arena(bsz); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 3001; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated = req_sz * N; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + + // requested size < quarter of a block: + // allocate a block with the default size, then try to use unused part + // of the block. So one new block will be allocated for the first + // Allocate(99) call. All the remaining calls won't lead to new allocation. + req_sz = 99; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated += bsz; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + + // requested size > quarter of a block: + // allocate requested size separately + req_sz = 99999999; + for (int i = 0; i < N; i++) { + arena.Allocate(req_sz); + } + expected_memory_allocated += req_sz * N; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); +} + +// Make sure we didn't count the allocate but not used memory space in +// Arena::ApproximateMemoryUsage() +TEST(ArenaTest, ApproximateMemoryUsageTest) { + const size_t kBlockSize = 4096; + const size_t kEntrySize = kBlockSize / 8; + const size_t kZero = 0; + Arena arena(kBlockSize); + ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); + + auto num_blocks = kBlockSize / kEntrySize; + + // first allocation + arena.AllocateAligned(kEntrySize); + auto mem_usage = arena.MemoryAllocatedBytes(); + ASSERT_EQ(mem_usage, kBlockSize); + auto usage = arena.ApproximateMemoryUsage(); + ASSERT_LT(usage, mem_usage); + for (size_t i = 
1; i < num_blocks; ++i) { + arena.AllocateAligned(kEntrySize); + ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); + ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + usage = arena.ApproximateMemoryUsage(); + } + ASSERT_GT(usage, mem_usage); +} + +TEST(ArenaTest, Simple) { + std::vector> allocated; + Arena arena; + const int N = 100000; + size_t bytes = 0; + Random rnd(301); + for (int i = 0; i < N; i++) { + size_t s; + if (i % (N / 10) == 0) { + s = i; + } else { + s = rnd.OneIn(4000) + ? rnd.Uniform(6000) + : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20)); + } + if (s == 0) { + // Our arena disallows size 0 allocations. + s = 1; + } + char* r; + if (rnd.OneIn(10)) { + r = arena.AllocateAligned(s); + } else { + r = arena.Allocate(s); + } + + for (unsigned int b = 0; b < s; b++) { + // Fill the "i"th allocation with a known bit pattern + r[b] = i % 256; + } + bytes += s; + allocated.push_back(std::make_pair(s, r)); + ASSERT_GE(arena.ApproximateMemoryUsage(), bytes); + if (i > N / 10) { + ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10); + } + } + for (unsigned int i = 0; i < allocated.size(); i++) { + size_t num_bytes = allocated[i].first; + const char* p = allocated[i].second; + for (unsigned int b = 0; b < num_bytes; b++) { + // Check the "i"th allocation for the known bit pattern + ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256)); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc new file mode 100644 index 00000000..06740434 --- /dev/null +++ b/util/auto_roll_logger.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "util/auto_roll_logger.h" +#include "util/mutexlock.h" + +using namespace std; + +namespace rocksdb { + +// -- AutoRollLogger +Status AutoRollLogger::ResetLogger() { + status_ = env_->NewLogger(log_fname_, &logger_); + + if (!status_.ok()) { + return status_; + } + + if (logger_->GetLogFileSize() == + (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) { + status_ = Status::NotSupported( + "The underlying logger doesn't support GetLogFileSize()"); + } + if (status_.ok()) { + cached_now = static_cast(env_->NowMicros() * 1e-6); + ctime_ = cached_now; + cached_now_access_count = 0; + } + + return status_; +} + +void AutoRollLogger::RollLogFile() { + std::string old_fname = OldInfoLogFileName( + dbname_, env_->NowMicros(), db_absolute_path_, db_log_dir_); + env_->RenameFile(log_fname_, old_fname); +} + +void AutoRollLogger::Logv(const char* format, va_list ap) { + assert(GetStatus().ok()); + + std::shared_ptr logger; + { + MutexLock l(&mutex_); + if ((kLogFileTimeToRoll > 0 && LogExpired()) || + (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { + RollLogFile(); + Status s = ResetLogger(); + if (!s.ok()) { + // can't really log the error if creating a new LOG file failed + return; + } + } + + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + + // Another thread could have put a new Logger instance into logger_ by now. + // However, since logger is still hanging on to the previous instance + // (reference count is not zero), we don't have to worry about it being + // deleted while we are accessing it. + // Note that logv itself is not mutex protected to allow maximum concurrency, + // as thread safety should have been handled by the underlying logger. 
+ logger->Logv(format, ap); +} + +bool AutoRollLogger::LogExpired() { + if (cached_now_access_count >= call_NowMicros_every_N_records_) { + cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now_access_count = 0; + } + + ++cached_now_access_count; + return cached_now >= ctime_ + kLogFileTimeToRoll; +} + +Status CreateLoggerFromOptions( + const std::string& dbname, + const std::string& db_log_dir, + Env* env, + const Options& options, + std::shared_ptr* logger) { + std::string db_absolute_path; + env->GetAbsolutePath(dbname, &db_absolute_path); + std::string fname = InfoLogFileName(dbname, db_absolute_path, db_log_dir); + + // Currently we only support roll by time-to-roll and log size + if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env, dbname, db_log_dir, + options.max_log_file_size, + options.log_file_time_to_roll, options.info_log_level); + Status s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + logger->reset(result); + } + return s; + } else { + // Open a log file in the same directory as the db + env->CreateDir(dbname); // In case it does not exist + env->RenameFile(fname, OldInfoLogFileName(dbname, env->NowMicros(), + db_absolute_path, db_log_dir)); + auto s = env->NewLogger(fname, logger); + if (logger->get() != nullptr) { + (*logger)->SetInfoLogLevel(options.info_log_level); + } + return s; + } +} + +} // namespace rocksdb diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h new file mode 100644 index 00000000..5fd7a147 --- /dev/null +++ b/util/auto_roll_logger.h @@ -0,0 +1,91 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#pragma once +#include "db/filename.h" +#include "port/port.h" +#include "util/posix_logger.h" + +namespace rocksdb { + +// Rolls the log file by size and/or time +class AutoRollLogger : public Logger { + public: + AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, size_t log_max_size, + size_t log_file_time_to_roll, + const InfoLogLevel log_level = InfoLogLevel::INFO) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + cached_now(static_cast(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + env->GetAbsolutePath(dbname, &db_absolute_path_); + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + RollLogFile(); + ResetLogger(); + } + + void Logv(const char* format, va_list ap); + + // check if the logger has encountered any problem. + Status GetStatus() { + return status_; + } + + size_t GetLogFileSize() const { + return logger_->GetLogFileSize(); + } + + virtual ~AutoRollLogger() { + } + + void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) { + call_NowMicros_every_N_records_ = call_NowMicros_every_N_records; + } + + private: + + bool LogExpired(); + Status ResetLogger(); + void RollLogFile(); + + std::string log_fname_; // Current active info log's file name. 
+ std::string dbname_; + std::string db_log_dir_; + std::string db_absolute_path_; + Env* env_; + std::shared_ptr logger_; + // current status of the logger + Status status_; + const size_t kMaxLogFileSize; + const size_t kLogFileTimeToRoll; + // to avoid frequent env->NowMicros() calls, we cached the current time + uint64_t cached_now; + uint64_t ctime_; + uint64_t cached_now_access_count; + uint64_t call_NowMicros_every_N_records_; + port::Mutex mutex_; +}; + +// Facade to craete logger automatically +Status CreateLoggerFromOptions( + const std::string& dbname, + const std::string& db_log_dir, + Env* env, + const Options& options, + std::shared_ptr* logger); + +} // namespace rocksdb diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc new file mode 100755 index 00000000..7e30a277 --- /dev/null +++ b/util/auto_roll_logger_test.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include +#include +#include +#include +#include "util/testharness.h" +#include "util/auto_roll_logger.h" +#include "rocksdb/db.h" +#include +#include + +using namespace std; + +namespace rocksdb { + +class AutoRollLoggerTest { + public: + static void InitTestDb() { + string deleteCmd = "rm -rf " + kTestDir; + ASSERT_TRUE(system(deleteCmd.c_str()) == 0); + Env::Default()->CreateDir(kTestDir); + } + + void RollLogFileBySizeTest(AutoRollLogger* logger, + size_t log_max_size, + const string& log_message); + uint64_t RollLogFileByTimeTest(AutoRollLogger* logger, + size_t time, + const string& log_message); + + static const string kSampleMessage; + static const string kTestDir; + static const string kLogFile; + static Env* env; +}; + +const string AutoRollLoggerTest::kSampleMessage( + "this is the message to be written to the log file!!"); +const string AutoRollLoggerTest::kTestDir(test::TmpDir() + "/db_log_test"); +const string AutoRollLoggerTest::kLogFile(test::TmpDir() + "/db_log_test/LOG"); +Env* AutoRollLoggerTest::env = Env::Default(); + +// In this test we only want to Log some simple log message with +// no format. LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call Log(logger, log_message) directly. 
+void LogMessage(Logger* logger, const char* message) { + Log(logger, "%s", message); +} + +void LogMessage(const InfoLogLevel log_level, Logger* logger, + const char* message) { + Log(log_level, logger, "%s", message); +} + +void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) { + struct stat s; + if (stat(fname.c_str(), &s) != 0) { + *file_ctime = (uint64_t)0; + } + *file_ctime = static_cast(s.st_ctime); +} + +void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, + size_t log_max_size, + const string& log_message) { + logger->SetInfoLogLevel(InfoLogLevel::INFO); + // measure the size of each message, which is supposed + // to be equal or greater than log_message.size() + LogMessage(logger, log_message.c_str()); + size_t message_size = logger->GetLogFileSize(); + size_t current_log_size = message_size; + + // Test the cases when the log file will not be rolled. + while (current_log_size + message_size < log_max_size) { + LogMessage(logger, log_message.c_str()); + current_log_size += message_size; + ASSERT_EQ(current_log_size, logger->GetLogFileSize()); + } + + // Now the log file will be rolled + LogMessage(logger, log_message.c_str()); + // Since rotation is checked before actual logging, we need to + // trigger the rotation by logging another message. + LogMessage(logger, log_message.c_str()); + + ASSERT_TRUE(message_size == logger->GetLogFileSize()); +} + +uint64_t AutoRollLoggerTest::RollLogFileByTimeTest( + AutoRollLogger* logger, size_t time, const string& log_message) { + uint64_t expected_create_time; + uint64_t actual_create_time; + uint64_t total_log_size; + ASSERT_OK(env->GetFileSize(kLogFile, &total_log_size)); + GetFileCreateTime(kLogFile, &expected_create_time); + logger->SetCallNowMicrosEveryNRecords(0); + + // -- Write to the log for several times, which is supposed + // to be finished before time. 
+ for (int i = 0; i < 10; ++i) { + LogMessage(logger, log_message.c_str()); + ASSERT_OK(logger->GetStatus()); + // Make sure we always write to the same log file (by + // checking the create time); + GetFileCreateTime(kLogFile, &actual_create_time); + + // Also make sure the log size is increasing. + ASSERT_EQ(expected_create_time, actual_create_time); + ASSERT_GT(logger->GetLogFileSize(), total_log_size); + total_log_size = logger->GetLogFileSize(); + } + + // -- Make the log file expire + sleep(time); + LogMessage(logger, log_message.c_str()); + + // At this time, the new log file should be created. + GetFileCreateTime(kLogFile, &actual_create_time); + ASSERT_GT(actual_create_time, expected_create_time); + ASSERT_LT(logger->GetLogFileSize(), total_log_size); + expected_create_time = actual_create_time; + + return expected_create_time; +} + +TEST(AutoRollLoggerTest, RollLogFileBySize) { + InitTestDb(); + size_t log_max_size = 1024 * 5; + + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + + RollLogFileBySizeTest(&logger, log_max_size, + kSampleMessage + ":RollLogFileBySize"); +} + +TEST(AutoRollLoggerTest, RollLogFileByTime) { + size_t time = 1; + size_t log_size = 1024 * 5; + + InitTestDb(); + // -- Test the existence of file during the server restart. + ASSERT_TRUE(!env->FileExists(kLogFile)); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 1); + ASSERT_TRUE(env->FileExists(kLogFile)); + + RollLogFileByTimeTest(&logger, time, kSampleMessage + ":RollLogFileByTime"); +} + +TEST(AutoRollLoggerTest, + OpenLogFilesMultipleTimesWithOptionLog_max_size) { + // If only 'log_max_size' options is specified, then every time + // when rocksdb is restarted, a new empty log file will be created. + InitTestDb(); + // WORKAROUND: + // avoid complier's complaint of "comparison between signed + // and unsigned integer expressions" because literal 0 is + // treated as "singed". 
+ size_t kZero = 0; + size_t log_size = 1024; + + AutoRollLogger* logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + + LogMessage(logger, kSampleMessage.c_str()); + ASSERT_GT(logger->GetLogFileSize(), kZero); + delete logger; + + // reopens the log file and an empty log file will be created. + logger = new AutoRollLogger( + Env::Default(), kTestDir, "", log_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), kZero); + delete logger; +} + +TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { + size_t time = 1, log_max_size = 1024 * 5; + + InitTestDb(); + + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, time); + + // Test the ability to roll by size + RollLogFileBySizeTest( + &logger, log_max_size, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); + + // Test the ability to roll by Time + RollLogFileByTimeTest( &logger, time, + kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); +} + +TEST(AutoRollLoggerTest, CreateLoggerFromOptions) { + Options options; + shared_ptr logger; + + // Normal logger + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + ASSERT_TRUE(dynamic_cast(logger.get())); + + // Only roll by size + InitTestDb(); + options.max_log_file_size = 1024; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast(logger.get()); + ASSERT_TRUE(auto_roll_logger); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - size"); + + // Only roll by Time + InitTestDb(); + options.max_log_file_size = 0; + options.log_file_time_to_roll = 1; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + auto_roll_logger = + dynamic_cast(logger.get()); + RollLogFileByTimeTest( + auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - time"); + + // roll by both Time and size + InitTestDb(); + 
options.max_log_file_size = 1024 * 5; + options.log_file_time_to_roll = 1; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, "", env, options, &logger)); + auto_roll_logger = + dynamic_cast(logger.get()); + RollLogFileBySizeTest( + auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":CreateLoggerFromOptions - both"); + RollLogFileByTimeTest( + auto_roll_logger, options.log_file_time_to_roll, + kSampleMessage + ":CreateLoggerFromOptions - both"); +} + +TEST(AutoRollLoggerTest, InfoLogLevel) { + InitTestDb(); + + size_t log_size = 8192; + size_t log_lines = 0; + // an extra-scope to force the AutoRollLogger to flush the log file when it + // becomes out of scope. + { + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + for (int log_level = InfoLogLevel::FATAL; log_level >= InfoLogLevel::DEBUG; + log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + for (int log_type = InfoLogLevel::DEBUG; log_type <= InfoLogLevel::FATAL; + log_type++) { + // log messages with log level smaller than log_level will not be + // logged. + LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str()); + } + log_lines += InfoLogLevel::FATAL - log_level + 1; + } + for (int log_level = InfoLogLevel::FATAL; log_level >= InfoLogLevel::DEBUG; + log_level--) { + logger.SetInfoLogLevel((InfoLogLevel)log_level); + + // again, messages with level smaller than log_level will not be logged. 
+ Debug(&logger, "%s", kSampleMessage.c_str()); + Info(&logger, "%s", kSampleMessage.c_str()); + Warn(&logger, "%s", kSampleMessage.c_str()); + Error(&logger, "%s", kSampleMessage.c_str()); + Fatal(&logger, "%s", kSampleMessage.c_str()); + log_lines += InfoLogLevel::FATAL - log_level + 1; + } + } + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + size_t lines = std::count(std::istreambuf_iterator(inFile), + std::istreambuf_iterator(), '\n'); + ASSERT_EQ(log_lines, lines); + inFile.close(); +} + +int OldLogFileCount(const string& dir) { + std::vector files; + Env::Default()->GetChildren(dir, &files); + int log_file_count = 0; + + for (std::vector::iterator it = files.begin(); + it != files.end(); ++it) { + uint64_t create_time; + FileType type; + if (!ParseFileName(*it, &create_time, &type)) { + continue; + } + if (type == kInfoLogFile && create_time > 0) { + ++log_file_count; + } + } + + return log_file_count; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/autovector.h b/util/autovector.h new file mode 100644 index 00000000..812a6179 --- /dev/null +++ b/util/autovector.h @@ -0,0 +1,303 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include + +namespace rocksdb { + +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. 
+// +// Currently we don't support: +// * reserve()/shrink_to_fit()/resize() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. + typedef T value_type; + typedef typename std::vector::difference_type difference_type; + typedef typename std::vector::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect), index_(index) {}; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() {} + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // iterator++ + self_type& operator++() { + ++index_; + return *this; + } + + // ++iterator + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // iterator-- + self_type& operator--() { + --index_; + return *this; + } + + // --iterator + self_type operator--(int) { + auto 
old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { return !(*this == other); } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl iterator; + typedef iterator_impl const_iterator; + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; + + autovector() = default; + ~autovector() = default; + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. 
+ return vect_.capacity() == 0; + } + + size_type size() const { return num_stack_items_ + vect_.size(); } + + bool empty() const { return size() == 0; } + + // will not check boundry + const_reference operator[](size_type n) const { + return n < kSize ? values_[n] : vect_[n - kSize]; + } + + reference operator[](size_type n) { + return n < kSize ? values_[n] : vect_[n - kSize]; + } + + // will check boundry + const_reference at(size_type n) const { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference at(size_type n) { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { push_back(value_type(item)); } + + template + void emplace_back(Args&&... args) { + push_back(value_type(args...)); + } + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + --num_stack_items_; + } + } + + void clear() { + num_stack_items_ = 0; + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { assign(other); } + + autovector& operator=(const autovector& other) { return assign(other); } + + // move operation are disallowed since it is very hard to make sure both + // autovectors are allocated from the same function stack. 
+ autovector& operator=(autovector&& other) = delete; + autovector(autovector&& other) = delete; + + // -- Iterator Operations + iterator begin() { return iterator(this, 0); } + + const_iterator begin() const { return const_iterator(this, 0); } + + iterator end() { return iterator(this, this->size()); } + + const_iterator end() const { return const_iterator(this, this->size()); } + + reverse_iterator rbegin() { return reverse_iterator(end()); } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { return reverse_iterator(begin()); } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items + // used only if there are more than `kSize` items. + std::vector vect_; +}; + +template +autovector& autovector::assign(const autovector& other) { + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} + +} // rocksdb diff --git a/util/autovector_test.cc b/util/autovector_test.cc new file mode 100644 index 00000000..eb244aab --- /dev/null +++ b/util/autovector_test.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +using namespace std; + +class AutoVectorTest { }; + +const unsigned long kSize = 8; +TEST(AutoVectorTest, PushBackAndPopBack) { + autovector vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + ASSERT_TRUE(!vec.only_in_stack()); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST(AutoVectorTest, EmplaceBack) { + typedef std::pair ValueType; + autovector vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, std::to_string(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(std::to_string(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + ASSERT_TRUE(!vec.only_in_stack()); +} + +void AssertEqual( + const autovector& a, const autovector& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} + +TEST(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : { kSize / 2, kSize * 1000 }) { + autovector vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector other(vec); + AssertEqual(other, vec); + } + } +} + +TEST(AutoVectorTest, Iterators) { + autovector vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(std::to_string(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); + } +} + +vector GetTestKeys(size_t size) { + vector keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + to_string(index++); + } + return keys; +} + +template +void BenchmarkVectorCreationAndInsertion( + string name, size_t ops, size_t item_size, + const std::vector& items) { + auto env 
= Env::Default(); + + int index = 0; + auto start_time = env->NowNanos(); + auto ops_remaining = ops; + while(ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template +size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; +} + +// This test case only reports the performance between std::vector +// and autovector. We chose string for comparison because in most +// o our use cases we used std::vector. +TEST(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. + size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialize. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized. 
+ cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, string_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, string_keys + ); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + vector int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, int_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, int_keys + ); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + BenchmarkSequenceAccess>( + "vector", kOps, elem_size + ); + BenchmarkSequenceAccess>( + "autovector", kOps, elem_size + ); + cout << "-----------------------------------" << endl; + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/bit_set.h b/util/bit_set.h new file mode 100644 index 00000000..01727060 --- /dev/null +++ b/util/bit_set.h @@ -0,0 
+1,71 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include + +namespace rocksdb { + +class BitSet { + public: + /** + * Create a bit set of numBits, with the bits set to either true or false. + */ + explicit BitSet(size_t numBits, bool initial=false) + : numBits_(numBits), + data_(numWords(), initial ? ~0UL : 0UL) { + } + + /** + * Set bit b to 1. + */ + void set(size_t b) { + assert(b >= 0 && b < numBits_); + data_[word(b)] |= wordOffsetMask(b); + } + + /** + * Set bit b to 0; + */ + void reset(size_t b) { + assert(b >= 0 && b < numBits_); + data_[word(b)] &= ~wordOffsetMask(b); + } + + /** + * Get a bit. + */ + bool test(int b) const { + return data_[word(b)] & wordOffsetMask(b); + } + + /** + * Return the size of the BitSet, in bits. + */ + size_t size() const { + return numBits_; + } + + private: + + inline size_t numWords() const { + if (numBits_ == 0) return 0; + return 1 + (numBits_-1) / (8*sizeof(unsigned long)); + } + inline static size_t word(int b) { + return b / (8*sizeof(unsigned long)); + } + inline static int wordOffset(int b) { + return b % (8*sizeof(unsigned long)); + } + inline static unsigned long wordOffsetMask(int b) { + return 1UL << wordOffset(b); + } + + size_t numBits_; + std::vector data_; +}; + +} // namespace facebook diff --git a/util/blob_store.cc b/util/blob_store.cc new file mode 100644 index 00000000..76230679 --- /dev/null +++ b/util/blob_store.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/blob_store.h" + +namespace rocksdb { + +using namespace std; + +// BlobChunk +bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const { + // overlapping!? + assert(!Overlap(chunk)); + // size == 0 is a marker, not a block + return size != 0 && + bucket_id == chunk.bucket_id && + offset + size == chunk.offset; +} + +bool BlobChunk::Overlap(const BlobChunk &chunk) const { + return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id && + ((offset >= chunk.offset && offset < chunk.offset + chunk.size) || + (chunk.offset >= offset && chunk.offset < offset + size)); +} + +// Blob +string Blob::ToString() const { + string ret; + for (auto chunk : chunks) { + PutFixed32(&ret, chunk.bucket_id); + PutFixed32(&ret, chunk.offset); + PutFixed32(&ret, chunk.size); + } + return ret; +} + +Blob::Blob(const std::string& blob) { + for (uint32_t i = 0; i < blob.size(); ) { + uint32_t t[3] = {0}; + for (int j = 0; j < 3 && i + sizeof(uint32_t) - 1 < blob.size(); + ++j, i += sizeof(uint32_t)) { + t[j] = DecodeFixed32(blob.data() + i); + } + chunks.push_back(BlobChunk(t[0], t[1], t[2])); + } +} + +// FreeList +Status FreeList::Free(const Blob& blob) { + // add it back to the free list + for (auto chunk : blob.chunks) { + free_blocks_ += chunk.size; + if (fifo_free_chunks_.size() && + fifo_free_chunks_.back().ImmediatelyBefore(chunk)) { + fifo_free_chunks_.back().size += chunk.size; + } else { + fifo_free_chunks_.push_back(chunk); + } + } + + return Status::OK(); +} + +Status FreeList::Allocate(uint32_t blocks, Blob* blob) { + if (free_blocks_ < blocks) { + return Status::Incomplete(""); + } + + blob->chunks.clear(); + free_blocks_ -= blocks; + + while (blocks > 0) { + assert(fifo_free_chunks_.size() > 0); + auto& front = fifo_free_chunks_.front(); + if (front.size > blocks) { + blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks)); + front.offset += blocks; + front.size -= blocks; + blocks = 0; + } else { + 
blob->chunks.push_back(front); + blocks -= front.size; + fifo_free_chunks_.pop_front(); + } + } + assert(blocks == 0); + + return Status::OK(); +} + +bool FreeList::Overlap(const Blob &blob) const { + for (auto chunk : blob.chunks) { + for (auto itr = fifo_free_chunks_.begin(); + itr != fifo_free_chunks_.end(); + ++itr) { + if (itr->Overlap(chunk)) { + return true; + } + } + } + return false; +} + +// BlobStore +BlobStore::BlobStore(const string& directory, + uint64_t block_size, + uint32_t blocks_per_bucket, + uint32_t max_buckets, + Env* env) : + directory_(directory), + block_size_(block_size), + blocks_per_bucket_(blocks_per_bucket), + env_(env), + max_buckets_(max_buckets) { + env_->CreateDirIfMissing(directory_); + + storage_options_.use_mmap_writes = false; + storage_options_.use_mmap_reads = false; + + buckets_size_ = 0; + buckets_ = new unique_ptr[max_buckets_]; + + CreateNewBucket(); +} + +BlobStore::~BlobStore() { + // TODO we don't care about recovery for now + delete [] buckets_; +} + +Status BlobStore::Put(const Slice& value, Blob* blob) { + // convert size to number of blocks + Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob); + if (!s.ok()) { + return s; + } + auto size_left = (uint64_t) value.size(); + + uint64_t offset = 0; // in bytes, not blocks + for (auto chunk : blob->chunks) { + uint64_t write_size = min(chunk.size * block_size_, size_left); + assert(chunk.bucket_id < buckets_size_); + s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_, + Slice(value.data() + offset, + write_size)); + if (!s.ok()) { + Delete(*blob); + return s; + } + offset += write_size; + size_left -= write_size; + if (write_size < chunk.size * block_size_) { + // if we have any space left in the block, fill it up with zeros + string zero_string(chunk.size * block_size_ - write_size, 0); + s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ + + write_size, + Slice(zero_string)); + } + } + + if (size_left > 0) { 
+ Delete(*blob); + return Status::Corruption("Tried to write more data than fits in the blob"); + } + + return Status::OK(); +} + +Status BlobStore::Get(const Blob& blob, + string* value) const { + { + // assert that it doesn't overlap with free list + // it will get compiled out for release + MutexLock l(&free_list_mutex_); + assert(!free_list_.Overlap(blob)); + } + + value->resize(blob.Size() * block_size_); + + uint64_t offset = 0; // in bytes, not blocks + for (auto chunk : blob.chunks) { + Slice result; + assert(chunk.bucket_id < buckets_size_); + Status s; + s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_, + chunk.size * block_size_, + &result, + &value->at(offset)); + if (!s.ok()) { + value->clear(); + return s; + } + if (result.size() < chunk.size * block_size_) { + value->clear(); + return Status::Corruption("Could not read in from file"); + } + offset += chunk.size * block_size_; + } + + // remove the '\0's at the end of the string + value->erase(find(value->begin(), value->end(), '\0'), value->end()); + + return Status::OK(); +} + +Status BlobStore::Delete(const Blob& blob) { + MutexLock l(&free_list_mutex_); + return free_list_.Free(blob); +} + +Status BlobStore::Sync() { + for (size_t i = 0; i < buckets_size_; ++i) { + Status s = buckets_[i].get()->Sync(); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +Status BlobStore::Allocate(uint32_t blocks, Blob* blob) { + MutexLock l(&free_list_mutex_); + Status s; + + s = free_list_.Allocate(blocks, blob); + if (!s.ok()) { + s = CreateNewBucket(); + if (!s.ok()) { + return s; + } + s = free_list_.Allocate(blocks, blob); + } + + return s; +} + +// called with free_list_mutex_ held +Status BlobStore::CreateNewBucket() { + MutexLock l(&buckets_mutex_); + + if (buckets_size_ >= max_buckets_) { + return Status::NotSupported("Max size exceeded\n"); + } + + int new_bucket_id = buckets_size_; + + char fname[200]; + sprintf(fname, "%s/%d.bs", directory_.c_str(), new_bucket_id); + + 
Status s = env_->NewRandomRWFile(string(fname), + &buckets_[new_bucket_id], + storage_options_); + if (!s.ok()) { + return s; + } + + // whether Allocate succeeds or not, does not affect the overall correctness + // of this function - calling Allocate is really optional + // (also, tmpfs does not support allocate) + buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_); + + buckets_size_ = new_bucket_id + 1; + + return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_)); +} + +} // namespace rocksdb diff --git a/util/blob_store.h b/util/blob_store.h new file mode 100644 index 00000000..0a81d01d --- /dev/null +++ b/util/blob_store.h @@ -0,0 +1,161 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/coding.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +struct BlobChunk { + uint32_t bucket_id; + uint32_t offset; // in blocks + uint32_t size; // in blocks + BlobChunk() {} + BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) : + bucket_id(bucket_id), offset(offset), size(size) {} + + // returns true if it's immediately before chunk + bool ImmediatelyBefore(const BlobChunk& chunk) const; + // returns true if chunks overlap + bool Overlap(const BlobChunk &chunk) const; +}; + +// We represent each Blob as a string in format: +// bucket_id offset size|bucket_id offset size... +// The string can be used to reference the Blob stored on external +// device/file +// Not thread-safe! 
+struct Blob { + // Generates the string + std::string ToString() const; + // Parses the previously generated string + explicit Blob(const std::string& blob); + // Creates unfragmented Blob + Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) { + SetOneChunk(bucket_id, offset, size); + } + Blob() {} + + void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) { + chunks.clear(); + chunks.push_back(BlobChunk(bucket_id, offset, size)); + } + + uint32_t Size() const { // in blocks + uint32_t ret = 0; + for (auto chunk : chunks) { + ret += chunk.size; + } + assert(ret > 0); + return ret; + } + + // bucket_id, offset, size + std::vector chunks; +}; + +// Keeps a list of free chunks +// NOT thread-safe. Externally synchronized +class FreeList { + public: + FreeList() : + free_blocks_(0) {} + ~FreeList() {} + + // Allocates a a blob. Stores the allocated blob in + // 'blob'. Returns non-OK status if it failed to allocate. + // Thread-safe + Status Allocate(uint32_t blocks, Blob* blob); + // Frees the blob for reuse. Thread-safe + Status Free(const Blob& blob); + + // returns true if blob is overlapping with any of the + // chunks stored in free list + bool Overlap(const Blob &blob) const; + + private: + std::deque fifo_free_chunks_; + uint32_t free_blocks_; + mutable port::Mutex mutex_; +}; + +// thread-safe +class BlobStore { + public: + // directory - wherever the blobs should be stored. It will be created + // if missing + // block_size - self explanatory + // blocks_per_bucket - how many blocks we want to keep in one bucket. + // Bucket is a device or a file that we use to store the blobs. + // If we don't have enough blocks to allocate a new blob, we will + // try to create a new file or device. 
+ // max_buckets - maximum number of buckets BlobStore will create + // BlobStore max size in bytes is + // max_buckets * blocks_per_bucket * block_size + // env - env for creating new files + BlobStore(const std::string& directory, + uint64_t block_size, + uint32_t blocks_per_bucket, + uint32_t max_buckets, + Env* env); + ~BlobStore(); + + // Allocates space for value.size bytes (rounded up to be multiple of + // block size) and writes value.size bytes from value.data to a backing store. + // Sets Blob blob that can than be used for addressing the + // stored value. Returns non-OK status on error. + Status Put(const Slice& value, Blob* blob); + // Value needs to have enough space to store all the loaded stuff. + // This function is thread safe! + Status Get(const Blob& blob, std::string* value) const; + // Frees the blob for reuse, but does not delete the data + // on the backing store. + Status Delete(const Blob& blob); + // Sync all opened files that are modified + Status Sync(); + + private: + const std::string directory_; + // block_size_ is uint64_t because when we multiply with + // blocks_size_ we want the result to be uint64_t or + // we risk overflowing + const uint64_t block_size_; + const uint32_t blocks_per_bucket_; + Env* env_; + EnvOptions storage_options_; + // protected by free_list_mutex_ + FreeList free_list_; + // free_list_mutex_ is locked BEFORE buckets_mutex_ + mutable port::Mutex free_list_mutex_; + // protected by buckets_mutex_ + // array of buckets + unique_ptr* buckets_; + // number of buckets in the array + uint32_t buckets_size_; + uint32_t max_buckets_; + mutable port::Mutex buckets_mutex_; + + // Calls FreeList allocate. 
If free list can't allocate + // new blob, creates new bucket and tries again + // Thread-safe + Status Allocate(uint32_t blocks, Blob* blob); + + // Creates a new backing store and adds all the blocks + // from the new backing store to the free list + Status CreateNewBucket(); +}; + +} // namespace rocksdb diff --git a/util/blob_store_test.cc b/util/blob_store_test.cc new file mode 100644 index 00000000..f199f5dd --- /dev/null +++ b/util/blob_store_test.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/blob_store.h" + +#include "util/testharness.h" +#include "util/testutil.h" +#include "util/random.h" + +#include +#include + +namespace rocksdb { + +using namespace std; + +class BlobStoreTest { }; + +TEST(BlobStoreTest, RangeParseTest) { + Blob e; + for (int i = 0; i < 5; ++i) { + e.chunks.push_back(BlobChunk(rand(), rand(), rand())); + } + string x = e.ToString(); + Blob nx(x); + + ASSERT_EQ(nx.ToString(), x); +} + +// make sure we're reusing the freed space +TEST(BlobStoreTest, SanityTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 20; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + 1000, + Env::Default()); + + string buf; + + // put string of size 170 + test::RandomString(&random, 170, &buf); + Blob r1; + ASSERT_OK(blob_store.Put(Slice(buf), &r1)); + // use the first file + for (size_t i = 0; i < r1.chunks.size(); ++i) { + ASSERT_EQ(r1.chunks[0].bucket_id, 0u); + } + + // put string of size 30 + test::RandomString(&random, 30, &buf); + Blob r2; + ASSERT_OK(blob_store.Put(Slice(buf), &r2)); + // use the first file + for (size_t i = 0; i < r2.chunks.size(); ++i) { + ASSERT_EQ(r2.chunks[0].bucket_id, 0u); + } + + // delete blob of size 170 + 
ASSERT_OK(blob_store.Delete(r1)); + + // put a string of size 100 + test::RandomString(&random, 100, &buf); + Blob r3; + ASSERT_OK(blob_store.Put(Slice(buf), &r3)); + // use the first file + for (size_t i = 0; i < r3.chunks.size(); ++i) { + ASSERT_EQ(r3.chunks[0].bucket_id, 0u); + } + + // put a string of size 70 + test::RandomString(&random, 70, &buf); + Blob r4; + ASSERT_OK(blob_store.Put(Slice(buf), &r4)); + // use the first file + for (size_t i = 0; i < r4.chunks.size(); ++i) { + ASSERT_EQ(r4.chunks[0].bucket_id, 0u); + } + + // put a string of size 5 + test::RandomString(&random, 5, &buf); + Blob r5; + ASSERT_OK(blob_store.Put(Slice(buf), &r5)); + // now you get to use the second file + for (size_t i = 0; i < r5.chunks.size(); ++i) { + ASSERT_EQ(r5.chunks[0].bucket_id, 1u); + } +} + +TEST(BlobStoreTest, FragmentedChunksTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 20; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + 1000, + Env::Default()); + + string buf; + + vector r(4); + + // put 4 strings of size 50 + for (int k = 0; k < 4; ++k) { + test::RandomString(&random, 50, &buf); + ASSERT_OK(blob_store.Put(Slice(buf), &r[k])); + // use the first file + for (size_t i = 0; i < r[k].chunks.size(); ++i) { + ASSERT_EQ(r[k].chunks[0].bucket_id, 0u); + } + } + + // delete the first and third + ASSERT_OK(blob_store.Delete(r[0])); + ASSERT_OK(blob_store.Delete(r[2])); + + // put string of size 100. 
it should reuse space that we deleting + // by deleting first and third strings of size 50 + test::RandomString(&random, 100, &buf); + Blob r2; + ASSERT_OK(blob_store.Put(Slice(buf), &r2)); + // use the first file + for (size_t i = 0; i < r2.chunks.size(); ++i) { + ASSERT_EQ(r2.chunks[0].bucket_id, 0u); + } +} + +TEST(BlobStoreTest, CreateAndStoreTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 1000; + const int max_blurb_size = 300; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + 10000, + Env::Default()); + vector> ranges; + + for (int i = 0; i < 2000; ++i) { + int decision = rand() % 5; + if (decision <= 2 || ranges.size() == 0) { + string buf; + int size_blocks = (rand() % max_blurb_size + 1); + int string_size = size_blocks * block_size - (rand() % block_size); + test::RandomString(&random, string_size, &buf); + Blob r; + ASSERT_OK(blob_store.Put(Slice(buf), &r)); + ranges.push_back(make_pair(r, buf)); + } else if (decision == 3) { + int ti = rand() % ranges.size(); + string out_buf; + ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf)); + ASSERT_EQ(ranges[ti].second, out_buf); + } else { + int ti = rand() % ranges.size(); + ASSERT_OK(blob_store.Delete(ranges[ti].first)); + ranges.erase(ranges.begin() + ti); + } + } + ASSERT_OK(blob_store.Sync()); +} + +TEST(BlobStoreTest, MaxSizeTest) { + const uint64_t block_size = 10; + const uint32_t blocks_per_file = 100; + const int max_buckets = 10; + Random random(5); + + BlobStore blob_store(test::TmpDir() + "/blob_store_test", + block_size, + blocks_per_file, + max_buckets, + Env::Default()); + string buf; + for (int i = 0; i < max_buckets; ++i) { + test::RandomString(&random, 1000, &buf); + Blob r; + ASSERT_OK(blob_store.Put(Slice(buf), &r)); + } + + test::RandomString(&random, 1000, &buf); + Blob r; + // should fail because max size + Status s = blob_store.Put(Slice(buf), &r); + ASSERT_EQ(s.ok(), false); +} + +} // 
namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/bloom.cc b/util/bloom.cc new file mode 100644 index 00000000..78ae04a2 --- /dev/null +++ b/util/bloom.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} + +class BloomFilterPolicy : public FilterPolicy { + private: + size_t bits_per_key_; + size_t k_; + uint32_t (*hash_func_)(const Slice& key); + + void initialize() { + // We intentionally round down to reduce probing cost a little bit + k_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (k_ < 1) k_ = 1; + if (k_ > 30) k_ = 30; + } + + public: + explicit BloomFilterPolicy(int bits_per_key, + uint32_t (*hash_func)(const Slice& key)) + : bits_per_key_(bits_per_key), hash_func_(hash_func) { + initialize(); + } + explicit BloomFilterPolicy(int bits_per_key) + : bits_per_key_(bits_per_key) { + hash_func_ = BloomHash; + initialize(); + } + + virtual const char* Name() const { + return "rocksdb.BuiltinBloomFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + // Compute bloom filter size (in both bits and bytes) + size_t bits = n * bits_per_key_; + + // For small n, we can see a very high false positive rate. 
Fix it + // by enforcing a minimum bloom filter length. + if (bits < 64) bits = 64; + + size_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast(k_)); // Remember # of probes in filter + char* array = &(*dst)[init_size]; + for (size_t i = 0; i < (size_t)n; i++) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + uint32_t h = hash_func_(keys[i]); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k_; j++) { + const uint32_t bitpos = h % bits; + array[bitpos/8] |= (1 << (bitpos % 8)); + h += delta; + } + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2) return false; + + const char* array = bloom_filter.data(); + const size_t bits = (len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const size_t k = array[len-1]; + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + + uint32_t h = hash_func_(key); + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (size_t j = 0; j < k; j++) { + const uint32_t bitpos = h % bits; + if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; + h += delta; + } + return true; + } +}; +} + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { + return new BloomFilterPolicy(bits_per_key); +} + +} // namespace rocksdb diff --git a/util/bloom_test.cc b/util/bloom_test.cc new file mode 100644 index 00000000..2c430e20 --- /dev/null +++ b/util/bloom_test.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include "rocksdb/filter_policy.h" + +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +DEFINE_int32(bits_per_key, 10, ""); + +namespace rocksdb { + +static const int kVerbose = 1; + +static Slice Key(int i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class BloomTest { + private: + const FilterPolicy* policy_; + std::string filter_; + std::vector keys_; + + public: + BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } + + ~BloomTest() { + delete policy_; + } + + void Reset() { + keys_.clear(); + filter_.clear(); + } + + void Add(const Slice& s) { + keys_.push_back(s.ToString()); + } + + void Build() { + std::vector key_slices; + for (size_t i = 0; i < keys_.size(); i++) { + key_slices.push_back(Slice(keys_[i])); + } + filter_.clear(); + policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_); + keys_.clear(); + if (kVerbose >= 2) DumpFilter(); + } + + size_t FilterSize() const { + return filter_.size(); + } + + void DumpFilter() { + fprintf(stderr, "F("); + for (size_t i = 0; i+1 < filter_.size(); i++) { + const unsigned int c = static_cast(filter_[i]); + for (int j = 0; j < 8; j++) { + fprintf(stderr, "%c", (c & (1 <KeyMayMatch(s, filter_); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST(BloomTest, EmptyFilter) { + 
ASSERT_TRUE(! Matches("hello")); + ASSERT_TRUE(! Matches("world")); +} + +TEST(BloomTest, Small) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(! Matches("x")); + ASSERT_TRUE(! Matches("foo")); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +TEST(BloomTest, VaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often + else good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + +// Different bits-per-byte + +} // namespace rocksdb + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + + return rocksdb::test::RunAllTests(); +} diff --git a/util/build_version.h b/util/build_version.h new file mode 100644 index 00000000..3d2ed291 --- /dev/null +++ b/util/build_version.h @@ -0,0 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +// these variables tell us about the git config and time +extern const char* rocksdb_build_git_sha; + +// these variables tell us when the compilation occurred +extern const char* rocksdb_build_compile_time; +extern const char* rocksdb_build_compile_date; + diff --git a/util/cache.cc b/util/cache.cc new file mode 100644 index 00000000..8f7deaaa --- /dev/null +++ b/util/cache.cc @@ -0,0 +1,457 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include + +#include "rocksdb/cache.h" +#include "port/port.h" +#include "util/autovector.h" +#include "util/hash.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +Cache::~Cache() { +} + +namespace { + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. Entries +// are kept in a circular doubly linked list ordered by access time. +struct LRUHandle { + void* value; + void (*deleter)(const Slice&, void* value); + LRUHandle* next_hash; + LRUHandle* next; + LRUHandle* prev; + size_t charge; // TODO(opt): Only allow uint32_t? 
+ size_t key_length; + uint32_t refs; + uint32_t hash; // Hash of key(); used for fast sharding and comparisons + char key_data[1]; // Beginning of key + + Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". + if (next == this) { + return *(reinterpret_cast(value)); + } else { + return Slice(key_data, key_length); + } + } +}; + +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. +class HandleTable { + public: + HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } + ~HandleTable() { delete[] list_; } + + LRUHandle* Lookup(const Slice& key, uint32_t hash) { + return *FindPointer(key, hash); + } + + LRUHandle* Insert(LRUHandle* h) { + LRUHandle** ptr = FindPointer(h->key(), h->hash); + LRUHandle* old = *ptr; + h->next_hash = (old == nullptr ? nullptr : old->next_hash); + *ptr = h; + if (old == nullptr) { + ++elems_; + if (elems_ > length_) { + // Since each cache entry is fairly large, we aim for a small + // average linked list length (<= 1). + Resize(); + } + } + return old; + } + + LRUHandle* Remove(const Slice& key, uint32_t hash) { + LRUHandle** ptr = FindPointer(key, hash); + LRUHandle* result = *ptr; + if (result != nullptr) { + *ptr = result->next_hash; + --elems_; + } + return result; + } + + private: + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + uint32_t length_; + uint32_t elems_; + LRUHandle** list_; + + // Return a pointer to slot that points to a cache entry that + // matches key/hash. If there is no such cache entry, return a + // pointer to the trailing slot in the corresponding linked list. 
+ LRUHandle** FindPointer(const Slice& key, uint32_t hash) { + LRUHandle** ptr = &list_[hash & (length_ - 1)]; + while (*ptr != nullptr && + ((*ptr)->hash != hash || key != (*ptr)->key())) { + ptr = &(*ptr)->next_hash; + } + return ptr; + } + + void Resize() { + uint32_t new_length = 16; + while (new_length < elems_ * 1.5) { + new_length *= 2; + } + LRUHandle** new_list = new LRUHandle*[new_length]; + memset(new_list, 0, sizeof(new_list[0]) * new_length); + uint32_t count = 0; + for (uint32_t i = 0; i < length_; i++) { + LRUHandle* h = list_[i]; + while (h != nullptr) { + LRUHandle* next = h->next_hash; + uint32_t hash = h->hash; + LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + h->next_hash = *ptr; + *ptr = h; + h = next; + count++; + } + } + assert(elems_ == count); + delete[] list_; + list_ = new_list; + length_ = new_length; + } +}; + +// A single shard of sharded cache. +class LRUCache { + public: + LRUCache(); + ~LRUCache(); + + // Separate from constructor so caller can easily make an array of LRUCache + void SetCapacity(size_t capacity) { capacity_ = capacity; } + void SetRemoveScanCountLimit(size_t remove_scan_count_limit) { + remove_scan_count_limit_ = remove_scan_count_limit; + } + + // Like Cache methods, but with an extra "hash" parameter. + Cache::Handle* Insert(const Slice& key, uint32_t hash, + void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)); + Cache::Handle* Lookup(const Slice& key, uint32_t hash); + void Release(Cache::Handle* handle); + void Erase(const Slice& key, uint32_t hash); + // Although in some platforms the update of size_t is atomic, to make sure + // GetUsage() works correctly under any platforms, we'll protect this + // function with mutex. + size_t GetUsage() const { + MutexLock l(&mutex_); + return usage_; + } + + private: + void LRU_Remove(LRUHandle* e); + void LRU_Append(LRUHandle* e); + // Just reduce the reference count by 1. 
+ // Return true if last reference + bool Unref(LRUHandle* e); + // Call deleter and free + void FreeEntry(LRUHandle* e); + + // Initialized before use. + size_t capacity_; + uint32_t remove_scan_count_limit_; + + // mutex_ protects the following state. + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable port::Mutex mutex_; + size_t usage_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. + LRUHandle lru_; + + HandleTable table_; +}; + +LRUCache::LRUCache() + : usage_(0) { + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; +} + +LRUCache::~LRUCache() { + for (LRUHandle* e = lru_.next; e != &lru_; ) { + LRUHandle* next = e->next; + assert(e->refs == 1); // Error if caller has an unreleased handle + if (Unref(e)) { + FreeEntry(e); + } + e = next; + } +} + +bool LRUCache::Unref(LRUHandle* e) { + assert(e->refs > 0); + e->refs--; + return e->refs == 0; +} + +void LRUCache::FreeEntry(LRUHandle* e) { + assert(e->refs == 0); + (*e->deleter)(e->key(), e->value); + free(e); +} + +void LRUCache::LRU_Remove(LRUHandle* e) { + e->next->prev = e->prev; + e->prev->next = e->next; + usage_ -= e->charge; +} + +void LRUCache::LRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; + usage_ += e->charge; +} + +Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { + MutexLock l(&mutex_); + LRUHandle* e = table_.Lookup(key, hash); + if (e != nullptr) { + e->refs++; + LRU_Remove(e); + LRU_Append(e); + } + return reinterpret_cast(e); +} + +void LRUCache::Release(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + bool last_reference = false; + { + MutexLock l(&mutex_); + last_reference = Unref(e); + } + if (last_reference) { + FreeEntry(e); + } +} + +Cache::Handle* LRUCache::Insert( + const 
Slice& key, uint32_t hash, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + + LRUHandle* e = reinterpret_cast( + malloc(sizeof(LRUHandle)-1 + key.size())); + autovector last_reference_list; + + e->value = value; + e->deleter = deleter; + e->charge = charge; + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache, one for the returned handle + memcpy(e->key_data, key.data(), key.size()); + + { + MutexLock l(&mutex_); + + LRU_Append(e); + + LRUHandle* old = table_.Insert(e); + if (old != nullptr) { + LRU_Remove(old); + if (Unref(old)) { + last_reference_list.push_back(old); + } + } + + if (remove_scan_count_limit_ > 0) { + // Try to free the space by evicting the entries that are only + // referenced by the cache first. + LRUHandle* cur = lru_.next; + for (unsigned int scanCount = 0; + usage_ > capacity_ && cur != &lru_ + && scanCount < remove_scan_count_limit_; scanCount++) { + LRUHandle* next = cur->next; + if (cur->refs <= 1) { + LRU_Remove(cur); + table_.Remove(cur->key(), cur->hash); + if (Unref(cur)) { + last_reference_list.push_back(cur); + } + } + cur = next; + } + } + + // Free the space following strict LRU policy until enough space + // is freed. 
+ while (usage_ > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + LRU_Remove(old); + table_.Remove(old->key(), old->hash); + if (Unref(old)) { + last_reference_list.push_back(old); + } + } + } + + // we free the entries here outside of mutex for + // performance reasons + for (auto entry : last_reference_list) { + FreeEntry(entry); + } + + return reinterpret_cast(e); +} + +void LRUCache::Erase(const Slice& key, uint32_t hash) { + LRUHandle* e; + bool last_reference = false; + { + MutexLock l(&mutex_); + e = table_.Remove(key, hash); + if (e != nullptr) { + LRU_Remove(e); + last_reference = Unref(e); + } + } + // mutex not held here + // last_reference will only be true if e != nullptr + if (last_reference) { + FreeEntry(e); + } +} + +static int kNumShardBits = 4; // default values, can be overridden +static int kRemoveScanCountLimit = 0; // default values, can be overridden + +class ShardedLRUCache : public Cache { + private: + LRUCache* shards_; + port::Mutex id_mutex_; + uint64_t last_id_; + int num_shard_bits_; + size_t capacity_; + + static inline uint32_t HashSlice(const Slice& s) { + return Hash(s.data(), s.size(), 0); + } + + uint32_t Shard(uint32_t hash) { + // Note, hash >> 32 yields hash in gcc, not the zero we expect! + return (num_shard_bits_ > 0) ? 
(hash >> (32 - num_shard_bits_)) : 0; + } + + void init(size_t capacity, int numbits, int removeScanCountLimit) { + num_shard_bits_ = numbits; + capacity_ = capacity; + int num_shards = 1 << num_shard_bits_; + shards_ = new LRUCache[num_shards]; + const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; + for (int s = 0; s < num_shards; s++) { + shards_[s].SetCapacity(per_shard); + shards_[s].SetRemoveScanCountLimit(removeScanCountLimit); + } + } + + public: + explicit ShardedLRUCache(size_t capacity) + : last_id_(0) { + init(capacity, kNumShardBits, kRemoveScanCountLimit); + } + ShardedLRUCache(size_t capacity, int num_shard_bits, + int removeScanCountLimit) + : last_id_(0) { + init(capacity, num_shard_bits, removeScanCountLimit); + } + virtual ~ShardedLRUCache() { + delete[] shards_; + } + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + const uint32_t hash = HashSlice(key); + return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter); + } + virtual Handle* Lookup(const Slice& key) { + const uint32_t hash = HashSlice(key); + return shards_[Shard(hash)].Lookup(key, hash); + } + virtual void Release(Handle* handle) { + LRUHandle* h = reinterpret_cast(handle); + shards_[Shard(h->hash)].Release(handle); + } + virtual void Erase(const Slice& key) { + const uint32_t hash = HashSlice(key); + shards_[Shard(hash)].Erase(key, hash); + } + virtual void* Value(Handle* handle) { + return reinterpret_cast(handle)->value; + } + virtual uint64_t NewId() { + MutexLock l(&id_mutex_); + return ++(last_id_); + } + virtual size_t GetCapacity() const { + return capacity_; + } + + virtual size_t GetUsage() const { + // We will not lock the cache when getting the usage from shards. 
+ // for (size_t i = 0; i < num_shard_bits_; ++i) + int num_shards = 1 << num_shard_bits_; + size_t usage = 0; + for (int s = 0; s < num_shards; s++) { + usage += shards_[s].GetUsage(); + } + return usage; + } + + virtual void DisownData() { + shards_ = nullptr; + } +}; + +} // end anonymous namespace + +shared_ptr NewLRUCache(size_t capacity) { + return NewLRUCache(capacity, kNumShardBits); +} + +shared_ptr NewLRUCache(size_t capacity, int num_shard_bits) { + return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit); +} + +shared_ptr NewLRUCache(size_t capacity, int num_shard_bits, + int removeScanCountLimit) { + if (num_shard_bits >= 20) { + return nullptr; // the cache cannot be sharded into too many fine pieces + } + return std::make_shared(capacity, + num_shard_bits, + removeScanCountLimit); +} + +} // namespace rocksdb diff --git a/util/cache_test.cc b/util/cache_test.cc new file mode 100644 index 00000000..b99f47b3 --- /dev/null +++ b/util/cache_test.cc @@ -0,0 +1,423 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/cache.h" + +#include +#include +#include +#include "util/coding.h" +#include "util/testharness.h" + +namespace rocksdb { + +// Conversions between numeric keys/values and the types expected by Cache. 
+static std::string EncodeKey(int k) { + std::string result; + PutFixed32(&result, k); + return result; +} +static int DecodeKey(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} +static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { return reinterpret_cast(v); } + +class CacheTest { + public: + static CacheTest* current_; + + static void Deleter(const Slice& key, void* v) { + current_->deleted_keys_.push_back(DecodeKey(key)); + current_->deleted_values_.push_back(DecodeValue(v)); + } + + static const int kCacheSize = 1000; + static const int kNumShardBits = 4; + static const int kRemoveScanCountLimit = 16; + + static const int kCacheSize2 = 100; + static const int kNumShardBits2 = 2; + static const int kRemoveScanCountLimit2 = 200; + + std::vector deleted_keys_; + std::vector deleted_values_; + shared_ptr cache_; + shared_ptr cache2_; + + CacheTest() : + cache_(NewLRUCache(kCacheSize, kNumShardBits, kRemoveScanCountLimit)), + cache2_(NewLRUCache(kCacheSize2, kNumShardBits2, + kRemoveScanCountLimit2)) { + current_ = this; + } + + ~CacheTest() { + } + + int Lookup(shared_ptr cache, int key) { + Cache::Handle* handle = cache->Lookup(EncodeKey(key)); + const int r = (handle == nullptr) ? 
-1 : DecodeValue(cache->Value(handle)); + if (handle != nullptr) { + cache->Release(handle); + } + return r; + } + + void Insert(shared_ptr cache, int key, int value, int charge = 1) { + cache->Release(cache->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); + } + + void Erase(shared_ptr cache, int key) { + cache->Erase(EncodeKey(key)); + } + + + int Lookup(int key) { + return Lookup(cache_, key); + } + + void Insert(int key, int value, int charge = 1) { + Insert(cache_, key, value, charge); + } + + void Erase(int key) { + Erase(cache_, key); + } + + int Lookup2(int key) { + return Lookup(cache2_, key); + } + + void Insert2(int key, int value, int charge = 1) { + Insert(cache2_, key, value, charge); + } + + void Erase2(int key) { + Erase(cache2_, key); + } +}; +CacheTest* CacheTest::current_; + +void dumbDeleter(const Slice& key, void* value) { } + +TEST(CacheTest, UsageTest) { + // cache is shared_ptr and will be automatically cleaned up. + const uint64_t kCapacity = 100000; + auto cache = NewLRUCache(kCapacity, 8, 200); + + size_t usage = 0; + const char* value = "abcdef"; + // make sure everything will be cached + for (int i = 1; i < 100; ++i) { + std::string key(i, 'a'); + auto kv_size = key.size() + 5; + cache->Release( + cache->Insert(key, (void*)value, kv_size, dumbDeleter) + ); + usage += kv_size; + ASSERT_EQ(usage, cache->GetUsage()); + } + + // make sure the cache will be overloaded + for (uint64_t i = 1; i < kCapacity; ++i) { + auto key = std::to_string(i); + cache->Release( + cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) + ); + } + + // the usage should be close to the capacity + ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); +} + +TEST(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + 
ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0U, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1U, deleted_keys_.size()); +} + +TEST(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0U, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1U, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2U, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + + // Frequently used entry must be kept around + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000+i, 2000+i); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); +} + +TEST(CacheTest, EvictionPolicyRef) { + Insert(100, 101); + Insert(101, 102); + Insert(102, 103); + Insert(103, 104); + Insert(200, 101); + Insert(201, 102); + Insert(202, 103); + Insert(203, 104); + Cache::Handle* h201 = 
cache_->Lookup(EncodeKey(200)); + Cache::Handle* h202 = cache_->Lookup(EncodeKey(201)); + Cache::Handle* h203 = cache_->Lookup(EncodeKey(202)); + Cache::Handle* h204 = cache_->Lookup(EncodeKey(203)); + Insert(300, 101); + Insert(301, 102); + Insert(302, 103); + Insert(303, 104); + + // Insert entries much more than Cache capacity + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000 + i, 2000 + i); + } + + // Check whether the entries inserted in the beginning + // are evicted. Ones without extra ref are evicted and + // those with are not. + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(-1, Lookup(101)); + ASSERT_EQ(-1, Lookup(102)); + ASSERT_EQ(-1, Lookup(103)); + + ASSERT_EQ(-1, Lookup(300)); + ASSERT_EQ(-1, Lookup(301)); + ASSERT_EQ(-1, Lookup(302)); + ASSERT_EQ(-1, Lookup(303)); + + ASSERT_EQ(101, Lookup(200)); + ASSERT_EQ(102, Lookup(201)); + ASSERT_EQ(103, Lookup(202)); + ASSERT_EQ(104, Lookup(203)); + + // Cleaning up all the handles + cache_->Release(h201); + cache_->Release(h202); + cache_->Release(h203); + cache_->Release(h204); +} + +TEST(CacheTest, EvictionPolicyRef2) { + std::vector handles; + + Insert(100, 101); + // Insert entries much more than Cache capacity + for (int i = 0; i < kCacheSize + 100; i++) { + Insert(1000 + i, 2000 + i); + if (i < kCacheSize ) { + handles.push_back(cache_->Lookup(EncodeKey(1000 + i))); + } + } + + // Make sure referenced keys are also possible to be deleted + // if there are not sufficient non-referenced keys + for (int i = 0; i < 5; i++) { + ASSERT_EQ(-1, Lookup(1000 + i)); + } + + for (int i = kCacheSize; i < kCacheSize + 100; i++) { + ASSERT_EQ(2000 + i, Lookup(1000 + i)); + } + ASSERT_EQ(-1, Lookup(100)); + + // Cleaning up all the handles + while (handles.size() > 0) { + cache_->Release(handles.back()); + handles.pop_back(); + } +} + +TEST(CacheTest, EvictionPolicyRefLargeScanLimit) { + std::vector handles2; + + // Cache2 has a cache RemoveScanCountLimit higher than cache size + // so it would trigger a boundary 
condition. + + // Populate the cache with 10 more keys than its size. + // Reference all keys except one close to the end. + for (int i = 0; i < kCacheSize2 + 10; i++) { + Insert2(1000 + i, 2000+i); + if (i != kCacheSize2 ) { + handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i))); + } + } + + // Make sure referenced keys are also possible to be deleted + // if there are not sufficient non-referenced keys + for (int i = 0; i < 3; i++) { + ASSERT_EQ(-1, Lookup2(1000 + i)); + } + // The non-referenced value is deleted even if it's accessed + // recently. + ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2)); + // Other values recently accessed are not deleted since they + // are referenced. + for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) { + if (i != kCacheSize2) { + ASSERT_EQ(2000 + i, Lookup2(1000 + i)); + } + } + + // Cleaning up all the handles + while (handles2.size() > 0) { + cache2_->Release(handles2.back()); + handles2.pop_back(); + } +} + + + +TEST(CacheTest, HeavyEntries) { + // Add a bunch of light and heavy entries and then count the combined + // size of items still in the cache, which must be approximately the + // same as the total capacity. + const int kLight = 1; + const int kHeavy = 10; + int added = 0; + int index = 0; + while (added < 2*kCacheSize) { + const int weight = (index & 1) ? kLight : kHeavy; + Insert(index, 1000+index, weight); + added += weight; + index++; + } + + int cached_weight = 0; + for (int i = 0; i < index; i++) { + const int weight = (i & 1 ? 
kLight : kHeavy); + int r = Lookup(i); + if (r >= 0) { + cached_weight += weight; + ASSERT_EQ(1000+i, r); + } + } + ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10); +} + +TEST(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + + +class Value { + private: + int v_; + public: + explicit Value(int v) : v_(v) { } + + ~Value() { std::cout << v_ << " is destructed\n"; } +}; + +void deleter(const Slice& key, void* value) { + delete (Value *)value; +} + +TEST(CacheTest, BadEviction) { + int n = 10; + + // a LRUCache with n entries and one shard only + std::shared_ptr cache = NewLRUCache(n, 0); + + std::vector handles(n+1); + + // Insert n+1 entries, but not releasing. + for (int i = 0; i < n+1; i++) { + std::string key = std::to_string(i+1); + handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); + } + + // Guess what's in the cache now? + for (int i = 0; i < n+1; i++) { + std::string key = std::to_string(i+1); + auto h = cache->Lookup(key); + std::cout << key << (h?" found\n":" not found\n"); + // Only the first entry should be missing + ASSERT_TRUE(h || i == 0); + if (h) cache->Release(h); + } + + for (int i = 0; i < n+1; i++) { + cache->Release(handles[i]); + } + std::cout << "Poor entries\n"; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/coding.cc b/util/coding.cc new file mode 100644 index 00000000..31ae0e35 --- /dev/null +++ b/util/coding.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +#include +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast(ptr); +} + +const char* GetVarint32PtrFallback(const char* p, const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return nullptr; +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return nullptr; +} + +void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, + uint32_t bits, uint64_t value) { + assert((offset + bits + 7)/8 <= dstlen); + assert(bits <= 64); + + unsigned char* ptr = reinterpret_cast(dst); + + size_t byteOffset = offset / 8; + size_t bitOffset = offset % 8; + + // 
This prevents unused variable warnings when compiling. +#ifndef NDEBUG + // Store truncated value. + uint64_t origValue = (bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value; + uint32_t origBits = bits; +#endif + + while (bits > 0) { + size_t bitsToGet = std::min(bits, 8 - bitOffset); + unsigned char mask = ((1 << bitsToGet) - 1); + + ptr[byteOffset] = (ptr[byteOffset] & ~(mask << bitOffset)) + + ((value & mask) << bitOffset); + + value >>= bitsToGet; + byteOffset += 1; + bitOffset = 0; + bits -= bitsToGet; + } + + assert(origValue == BitStreamGetInt(dst, dstlen, offset, origBits)); +} + +uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, + uint32_t bits) { + assert((offset + bits + 7)/8 <= srclen); + assert(bits <= 64); + + const unsigned char* ptr = reinterpret_cast(src); + + uint64_t result = 0; + + size_t byteOffset = offset / 8; + size_t bitOffset = offset % 8; + size_t shift = 0; + + while (bits > 0) { + size_t bitsToGet = std::min(bits, 8 - bitOffset); + unsigned char mask = ((1 << bitsToGet) - 1); + + result += (uint64_t)((ptr[byteOffset] >> bitOffset) & mask) << shift; + + shift += bitsToGet; + byteOffset += 1; + bitOffset = 0; + bits -= bitsToGet; + } + + return result; +} + +void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, + uint64_t value) { + assert((offset + bits + 7)/8 <= dst->size()); + + const size_t kTmpBufLen = sizeof(value) + 1; + char tmpBuf[kTmpBufLen]; + + // Number of bytes of tmpBuf being used + const size_t kUsedBytes = (offset%8 + bits)/8; + + // Copy relevant parts of dst to tmpBuf + for (size_t idx = 0; idx <= kUsedBytes; ++idx) { + tmpBuf[idx] = (*dst)[offset/8 + idx]; + } + + BitStreamPutInt(tmpBuf, kTmpBufLen, offset%8, bits, value); + + // Copy tmpBuf back to dst + for (size_t idx = 0; idx <= kUsedBytes; ++idx) { + (*dst)[offset/8 + idx] = tmpBuf[idx]; + } + + // Do the check here too as we are working with a buffer. 
+ assert(((bits < 64)?(value & (((uint64_t)1 << bits) - 1)):value) == + BitStreamGetInt(dst, offset, bits)); +} + +} // namespace rocksdb diff --git a/util/coding.h b/util/coding.h new file mode 100644 index 00000000..8ffba51c --- /dev/null +++ b/util/coding.h @@ -0,0 +1,294 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#pragma once +#include +#include +#include +#include + +#include "rocksdb/write_batch.h" +#include "port/port.h" + +namespace rocksdb { + +// The maximum length of a varint in bytes for 32 and 64 bits respectively. +const unsigned int kMaxVarint32Length = 5; +const unsigned int kMaxVarint64Length = 10; + +// Standard Put... routines append to a string +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); +extern void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. 
+extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +// This function assumes data is well-formed. +extern Slice GetLengthPrefixedSlice(const char* data); + +extern Slice GetSliceUntil(Slice* slice, char delimiter); + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// nullptr on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); +extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed32(char* dst, uint32_t value); +extern void EncodeFixed64(char* dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. 
+ +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast(static_cast(ptr[0]))) + | (static_cast(static_cast(ptr[1])) << 8) + | (static_cast(static_cast(ptr[2])) << 16) + | (static_cast(static_cast(ptr[3])) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, + const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, + const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +// Writes an unsigned integer with bits number of bits with its least +// significant bit at offset. +// Bits are numbered from 0 to 7 in the first byte, 8 to 15 in the second and +// so on. +// value is truncated to the bits number of least significant bits. +// REQUIRES: (offset+bits+7)/8 <= dstlen +// REQUIRES: bits <= 64 +extern void BitStreamPutInt(char* dst, size_t dstlen, size_t offset, + uint32_t bits, uint64_t value); + +// Reads an unsigned integer with bits number of bits with its least +// significant bit at offset. +// Bits are numbered in the same way as ByteStreamPutInt(). 
+// REQUIRES: (offset+bits+7)/8 <= srclen +// REQUIRES: bits <= 64 +extern uint64_t BitStreamGetInt(const char* src, size_t srclen, size_t offset, + uint32_t bits); + +// Convenience functions +extern void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits, + uint64_t value); +extern uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits); +extern uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits); + +// -- Implementation of the functions declared above +inline void EncodeFixed32(char* buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +inline void EncodeFixed64(char* buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +inline void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[10]; + 
char* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + uint32_t total_bytes = 0; + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, total_bytes); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), 
len); + slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0)); + return ret; +} + +inline uint64_t BitStreamGetInt(const std::string* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +inline uint64_t BitStreamGetInt(const Slice* src, size_t offset, + uint32_t bits) { + return BitStreamGetInt(src->data(), src->size(), offset, bits); +} + +} // namespace rocksdb diff --git a/util/coding_test.cc b/util/coding_test.cc new file mode 100644 index 00000000..ed542d6b --- /dev/null +++ b/util/coding_test.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/coding.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class Coding { }; + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + uint64_t actual = 0; + actual = DecodeFixed64(p); + ASSERT_EQ(v-1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v+1, actual); + p += sizeof(uint64_t); + } +} + +// Test that encoding routines generate little-endian encodings +TEST(Coding, EncodingOutput) { + std::string dst; + PutFixed32(&dst, 0x04030201); + ASSERT_EQ(4U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + + dst.clear(); + PutFixed64(&dst, 0x0807060504030201ull); + ASSERT_EQ(8U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + ASSERT_EQ(0x05, static_cast(dst[4])); + ASSERT_EQ(0x06, static_cast(dst[5])); + ASSERT_EQ(0x07, static_cast(dst[6])); + ASSERT_EQ(0x08, static_cast(dst[7])); +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + 
uint32_t expected = (i / 32) << (i % 32); + uint32_t actual = 0; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast(0)); + values.push_back(~static_cast(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power-1); + values.push_back(power+1); + }; + + std::string s; + for (unsigned int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (unsigned int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual = 0; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, limit); + +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; + std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + 
input.size(), &result) + == nullptr); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (unsigned int len = 0; len < s.size() - 1; len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE( + GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +TEST(Coding, BitStream) { + const int kNumBytes = 10; + char bytes[kNumBytes+1]; + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + + // Simple byte aligned test. 
+ for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*8, 8, 255-i); + + ASSERT_EQ((unsigned char)bytes[i], (unsigned char)(255-i)); + } + for (int i = 0; i < kNumBytes; ++i) { + ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*8, 8), (uint32_t)(255-i)); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + // Write and read back at strange offsets + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*5+1, 4, (i * 7) % (1 << 4)); + } + for (int i = 0; i < kNumBytes; ++i) { + ASSERT_EQ(BitStreamGetInt(bytes, kNumBytes, i*5+1, 4), + (uint32_t)((i * 7) % (1 << 4))); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + // Create 11011011 as a bit pattern + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + for (int i = 0; i < kNumBytes; ++i) { + BitStreamPutInt(bytes, kNumBytes, i*8, 2, 3); + BitStreamPutInt(bytes, kNumBytes, i*8+3, 2, 3); + BitStreamPutInt(bytes, kNumBytes, i*8+6, 2, 3); + + ASSERT_EQ((unsigned char)bytes[i], + (unsigned char)(3 + (3 << 3) + (3 << 6))); + } + ASSERT_EQ(bytes[kNumBytes], '\0'); + + + // Test large values + for (int i = 0; i < kNumBytes + 1; ++i) { + bytes[i] = '\0'; + } + BitStreamPutInt(bytes, kNumBytes, 0, 64, (uint64_t)(-1)); + for (int i = 0; i < 64/8; ++i) { + ASSERT_EQ((unsigned char)bytes[i], + (unsigned char)(255)); + } + ASSERT_EQ(bytes[64/8], '\0'); + + +} + +TEST(Coding, BitStreamConvenienceFuncs) { + std::string bytes(1, '\0'); + + // Check that independent changes to byte are preserved. 
+ BitStreamPutInt(&bytes, 0, 2, 3); + BitStreamPutInt(&bytes, 3, 2, 3); + BitStreamPutInt(&bytes, 6, 2, 3); + ASSERT_EQ((unsigned char)bytes[0], (unsigned char)(3 + (3 << 3) + (3 << 6))); + ASSERT_EQ(BitStreamGetInt(&bytes, 0, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&bytes, 3, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&bytes, 6, 2), 3u); + Slice slice(bytes); + ASSERT_EQ(BitStreamGetInt(&slice, 0, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&slice, 3, 2), 3u); + ASSERT_EQ(BitStreamGetInt(&slice, 6, 2), 3u); + + // Test overlapping crossing over byte boundaries + bytes = std::string(2, '\0'); + BitStreamPutInt(&bytes, 6, 4, 15); + ASSERT_EQ((unsigned char)bytes[0], 3 << 6); + ASSERT_EQ((unsigned char)bytes[1], 3); + ASSERT_EQ(BitStreamGetInt(&bytes, 6, 4), 15u); + slice = Slice(bytes); + ASSERT_EQ(BitStreamGetInt(&slice, 6, 4), 15u); + + // Test 64-bit number + bytes = std::string(64/8, '\0'); + BitStreamPutInt(&bytes, 0, 64, (uint64_t)(-1)); + ASSERT_EQ(BitStreamGetInt(&bytes, 0, 64), (uint64_t)(-1)); + slice = Slice(bytes); + ASSERT_EQ(BitStreamGetInt(&slice, 0, 64), (uint64_t)(-1)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/comparator.cc b/util/comparator.cc new file mode 100644 index 00000000..adeacac0 --- /dev/null +++ b/util/comparator.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "port/port.h" +#include "util/logging.h" + +namespace rocksdb { + +Comparator::~Comparator() { } + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.BytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return a.compare(b); + } + + virtual void FindShortestSeparator( + std::string* start, + const Slice& limit) const { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t diff_byte = static_cast((*start)[diff_index]); + if (diff_byte < static_cast(0xff) && + diff_byte + 1 < static_cast(limit[diff_index])) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + assert(Compare(*start, limit) < 0); + } + } + } + + virtual void FindShortSuccessor(std::string* key) const { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast(0xff)) { + (*key)[i] = byte + 1; + key->resize(i+1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. + } +}; +} // namespace + +static port::OnceType once = LEVELDB_ONCE_INIT; +static const Comparator* bytewise; + +static void InitModule() { + bytewise = new BytewiseComparatorImpl; +} + +const Comparator* BytewiseComparator() { + port::InitOnce(&once, InitModule); + return bytewise; +} + +} // namespace rocksdb diff --git a/util/crc32c.cc b/util/crc32c.cc new file mode 100644 index 00000000..d27fb4be --- /dev/null +++ b/util/crc32c.cc @@ -0,0 +1,393 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. + +#include "util/crc32c.h" + +#include +#ifdef __SSE4_2__ +#include +#endif +#include "util/coding.h" + +namespace rocksdb { +namespace crc32c { + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 
0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 +}; +static const uint32_t 
table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, + 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, + 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, + 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, + 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, + 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, + 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, + 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, + 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, + 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, + 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, + 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, + 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, + 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, + 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, + 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, + 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, + 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, + 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, + 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, + 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, + 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, + 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, + 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, + 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, + 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, + 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 
0x30dd99f2, 0x237f0185, + 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, + 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, + 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, + 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, + 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, + 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, + 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, + 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, + 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, + 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, + 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, + 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, + 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, + 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, + 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, + 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 +}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, + 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, + 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, + 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, + 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, + 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, + 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, + 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, + 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, + 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, + 0xa9b7b85b, 0x0cf62a25, 
0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, + 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, + 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, + 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, + 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, + 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, + 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, + 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, + 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, + 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, + 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, + 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, + 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, + 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, + 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, + 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, + 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, + 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, + 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, + 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, + 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, + 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, + 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, + 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, + 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, + 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, + 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 
0x4bdee6d8, 0xee9f74a6, + 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, + 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, + 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, + 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, + 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, + 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 +}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, + 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, + 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, + 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, + 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, + 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, + 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, + 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, + 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, + 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, + 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, + 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, + 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, + 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, + 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, + 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, + 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, + 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, + 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, + 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, + 0xeec5cba2, 0x3380611a, 
0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, + 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, + 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, + 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, + 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, + 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, + 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, + 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, + 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, + 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, + 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, + 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, + 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, + 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, + 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, + 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, + 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, + 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, + 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, + 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, + 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, + 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, + 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 +}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t *p) { + return DecodeFixed32(reinterpret_cast(p)); +} + +#ifdef __SSE4_2__ +static inline uint64_t LE_LOAD64(const uint8_t *p) { + return 
DecodeFixed64(reinterpret_cast(p)); +} +#endif + +static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { + uint32_t c = *l ^ LE_LOAD32(*p); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; + // DO it twice. + c = *l ^ LE_LOAD32(*p); + *p += 4; + *l = table3_[c & 0xff] ^ + table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ + table0_[c >> 24]; +} + +static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { +#ifdef __SSE4_2__ + *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); + *p += 8; +#else + Slow_CRC32(l, p); +#endif +} + +template +uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { + const uint8_t *p = reinterpret_cast(buf); + const uint8_t *e = p + size; + uint64_t l = crc ^ 0xffffffffu; + +// Align n to (1 << m) byte boundary +#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1)) + +#define STEP1 do { \ + int c = (l & 0xff) ^ *p++; \ + l = table0_[c] ^ (l >> 8); \ +} while (0) + + + // Point x at first 16-byte aligned byte in string. This might be + // just past the end of the string. + const uintptr_t pval = reinterpret_cast(p); + const uint8_t* x = reinterpret_cast(ALIGN(pval, 4)); + if (x <= e) { + // Process bytes until finished or p is 16-byte aligned + while (p != x) { + STEP1; + } + } + // Process bytes 16 at a time + while ((e-p) >= 16) { + CRC32(&l, &p); + CRC32(&l, &p); + } + // Process bytes 8 at a time + while ((e-p) >= 8) { + CRC32(&l, &p); + } + // Process the last few bytes + while (p != e) { + STEP1; + } +#undef STEP1 +#undef ALIGN + return l ^ 0xffffffffu; +} + +// Detect if SS42 or not. +static bool isSSE42() { +#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) + uint32_t c_; + uint32_t d_; + __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx"); + return c_ & (1U << 20); // copied from CpuId.h in Folly. 
+#else + return false; +#endif +} + +typedef uint32_t (*Function)(uint32_t, const char*, size_t); + +static inline Function Choose_Extend() { + return isSSE42() ? ExtendImpl : ExtendImpl; +} + +Function ChosenExtend = Choose_Extend(); + +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + return ChosenExtend(crc, buf, size); +} + +} // namespace crc32c +} // namespace rocksdb diff --git a/util/crc32c.h b/util/crc32c.h new file mode 100644 index 00000000..e5e6e143 --- /dev/null +++ b/util/crc32c.h @@ -0,0 +1,46 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +namespace rocksdb { +namespace crc32c { + +// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the +// crc32c of some string A. Extend() is often used to maintain the +// crc32c of a stream of data. +extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); + +// Return the crc32c of data[0,n-1] +inline uint32_t Value(const char* data, size_t n) { + return Extend(0, data, n); +} + +static const uint32_t kMaskDelta = 0xa282ead8ul; + +// Return a masked representation of crc. +// +// Motivation: it is problematic to compute the CRC of a string that +// contains embedded CRCs. Therefore we recommend that CRCs stored +// somewhere (e.g., in files) should be masked before being stored. +inline uint32_t Mask(uint32_t crc) { + // Rotate right by 15 bits and add a constant. 
+ return ((crc >> 15) | (crc << 17)) + kMaskDelta; +} + +// Return the crc whose masked representation is masked_crc. +inline uint32_t Unmask(uint32_t masked_crc) { + uint32_t rot = masked_crc - kMaskDelta; + return ((rot >> 17) | (rot << 15)); +} + +} // namespace crc32c +} // namespace rocksdb diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc new file mode 100644 index 00000000..300c9d3c --- /dev/null +++ b/util/crc32c_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/crc32c.h" +#include "util/testharness.h" + +namespace rocksdb { +namespace crc32c { + +class CRC { }; + +TEST(CRC, StandardResults) { + // From rfc3720 section B.4. 
+ char buf[32]; + + memset(buf, 0, sizeof(buf)); + ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf))); + + memset(buf, 0xff, sizeof(buf)); + ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = i; + } + ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf))); + + for (int i = 0; i < 32; i++) { + buf[i] = 31 - i; + } + ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf))); + + unsigned char data[48] = { + 0x01, 0xc0, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x18, + 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + }; + ASSERT_EQ(0xd9963a56, Value(reinterpret_cast(data), sizeof(data))); +} + +TEST(CRC, Values) { + ASSERT_NE(Value("a", 1), Value("foo", 3)); +} + +TEST(CRC, Extend) { + ASSERT_EQ(Value("hello world", 11), + Extend(Value("hello ", 6), "world", 5)); +} + +TEST(CRC, Mask) { + uint32_t crc = Value("foo", 3); + ASSERT_NE(crc, Mask(crc)); + ASSERT_NE(crc, Mask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Mask(crc))); + ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc))))); +} + +} // namespace crc32c +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc new file mode 100644 index 00000000..a4c8e11c --- /dev/null +++ b/util/dynamic_bloom.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "dynamic_bloom.h" + +#include + +#include "port/port.h" +#include "rocksdb/slice.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +static uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} +} + +DynamicBloom::DynamicBloom(uint32_t total_bits, + uint32_t cl_per_block, + uint32_t num_probes, + uint32_t (*hash_func)(const Slice& key)) + : kBlocked(cl_per_block > 0), + kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8), + kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock + * kBitsPerBlock : + total_bits + 7) / 8 * 8), + kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1), + kNumProbes(num_probes), + hash_func_(hash_func == nullptr ? &BloomHash : hash_func) { + assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kBlocked) { + sz += CACHE_LINE_SIZE - 1; + } + raw_ = new unsigned char[sz](); + if (kBlocked && (reinterpret_cast(raw_) % CACHE_LINE_SIZE)) { + data_ = raw_ + CACHE_LINE_SIZE - + reinterpret_cast(raw_) % CACHE_LINE_SIZE; + } else { + data_ = raw_; + } +} + +} // rocksdb diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h new file mode 100644 index 00000000..efc461cf --- /dev/null +++ b/util/dynamic_bloom.h @@ -0,0 +1,101 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include + +namespace rocksdb { + +class Slice; + +class DynamicBloom { + public: + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // cl_per_block: block size in cache lines. When this is non-zero, a + // query/set is done within a block to improve cache locality. 
+ // hash_func: customized hash function + explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0, + uint32_t num_probes = 6, + uint32_t (*hash_func)(const Slice& key) = nullptr); + + ~DynamicBloom() { + delete[] raw_; + } + + // Assuming single threaded access to this function. + void Add(const Slice& key); + + // Assuming single threaded access to this function. + void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContain(const Slice& key); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash); + + private: + const bool kBlocked; + const uint32_t kBitsPerBlock; + const uint32_t kTotalBits; + const uint32_t kNumBlocks; + const uint32_t kNumProbes; + + uint32_t (*hash_func_)(const Slice& key); + unsigned char* data_; + unsigned char* raw_; +}; + +inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); } + +inline bool DynamicBloom::MayContain(const Slice& key) { + return (MayContainHash(hash_func_(key))); +} + +inline bool DynamicBloom::MayContainHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (kBlocked) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock; + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = b + h % kBitsPerBlock; + if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + } else { + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = h % kTotalBits; + if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + } + return true; +} + +inline void DynamicBloom::AddHash(uint32_t h) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (kBlocked) { + uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock; + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = b + h % kBitsPerBlock; + data_[bitpos / 8] |= (1 << (bitpos % 
8)); + h += delta; + } + } else { + for (uint32_t i = 0; i < kNumProbes; ++i) { + const uint32_t bitpos = h % kTotalBits; + data_[bitpos / 8] |= (1 << (bitpos % 8)); + h += delta; + } + } +} + +} // rocksdb diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc new file mode 100644 index 00000000..4a34d509 --- /dev/null +++ b/util/dynamic_bloom_test.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#define __STDC_FORMAT_MACROS +#include +#include +#include + +#include "dynamic_bloom.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "util/stop_watch.h" + +DEFINE_int32(bits_per_key, 10, ""); +DEFINE_int32(num_probes, 6, ""); +DEFINE_bool(enable_perf, false, ""); + +namespace rocksdb { + +static Slice Key(uint64_t i, char* buffer) { + memcpy(buffer, &i, sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +class DynamicBloomTest { +}; + +TEST(DynamicBloomTest, EmptyFilter) { + DynamicBloom bloom1(100, 0, 2); + ASSERT_TRUE(!bloom1.MayContain("hello")); + ASSERT_TRUE(!bloom1.MayContain("world")); + + DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + ASSERT_TRUE(!bloom2.MayContain("hello")); + ASSERT_TRUE(!bloom2.MayContain("world")); +} + +TEST(DynamicBloomTest, Small) { + DynamicBloom bloom1(100, 0, 2); + bloom1.Add("hello"); + bloom1.Add("world"); + ASSERT_TRUE(bloom1.MayContain("hello")); + ASSERT_TRUE(bloom1.MayContain("world")); + ASSERT_TRUE(!bloom1.MayContain("x")); + ASSERT_TRUE(!bloom1.MayContain("foo")); + + DynamicBloom bloom2(CACHE_LINE_SIZE * 8 * 2 - 1, 1, 2); + bloom2.Add("hello"); + bloom2.Add("world"); + ASSERT_TRUE(bloom2.MayContain("hello")); + ASSERT_TRUE(bloom2.MayContain("world")); + 
ASSERT_TRUE(!bloom2.MayContain("x")); + ASSERT_TRUE(!bloom2.MayContain("foo")); +} + +static uint32_t NextNum(uint32_t num) { + if (num < 10) { + num += 1; + } else if (num < 100) { + num += 10; + } else if (num < 1000) { + num += 100; + } else { + num += 1000; + } + return num; +} + +TEST(DynamicBloomTest, VaryingLengths) { + char buffer[sizeof(uint64_t)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + uint32_t num_probes = static_cast(FLAGS_num_probes); + + fprintf(stderr, "bits_per_key: %d num_probes: %d\n", + FLAGS_bits_per_key, num_probes); + + for (uint32_t cl_per_block = 0; cl_per_block < num_probes; + ++cl_per_block) { + for (uint32_t num = 1; num <= 10000; num = NextNum(num)) { + uint32_t bloom_bits = 0; + if (cl_per_block == 0) { + bloom_bits = std::max(num * FLAGS_bits_per_key, 64U); + } else { + bloom_bits = std::max(num * FLAGS_bits_per_key, + cl_per_block * CACHE_LINE_SIZE * 8); + } + DynamicBloom bloom(bloom_bits, cl_per_block, num_probes); + for (uint64_t i = 0; i < num; i++) { + bloom.Add(Key(i, buffer)); + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); + } + + // All added keys must match + for (uint64_t i = 0; i < num; i++) { + ASSERT_TRUE(bloom.MayContain(Key(i, buffer))) + << "Num " << num << "; key " << i; + } + + // Check false positive rate + + int result = 0; + for (uint64_t i = 0; i < 10000; i++) { + if (bloom.MayContain(Key(i + 1000000000, buffer))) { + result++; + } + } + double rate = result / 10000.0; + + fprintf(stderr, "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, " + "cl per block = %u\n", rate*100.0, num, bloom_bits, cl_per_block); + + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + ASSERT_LE(mediocre_filters, good_filters/5); + } +} + +TEST(DynamicBloomTest, perf) { + StopWatchNano 
timer(Env::Default()); + uint32_t num_probes = static_cast(FLAGS_num_probes); + + if (!FLAGS_enable_perf) { + return; + } + + for (uint64_t m = 1; m <= 8; ++m) { + const uint64_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8); + + DynamicBloom std_bloom(num_keys * 10, 0, num_probes); + + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + std_bloom.Add(Slice(reinterpret_cast(&i), 8)); + } + + uint64_t elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n", + elapsed / num_keys); + + uint64_t count = 0; + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + if (std_bloom.MayContain(Slice(reinterpret_cast(&i), 8))) { + ++count; + } + } + elapsed = timer.ElapsedNanos(); + fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n", + elapsed / count); + ASSERT_TRUE(count == num_keys); + + for (uint32_t cl_per_block = 1; cl_per_block <= num_probes; + ++cl_per_block) { + DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes); + + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + blocked_bloom.Add(Slice(reinterpret_cast(&i), 8)); + } + + uint64_t elapsed = timer.ElapsedNanos(); + fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n", + cl_per_block, elapsed / num_keys); + + uint64_t count = 0; + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + if (blocked_bloom.MayContain( + Slice(reinterpret_cast(&i), 8))) { + ++count; + } + } + + elapsed = timer.ElapsedNanos(); + fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n", + cl_per_block, elapsed / count); + ASSERT_TRUE(count == num_keys); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + + return rocksdb::test::RunAllTests(); +} diff --git a/util/env.cc b/util/env.cc new file mode 100644 index 00000000..f2ebfcd5 --- /dev/null +++ b/util/env.cc @@ -0,0 +1,262 @@ +// Copyright 
(c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/env.h" + +#include +#include "rocksdb/options.h" +#include "util/arena.h" +#include "util/autovector.h" + +namespace rocksdb { + +Env::~Env() { +} + +SequentialFile::~SequentialFile() { +} + +RandomAccessFile::~RandomAccessFile() { +} + +WritableFile::~WritableFile() { +} + +Logger::~Logger() { +} + +FileLock::~FileLock() { +} + +void LogFlush(Logger *info_log) { + if (info_log) { + info_log->Flush(); + } +} + +void Log(Logger* info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::INFO, format, ap); + va_end(ap); + } +} + +void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, + ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(log_level, format, ap); + va_end(ap); + } +} + +void Debug(Logger* info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::DEBUG, format, ap); + va_end(ap); + } +} + +void Info(Logger* info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::INFO, format, ap); + va_end(ap); + } +} + +void Warn(Logger* info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::WARN, format, ap); + va_end(ap); + } +} +void Error(Logger* info_log, const char* format, ...) 
{ + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::ERROR, format, ap); + va_end(ap); + } +} +void Fatal(Logger* info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::FATAL, format, ap); + va_end(ap); + } +} + +void LogFlush(const shared_ptr& info_log) { + if (info_log) { + info_log->Flush(); + } +} + +void Log(const InfoLogLevel log_level, const shared_ptr& info_log, + const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(log_level, format, ap); + va_end(ap); + } +} + +void Debug(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::DEBUG, format, ap); + va_end(ap); + } +} + +void Info(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::INFO, format, ap); + va_end(ap); + } +} + +void Warn(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::WARN, format, ap); + va_end(ap); + } +} + +void Error(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::ERROR, format, ap); + va_end(ap); + } +} + +void Fatal(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::FATAL, format, ap); + va_end(ap); + } +} + +void Log(const shared_ptr& info_log, const char* format, ...) 
{ + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->Logv(InfoLogLevel::INFO, format, ap); + va_end(ap); + } +} + +static Status DoWriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync) { + unique_ptr file; + EnvOptions soptions; + Status s = env->NewWritableFile(fname, &file, soptions); + if (!s.ok()) { + return s; + } + s = file->Append(data); + if (s.ok() && should_sync) { + s = file->Sync(); + } + if (!s.ok()) { + env->DeleteFile(fname); + } + return s; +} + +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, false); +} + +Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, true); +} + +Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { + EnvOptions soptions; + data->clear(); + unique_ptr file; + Status s = env->NewSequentialFile(fname, &file, soptions); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + char* space = new char[kBufferSize]; + while (true) { + Slice fragment; + s = file->Read(kBufferSize, &fragment, space); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + delete[] space; + return s; +} + +EnvWrapper::~EnvWrapper() { +} + +namespace { // anonymous namespace + +void AssignEnvOptions(EnvOptions* env_options, const Options& options) { + env_options->use_os_buffer = options.allow_os_buffer; + env_options->use_mmap_reads = options.allow_mmap_reads; + env_options->use_mmap_writes = options.allow_mmap_writes; + env_options->set_fd_cloexec = options.is_fd_close_on_exec; + env_options->bytes_per_sync = options.bytes_per_sync; +} + +} + +EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const { + return env_options; +} + +EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) 
const { + return env_options; +} + +EnvOptions::EnvOptions(const Options& options) { + AssignEnvOptions(this, options); +} + +EnvOptions::EnvOptions() { + Options options; + AssignEnvOptions(this, options); +} + + +} // namespace rocksdb diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc new file mode 100644 index 00000000..c724b230 --- /dev/null +++ b/util/env_hdfs.cc @@ -0,0 +1,523 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifdef USE_HDFS +#ifndef ROCKSDB_HDFS_FILE_C +#define ROCKSDB_HDFS_FILE_C + +#include +#include +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "hdfs/hdfs.h" +#include "hdfs/env_hdfs.h" + +// +// This file defines an HDFS environment for rocksdb. It uses the libhdfs +// api to access HDFS. All HDFS files created by one instance of rocksdb +// will reside on the same HDFS cluster. +// + +namespace rocksdb { + +namespace { + +// Log error message +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +// assume that there is one global logger for now. It is not thread-safe, +// but need not be because the logger is initialized at db-open time. +static Logger* mylog = nullptr; + +// Used for reading a file from HDFS. It implements both sequential-read +// access methods as well as random read access methods. 
+class HdfsReadableFile: virtual public SequentialFile, virtual public RandomAccessFile { + private: + hdfsFS fileSys_; + std::string filename_; + hdfsFile hfile_; + + public: + HdfsReadableFile(hdfsFS fileSys, const std::string& fname) + : fileSys_(fileSys), filename_(fname), hfile_(nullptr) { + Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n", + filename_.c_str()); + hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0); + Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n", + filename_.c_str(), hfile_); + } + + virtual ~HdfsReadableFile() { + Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n", + filename_.c_str()); + hdfsCloseFile(fileSys_, hfile_); + Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n", + filename_.c_str()); + hfile_ = nullptr; + } + + bool isValid() { + return hfile_ != nullptr; + } + + // sequential access, read data at current offset in file + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n", + filename_.c_str(), n); + size_t bytes_read = hdfsRead(fileSys_, hfile_, scratch, (tSize)n); + Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str()); + *result = Slice(scratch, bytes_read); + if (bytes_read < n) { + if (feof()) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = IOError(filename_, errno); + } + } + return s; + } + + // random access, read data from specified offset in file + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); + ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset, + (void*)scratch, (tSize)n); + Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); + *result = Slice(scratch, (bytes_read < 0) ? 
0 : bytes_read); + if (bytes_read < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + return s; + } + + virtual Status Skip(uint64_t n) { + Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str()); + // get current offset from file + tOffset current = hdfsTell(fileSys_, hfile_); + if (current < 0) { + return IOError(filename_, errno); + } + // seek to new offset in file + tOffset newoffset = current + n; + int val = hdfsSeek(fileSys_, hfile_, newoffset); + if (val < 0) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + private: + + // returns true if we are at the end of file, false otherwise + bool feof() { + Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str()); + if (hdfsTell(fileSys_, hfile_) == fileSize()) { + return true; + } + return false; + } + + // the current size of the file + tOffset fileSize() { + Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str()); + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str()); + tOffset size = 0L; + if (pFileInfo != nullptr) { + size = pFileInfo->mSize; + hdfsFreeFileInfo(pFileInfo, 1); + } else { + throw rocksdb::HdfsFatalException("fileSize on unknown file " + + filename_); + } + return size; + } +}; + +// Appends to an existing file in HDFS. 
+class HdfsWritableFile: public WritableFile { + private: + hdfsFS fileSys_; + std::string filename_; + hdfsFile hfile_; + + public: + HdfsWritableFile(hdfsFS fileSys, const std::string& fname) + : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) { + Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str()); + hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0); + Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str()); + assert(hfile_ != nullptr); + } + virtual ~HdfsWritableFile() { + if (hfile_ != nullptr) { + Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + hdfsCloseFile(fileSys_, hfile_); + Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + hfile_ = nullptr; + } + } + + // If the file was successfully created, then this returns true. + // Otherwise returns false. + bool isValid() { + return hfile_ != nullptr; + } + + // The name of the file, mostly needed for debug logging. + const std::string& getName() { + return filename_; + } + + virtual Status Append(const Slice& data) { + Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str()); + const char* src = data.data(); + size_t left = data.size(); + size_t ret = hdfsWrite(fileSys_, hfile_, src, left); + Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); + if (ret != left) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + Status s; + Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str()); + if (hdfsFlush(fileSys_, hfile_) == -1) { + return IOError(filename_, errno); + } + if (hdfsSync(fileSys_, hfile_) == -1) { + return IOError(filename_, errno); + } + Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str()); + return Status::OK(); + } + + // This is used by HdfsLogger to write data to the debug log file + virtual Status Append(const char* src, size_t size) { + if 
(hdfsWrite(fileSys_, hfile_, src, size) != (tSize)size) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status Close() { + Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + if (hdfsCloseFile(fileSys_, hfile_) != 0) { + return IOError(filename_, errno); + } + Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + hfile_ = nullptr; + return Status::OK(); + } +}; + +// The object that implements the debug logs to reside in HDFS. +class HdfsLogger : public Logger { + private: + HdfsWritableFile* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + + public: + HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)(), + const InfoLogLevel log_level = InfoLogLevel::ERROR) + : Logger(log_level), file_(f), gettid_(gettid) { + Log(mylog, "[hdfs] HdfsLogger opened %s\n", + file_->getName().c_str()); + } + + virtual ~HdfsLogger() { + Log(mylog, "[hdfs] HdfsLogger closed %s\n", + file_->getName().c_str()); + delete file_; + if (mylog != nullptr && mylog == this) { + mylog = nullptr; + } + } + + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + file_->Append(base, p-base); + file_->Flush(); + if (base != buffer) { + delete[] base; + } + break; + } + } +}; + +} // namespace + +// Finally, the hdfs environment + +// open a file for sequential reading +Status HdfsEnv::NewSequentialFile(const std::string& fname, + SequentialFile** result) { + HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); + if (f == nullptr) { + *result = nullptr; + return IOError(fname, errno); + } + *result = dynamic_cast(f); + return Status::OK(); +} + +// open a file for random reading +Status HdfsEnv::NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); + if (f == nullptr) { + *result = nullptr; + return IOError(fname, errno); + } + *result = dynamic_cast(f); + return Status::OK(); +} + +// create a new file for writing +Status HdfsEnv::NewWritableFile(const std::string& fname, + 
WritableFile** result) { + Status s; + HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname); + if (f == nullptr || !f->isValid()) { + *result = nullptr; + return IOError(fname, errno); + } + *result = dynamic_cast(f); + return Status::OK(); +} + +Status HdfsEnv::NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv"); +} + +virtual Status NewDirectory(const std::string& name, + unique_ptr* result) { + return Status::NotSupported("NewDirectory not yet supported on HdfsEnv"); +} + +bool HdfsEnv::FileExists(const std::string& fname) { + int value = hdfsExists(fileSys_, fname.c_str()); + if (value == 0) { + return true; + } + return false; +} + +Status HdfsEnv::GetChildren(const std::string& path, + std::vector* result) { + int value = hdfsExists(fileSys_, path.c_str()); + switch (value) { + case 0: { + int numEntries = 0; + hdfsFileInfo* pHdfsFileInfo = 0; + pHdfsFileInfo = hdfsListDirectory(fileSys_, path.c_str(), &numEntries); + if (numEntries >= 0) { + for(int i = 0; i < numEntries; i++) { + char* pathname = pHdfsFileInfo[i].mName; + char* filename = rindex(pathname, '/'); + if (filename != nullptr) { + result->push_back(filename+1); + } + } + if (pHdfsFileInfo != nullptr) { + hdfsFreeFileInfo(pHdfsFileInfo, numEntries); + } + } else { + // numEntries < 0 indicates error + Log(mylog, "hdfsListDirectory call failed with error "); + throw HdfsFatalException("hdfsListDirectory call failed negative error.\n"); + } + break; + } + case 1: // directory does not exist, exit + break; + default: // anything else should be an error + Log(mylog, "hdfsListDirectory call failed with error "); + throw HdfsFatalException("hdfsListDirectory call failed with error.\n"); + } + return Status::OK(); +} + +Status HdfsEnv::DeleteFile(const std::string& fname) { + if (hdfsDelete(fileSys_, fname.c_str()) == 0) { + return Status::OK(); + } + return IOError(fname, errno); 
+}; + +Status HdfsEnv::CreateDir(const std::string& name) { + if (hdfsCreateDirectory(fileSys_, name.c_str()) == 0) { + return Status::OK(); + } + return IOError(name, errno); +}; + +Status HdfsEnv::CreateDirIfMissing(const std::string& name) { + const int value = hdfsExists(fileSys_, name.c_str()); + // Not atomic. state might change b/w hdfsExists and CreateDir. + if (value == 0) { + return Status::OK(); + } else { + return CreateDir(name); + } +}; + +Status HdfsEnv::DeleteDir(const std::string& name) { + return DeleteFile(name); +}; + +Status HdfsEnv::GetFileSize(const std::string& fname, uint64_t* size) { + *size = 0L; + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str()); + if (pFileInfo != nullptr) { + *size = pFileInfo->mSize; + hdfsFreeFileInfo(pFileInfo, 1); + return Status::OK(); + } + return IOError(fname, errno); +} + +Status HdfsEnv::GetFileModificationTime(const std::string& fname, + uint64_t* time) { + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, fname.c_str()); + if (pFileInfo != nullptr) { + *time = static_cast(pFileInfo->mLastMod); + hdfsFreeFileInfo(pFileInfo, 1); + return Status::OK(); + } + return IOError(fname, errno); + +} + +// The rename is not atomic. HDFS does not allow a renaming if the +// target already exists. So, we delete the target before attemting the +// rename. 
+Status HdfsEnv::RenameFile(const std::string& src, const std::string& target) { + hdfsDelete(fileSys_, target.c_str()); + if (hdfsRename(fileSys_, src.c_str(), target.c_str()) == 0) { + return Status::OK(); + } + return IOError(src, errno); +} + +Status HdfsEnv::LockFile(const std::string& fname, FileLock** lock) { + // there isn's a very good way to atomically check and create + // a file via libhdfs + *lock = nullptr; + return Status::OK(); +} + +Status HdfsEnv::UnlockFile(FileLock* lock) { + return Status::OK(); +} + +Status HdfsEnv::NewLogger(const std::string& fname, + shared_ptr* result) { + HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname); + if (f == nullptr || !f->isValid()) { + *result = nullptr; + return IOError(fname, errno); + } + HdfsLogger* h = new HdfsLogger(f, &HdfsEnv::gettid); + *result = h; + if (mylog == nullptr) { + // mylog = h; // uncomment this for detailed logging + } + return Status::OK(); +} + +} // namespace rocksdb + +#endif // ROCKSDB_HDFS_FILE_C + +#else // USE_HDFS + +// dummy placeholders used when HDFS is not available +#include "rocksdb/env.h" +#include "hdfs/env_hdfs.h" +namespace rocksdb { + Status HdfsEnv::NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return Status::NotSupported("Not compiled with hdfs support"); + } +} + +#endif diff --git a/util/env_posix.cc b/util/env_posix.cc new file mode 100644 index 00000000..bce9526a --- /dev/null +++ b/util/env_posix.cc @@ -0,0 +1,1643 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef OS_LINUX +#include +#endif +#include +#include +#include +#include +#if defined(OS_LINUX) +#include +#include +#endif +#if defined(LEVELDB_PLATFORM_ANDROID) +#include +#endif +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/posix_logger.h" +#include "util/random.h" +#include + +// Get nano time for mach systems +#ifdef __MACH__ +#include +#include +#endif + +#if !defined(TMPFS_MAGIC) +#define TMPFS_MAGIC 0x01021994 +#endif +#if !defined(XFS_SUPER_MAGIC) +#define XFS_SUPER_MAGIC 0x58465342 +#endif +#if !defined(EXT4_SUPER_MAGIC) +#define EXT4_SUPER_MAGIC 0xEF53 +#endif + +// For non linux platform, the following macros are used only as place +// holder. +#ifndef OS_LINUX +#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */ +#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ +#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ +#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ +#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ +#endif + +// This is only set from db_stress.cc and for testing only. +// If non-zero, kill at various points in source code with probability 1/this +int rocksdb_kill_odds = 0; + +namespace rocksdb { + +namespace { + +// A wrapper for fadvise, if the platform doesn't support fadvise, +// it will simply return Status::NotSupport. +int Fadvise(int fd, off_t offset, size_t len, int advice) { +#ifdef OS_LINUX + return posix_fadvise(fd, offset, len, advice); +#else + return 0; // simply do nothing. 
+#endif +} + +// list of pathnames that are locked +static std::set lockedFiles; +static port::Mutex mutex_lockedFiles; + +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +#ifdef NDEBUG +// empty in release build +#define TEST_KILL_RANDOM(rocksdb_kill_odds) +#else + +// Kill the process with probablity 1/odds for testing. +static void TestKillRandom(int odds, const std::string& srcfile, + int srcline) { + time_t curtime = time(nullptr); + Random r((uint32_t)curtime); + + assert(odds > 0); + bool crash = r.OneIn(odds); + if (crash) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + kill(getpid(), SIGTERM); + } +} + +// To avoid crashing always at some frequently executed codepaths (during +// kill random test), use this factor to reduce odds +#define REDUCE_ODDS 2 +#define REDUCE_ODDS2 4 + +#define TEST_KILL_RANDOM(rocksdb_kill_odds) { \ + if (rocksdb_kill_odds > 0) { \ + TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ + } \ +} + +#endif + +#if defined(OS_LINUX) +namespace { + static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) { + if (max_size < kMaxVarint64Length*3) { + return 0; + } + + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return 0; + } + + long version = 0; + result = ioctl(fd, FS_IOC_GETVERSION, &version); + if (result == -1) { + return 0; + } + uint64_t uversion = (uint64_t)version; + + char* rid = id; + rid = EncodeVarint64(rid, buf.st_dev); + rid = EncodeVarint64(rid, buf.st_ino); + rid = EncodeVarint64(rid, uversion); + assert(rid >= id); + return static_cast(rid-id); + } +} +#endif + +class PosixSequentialFile: public SequentialFile { + private: + std::string filename_; + FILE* file_; + int fd_; + bool use_os_buffer_; + + public: + PosixSequentialFile(const std::string& fname, FILE* f, + const EnvOptions& options) + : filename_(fname), file_(f), fd_(fileno(f)), + 
use_os_buffer_(options.use_os_buffer) { + } + virtual ~PosixSequentialFile() { fclose(file_); } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = 0; + do { + r = fread_unlocked(scratch, 1, n, file_); + } while (r == 0 && ferror(file_) && errno == EINTR); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + } else { + // A partial read with an error: return a non-ok status + s = IOError(filename_, errno); + } + } + if (!use_os_buffer_) { + // we need to fadvise away the entire range of pages because + // we do not want readahead pages to be cached. + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (fseek(file_, n, SEEK_CUR)) { + return IOError(filename_, errno); + } + return Status::OK(); + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } +}; + +// pread() based random-access +class PosixRandomAccessFile: public RandomAccessFile { + private: + std::string filename_; + int fd_; + bool use_os_buffer_; + + public: + PosixRandomAccessFile(const std::string& fname, int fd, + const EnvOptions& options) + : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) { + assert(!options.use_mmap_reads); + } + virtual ~PosixRandomAccessFile() { close(fd_); } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = -1; + do { + r = pread(fd_, scratch, n, static_cast(offset)); + } while (r < 0 && errno == EINTR); + *result = Slice(scratch, (r < 0) ? 
0 : r); + if (r < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + if (!use_os_buffer_) { + // we need to fadvise away the entire range of pages because + // we do not want readahead pages to be cached. + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages + } + return s; + } + +#ifdef OS_LINUX + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(fd_, id, max_size); + } +#endif + + virtual void Hint(AccessPattern pattern) { + switch(pattern) { + case NORMAL: + Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); + break; + case RANDOM: + Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); + break; + case SEQUENTIAL: + Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + break; + case WILLNEED: + Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); + break; + case DONTNEED: + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + break; + default: + assert(false); + break; + } + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } +}; + +// mmap() based random-access +class PosixMmapReadableFile: public RandomAccessFile { + private: + int fd_; + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + // base[0,length-1] contains the mmapped contents of the file. 
+ PosixMmapReadableFile(const int fd, const std::string& fname, + void* base, size_t length, + const EnvOptions& options) + : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { + fd_ = fd_ + 0; // suppress the warning for used variables + assert(options.use_mmap_reads); + assert(options.use_os_buffer); + } + virtual ~PosixMmapReadableFile() { + int ret = munmap(mmapped_region_, length_); + if (ret != 0) { + fprintf(stdout, "failed to munmap %p length %zu \n", + mmapped_region_, length_); + } + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + if (offset + n > length_) { + *result = Slice(); + s = IOError(filename_, EINVAL); + } else { + *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); + } + return s; + } + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } +}; + +// We preallocate up to an extra megabyte and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class PosixMmapFile : public WritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file + // Have we done an munmap of unsynced data? 
+ bool pending_sync_; +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool fallocate_with_keep_size_; +#endif + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { + return ((x + y - 1) / y) * y; + } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + assert((s % page_size_) == 0); + return s; + } + + bool UnmapCurrentRegion() { + bool result = true; + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (base_ != nullptr) { + if (last_sync_ < limit_) { + // Defer syncing this data until next Sync() call, if any + pending_sync_ = true; + } + if (munmap(base_, limit_ - base_) != 0) { + result = false; + } + file_offset_ += limit_ - base_; + base_ = nullptr; + limit_ = nullptr; + last_sync_ = nullptr; + dst_ = nullptr; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1<<20)) { + map_size_ *= 2; + } + } + return result; + } + + Status MapNewRegion() { +#ifdef ROCKSDB_FALLOCATE_PRESENT + assert(base_ == nullptr); + + TEST_KILL_RANDOM(rocksdb_kill_odds); + // we can't fallocate with FALLOC_FL_KEEP_SIZE here + int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); + if (alloc_status != 0) { + // fallback to posix_fallocate + alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + } + if (alloc_status != 0) { + return Status::IOError("Error allocating space to file : " + filename_ + + "Error : " + strerror(alloc_status)); + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, + fd_, file_offset_); + if (ptr == MAP_FAILED) { + return Status::IOError("MMap failed on " + filename_); + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + base_ = reinterpret_cast(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return Status::OK(); +#else + return Status::NotSupported("This platform doesn't support fallocate()"); +#endif + } + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size, + 
const EnvOptions& options) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(nullptr), + limit_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0), + pending_sync_(false) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#endif + assert((page_size & (page_size - 1)) == 0); + assert(options.use_mmap_writes); + } + + + ~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); + PrepareWrite(GetFileSize(), left); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + if (UnmapCurrentRegion()) { + Status s = MapNewRegion(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + } + + size_t n = (left <= avail) ? 
left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + return Status::OK(); + } + + virtual Status Close() { + Status s; + size_t unused = limit_ - dst_; + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + if (!UnmapCurrentRegion()) { + s = IOError(filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError(filename_, errno); + } + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + base_ = nullptr; + limit_ = nullptr; + return s; + } + + virtual Status Flush() { + TEST_KILL_RANDOM(rocksdb_kill_odds); + return Status::OK(); + } + + virtual Status Sync() { + Status s; + + if (pending_sync_) { + // Some unmapped data was not synced + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (fdatasync(fd_) < 0) { + s = IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); + } + + if (dst_ > last_sync_) { + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. + size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + s = IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + + return s; + } + + /** + * Flush data as well as metadata to stable storage. + */ + virtual Status Fsync() { + if (pending_sync_) { + // Some unmapped data was not synced + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + } + // This invocation to Sync will not issue the call to + // fdatasync because pending_sync_ has already been cleared. 
+ return Sync(); + } + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + virtual uint64_t GetFileSize() { + size_t used = dst_ - base_; + return file_offset_ + used; + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } +#endif +}; + +// Use posix write to write data to a file. +class PosixWritableFile : public WritableFile { + private: + const std::string filename_; + int fd_; + size_t cursize_; // current size of cached data in buf_ + size_t capacity_; // max size of buf_ + unique_ptr buf_; // a buffer to cache writes + uint64_t filesize_; + bool pending_sync_; + bool pending_fsync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool fallocate_with_keep_size_; +#endif + + public: + PosixWritableFile(const std::string& fname, int fd, size_t capacity, + const EnvOptions& options) + : filename_(fname), + fd_(fd), + cursize_(0), + capacity_(capacity), + buf_(new char[capacity]), + filesize_(0), + pending_sync_(false), + pending_fsync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#endif + assert(!options.use_mmap_writes); + } + + ~PosixWritableFile() { + if (fd_ >= 0) { + 
PosixWritableFile::Close(); + } + } + + virtual Status Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + + PrepareWrite(GetFileSize(), left); + // if there is no space in the cache, then flush + if (cursize_ + left > capacity_) { + s = Flush(); + if (!s.ok()) { + return s; + } + // Increase the buffer size, but capped at 1MB + if (capacity_ < (1<<20)) { + capacity_ *= 2; + buf_.reset(new char[capacity_]); + } + assert(cursize_ == 0); + } + + // if the write fits into the cache, then write to cache + // otherwise do a write() syscall to write to OS buffers. + if (cursize_ + left <= capacity_) { + memcpy(buf_.get()+cursize_, src, left); + cursize_ += left; + } else { + while (left != 0) { + ssize_t done = write(fd_, src, left); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + + left -= done; + src += done; + } + } + filesize_ += data.size(); + return Status::OK(); + } + + virtual Status Close() { + Status s; + s = Flush(); // flush cache to OS + if (!s.ok()) { + return s; + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + + size_t block_size; + size_t last_allocated_block; + GetPreallocationStatus(&block_size, &last_allocated_block); + if (last_allocated_block > 0) { + // trim the extra space preallocated at the end of the file + int dummy __attribute__((unused)); + dummy = ftruncate(fd_, filesize_); // ignore errors + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + fd_ = -1; + return s; + } + + // write out the cached data to the OS cache + virtual Status Flush() { + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + size_t left = cursize_; + char* src = buf_.get(); + while (left != 0) { + ssize_t done = write(fd_, src, left); + if (done < 0) { + if (errno == EINTR) { + continue; + } 
+ return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + left -= done; + src += done; + } + cursize_ = 0; + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). + if (bytes_per_sync_ && + filesize_ - last_sync_size_ >= bytes_per_sync_) { + RangeSync(last_sync_size_, filesize_ - last_sync_size_); + last_sync_size_ = filesize_; + } + + return Status::OK(); + } + + virtual Status Sync() { + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (pending_sync_ && fdatasync(fd_) < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + return Status::OK(); + } + + virtual Status Fsync() { + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (pending_fsync_ && fsync(fd_) < 0) { + return IOError(filename_, errno); + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_fsync_ = false; + pending_sync_ = false; + return Status::OK(); + } + + virtual uint64_t GetFileSize() { + return filesize_; + } + + virtual Status InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } + + virtual Status RangeSync(off64_t offset, off64_t nbytes) { + if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } + virtual size_t GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(fd_, id, max_size); + } +#endif +}; + +class PosixRandomRWFile : public RandomRWFile { + private: + const std::string filename_; + int fd_; + bool pending_sync_; + bool pending_fsync_; +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool fallocate_with_keep_size_; +#endif + + public: + PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options) + : filename_(fname), + fd_(fd), + pending_sync_(false), + pending_fsync_(false) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#endif + assert(!options.use_mmap_writes && !options.use_mmap_reads); + } + + ~PosixRandomRWFile() { + if (fd_ >= 0) { + Close(); + } + } + + virtual Status Write(uint64_t offset, const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + while (left != 0) { + ssize_t done = pwrite(fd_, src, left, offset); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return IOError(filename_, errno); + } + + left -= done; + src += done; + offset += done; + } + + return Status::OK(); + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = pread(fd_, scratch, n, static_cast(offset)); + *result = Slice(scratch, (r < 0) ? 
0 : r); + if (r < 0) { + s = IOError(filename_, errno); + } + return s; + } + + virtual Status Close() { + Status s = Status::OK(); + if (fd_ >= 0 && close(fd_) < 0) { + s = IOError(filename_, errno); + } + fd_ = -1; + return s; + } + + virtual Status Sync() { + if (pending_sync_ && fdatasync(fd_) < 0) { + return IOError(filename_, errno); + } + pending_sync_ = false; + return Status::OK(); + } + + virtual Status Fsync() { + if (pending_fsync_ && fsync(fd_) < 0) { + return IOError(filename_, errno); + } + pending_fsync_ = false; + pending_sync_ = false; + return Status::OK(); + } + +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } +#endif +}; + +class PosixDirectory : public Directory { + public: + explicit PosixDirectory(int fd) : fd_(fd) {} + ~PosixDirectory() { + close(fd_); + } + + virtual Status Fsync() { + if (fsync(fd_) == -1) { + return IOError("directory", errno); + } + return Status::OK(); + } + + private: + int fd_; +}; + +static int LockOrUnlock(const std::string& fname, int fd, bool lock) { + mutex_lockedFiles.Lock(); + if (lock) { + // If it already exists in the lockedFiles set, then it is already locked, + // and fail this lock attempt. Otherwise, insert it into lockedFiles. + // This check is needed because fcntl() does not detect lock conflict + // if the fcntl is issued by the same thread that earlier acquired + // this lock. + if (lockedFiles.insert(fname).second == false) { + mutex_lockedFiles.Unlock(); + errno = ENOLCK; + return -1; + } + } else { + // If we are unlocking, then verify that we had locked it earlier, + // it should already exist in lockedFiles. Remove it from lockedFiles. 
+ if (lockedFiles.erase(fname) != 1) { + mutex_lockedFiles.Unlock(); + errno = ENOLCK; + return -1; + } + } + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + int value = fcntl(fd, F_SETLK, &f); + if (value == -1 && lock) { + // if there is an error in locking, then remove the pathname from lockedfiles + lockedFiles.erase(fname); + } + mutex_lockedFiles.Unlock(); + return value; +} + +class PosixFileLock : public FileLock { + public: + int fd_; + std::string filename; +}; + + +namespace { +void PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + exit(1); + } +} +} + +class PosixEnv : public Env { + public: + PosixEnv(); + + virtual ~PosixEnv(){ + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + } + + void SetFD_CLOEXEC(int fd, const EnvOptions* options) { + if ((options == nullptr || options->set_fd_cloexec) && fd > 0) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + } + + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + FILE* f = nullptr; + do { + f = fopen(fname.c_str(), "r"); + } while (f == nullptr && errno == EINTR); + if (f == nullptr) { + *result = nullptr; + return IOError(fname, errno); + } else { + int fd = fileno(f); + SetFD_CLOEXEC(fd, &options); + result->reset(new PosixSequentialFile(fname, f, options)); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + Status s; + int fd = open(fname.c_str(), O_RDONLY); + SetFD_CLOEXEC(fd, &options); + if (fd < 0) { + s = IOError(fname, errno); + } else if (options.use_mmap_reads && sizeof(void*) >= 8) { + // Use of mmap for random reads has been removed because 
it + // kills performance when storage is fast. + // Use mmap when virtual address-space is plentiful. + uint64_t size; + s = GetFileSize(fname, &size); + if (s.ok()) { + void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0); + if (base != MAP_FAILED) { + result->reset(new PosixMmapReadableFile(fd, fname, base, + size, options)); + } else { + s = IOError(fname, errno); + } + } + close(fd); + } else { + result->reset(new PosixRandomAccessFile(fname, fd, options)); + } + return s; + } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + Status s; + int fd = -1; + do { + fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError(fname, errno); + } else { + SetFD_CLOEXEC(fd, &options); + if (options.use_mmap_writes) { + if (!checkedDiskForMmap_) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
+ if (!SupportsFastAllocate(fname)) { + forceMmapOff = true; + } + checkedDiskForMmap_ = true; + } + } + if (options.use_mmap_writes && !forceMmapOff) { + result->reset(new PosixMmapFile(fname, fd, page_size_, options)); + } else { + // disable mmap writes + EnvOptions no_mmap_writes_options = options; + no_mmap_writes_options.use_mmap_writes = false; + + result->reset( + new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options) + ); + } + } + return s; + } + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + result->reset(); + // no support for mmap yet + if (options.use_mmap_writes || options.use_mmap_reads) { + return Status::NotSupported("No support for mmap read/write yet"); + } + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644); + if (fd < 0) { + s = IOError(fname, errno); + } else { + SetFD_CLOEXEC(fd, &options); + result->reset(new PosixRandomRWFile(fname, fd, options)); + } + return s; + } + + virtual Status NewDirectory(const std::string& name, + unique_ptr* result) { + result->reset(); + const int fd = open(name.c_str(), 0); + if (fd < 0) { + return IOError(name, errno); + } else { + result->reset(new PosixDirectory(fd)); + } + return Status::OK(); + } + + virtual bool FileExists(const std::string& fname) { + return access(fname.c_str(), F_OK) == 0; + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + result->clear(); + DIR* d = opendir(dir.c_str()); + if (d == nullptr) { + return IOError(dir, errno); + } + struct dirent* entry; + while ((entry = readdir(d)) != nullptr) { + result->push_back(entry->d_name); + } + closedir(d); + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) { + Status result; + if (unlink(fname.c_str()) != 0) { + result = IOError(fname, errno); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + 
result = IOError(name, errno); + } + return result; + }; + + virtual Status CreateDirIfMissing(const std::string& name) { + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + if (errno != EEXIST) { + result = IOError(name, errno); + } else if (!DirExists(name)) { // Check that name is actually a + // directory. + // Message is taken from mkdir + result = Status::IOError("`"+name+"' exists but is not a directory"); + } + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) { + Status result; + if (rmdir(name.c_str()) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) { + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + *size = 0; + s = IOError(fname, errno); + } else { + *size = sbuf.st_size; + } + return s; + } + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + struct stat s; + if (stat(fname.c_str(), &s) !=0) { + return IOError(fname, errno); + } + *file_mtime = static_cast(s.st_mtime); + return Status::OK(); + } + virtual Status RenameFile(const std::string& src, const std::string& target) { + Status result; + if (rename(src.c_str(), target.c_str()) != 0) { + result = IOError(src, errno); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = nullptr; + Status result; + int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + result = IOError(fname, errno); + } else if (LockOrUnlock(fname, fd, true) == -1) { + result = IOError("lock " + fname, errno); + close(fd); + } else { + SetFD_CLOEXEC(fd, nullptr); + PosixFileLock* my_lock = new PosixFileLock; + my_lock->fd_ = fd; + my_lock->filename = fname; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) { + PosixFileLock* my_lock = reinterpret_cast(lock); + Status result; + if (LockOrUnlock(my_lock->filename, my_lock->fd_, 
false) == -1) { + result = IOError("unlock", errno); + } + close(my_lock->fd_); + delete my_lock; + return result; + } + + virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW); + + virtual void StartThread(void (*function)(void* arg), void* arg); + + virtual void WaitForJoin(); + + virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; + + virtual Status GetTestDirectory(std::string* result) { + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + *result = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid())); + *result = buf; + } + // Directory may already exist + CreateDir(*result); + return Status::OK(); + } + + static uint64_t gettid() { + pthread_t tid = pthread_self(); + uint64_t thread_id = 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); + return thread_id; + } + + virtual Status NewLogger(const std::string& fname, + shared_ptr* result) { + FILE* f = fopen(fname.c_str(), "w"); + if (f == nullptr) { + result->reset(); + return IOError(fname, errno); + } else { + int fd = fileno(f); + SetFD_CLOEXEC(fd, nullptr); + result->reset(new PosixLogger(f, &PosixEnv::gettid, this)); + return Status::OK(); + } + } + + virtual uint64_t NowMicros() { + struct timeval tv; + // TODO(kailiu) MAC DON'T HAVE THIS + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + virtual uint64_t NowNanos() { +#ifdef OS_LINUX + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif __MACH__ + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); +#endif + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; + } + + virtual void SleepForMicroseconds(int micros) { + usleep(micros); + } + + virtual Status 
GetHostName(char* name, uint64_t len) { + int ret = gethostname(name, len); + if (ret < 0) { + if (errno == EFAULT || errno == EINVAL) + return Status::InvalidArgument(strerror(errno)); + else + return IOError("GetHostName", errno); + } + return Status::OK(); + } + + virtual Status GetCurrentTime(int64_t* unix_time) { + time_t ret = time(nullptr); + if (ret == (time_t) -1) { + return IOError("GetCurrentTime", errno); + } + *unix_time = (int64_t) ret; + return Status::OK(); + } + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + if (db_path.find('/') == 0) { + *output_path = db_path; + return Status::OK(); + } + + char the_path[256]; + char* ret = getcwd(the_path, 256); + if (ret == nullptr) { + return Status::IOError(strerror(errno)); + } + + *output_path = ret; + return Status::OK(); + } + + // Allow increasing the number of worker threads. + virtual void SetBackgroundThreads(int num, Priority pri) { + assert(pri >= Priority::LOW && pri <= Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); + } + + virtual std::string TimeToString(uint64_t secondsSince1970) { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + dummy.resize(maxsize); + char* p = &dummy[0]; + localtime_r(&seconds, &t); + snprintf(p, maxsize, + "%04d/%02d/%02d-%02d:%02d:%02d ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec); + return dummy; + } + + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it + // breaks TransactionLogIteratorStallAtLastRecord unit test. 
Fix the unit + // test and make this false + optimized.fallocate_with_keep_size = true; + return optimized; + } + + EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + optimized.fallocate_with_keep_size = true; + return optimized; + } + + private: + bool checkedDiskForMmap_; + bool forceMmapOff; // do we override Env options? + + + // Returns true iff the named directory exists and is a directory. + virtual bool DirExists(const std::string& dname) { + struct stat statbuf; + if (stat(dname.c_str(), &statbuf) == 0) { + return S_ISDIR(statbuf.st_mode); + } + return false; // stat() failed return false + } + + bool SupportsFastAllocate(const std::string& path) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + struct statfs s; + if (statfs(path.c_str(), &s)){ + return false; + } + switch (s.f_type) { + case EXT4_SUPER_MAGIC: + return true; + case XFS_SUPER_MAGIC: + return true; + case TMPFS_MAGIC: + return true; + default: + return false; + } +#else + return false; +#endif + } + + size_t page_size_; + + + class ThreadPool { + public: + ThreadPool() + : total_threads_limit_(1), + bgthreads_(0), + queue_(), + queue_len_(0), + exit_all_threads_(false) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr)); + } + + ~ThreadPool() { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + assert(!exit_all_threads_); + exit_all_threads_ = true; + PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_)); + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + for (const auto tid : bgthreads_) { + pthread_join(tid, nullptr); + } + } + + void BGThread() { + while (true) { + // Wait until there is an item that is ready to run + PthreadCall("lock", pthread_mutex_lock(&mu_)); + while (queue_.empty() && !exit_all_threads_) { + PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_)); + } + if (exit_all_threads_) { // 
mechanism to let BG threads exit safely + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + break; + } + void (*function)(void*) = queue_.front().function; + void* arg = queue_.front().arg; + queue_.pop_front(); + queue_len_.store(queue_.size(), std::memory_order_relaxed); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + (*function)(arg); + } + } + + static void* BGThreadWrapper(void* arg) { + reinterpret_cast(arg)->BGThread(); + return nullptr; + } + + void SetBackgroundThreads(int num) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + if (num > total_threads_limit_) { + total_threads_limit_ = num; + } + assert(total_threads_limit_ > 0); + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + } + + void Schedule(void (*function)(void*), void* arg) { + PthreadCall("lock", pthread_mutex_lock(&mu_)); + + if (exit_all_threads_) { + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + return; + } + // Start background thread if necessary + while ((int)bgthreads_.size() < total_threads_limit_) { + pthread_t t; + PthreadCall( + "create thread", + pthread_create(&t, + nullptr, + &ThreadPool::BGThreadWrapper, + this)); + + // Set the thread name to aid debugging +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + char name_buf[16]; + snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size()); + name_buf[sizeof name_buf - 1] = '\0'; + pthread_setname_np(t, name_buf); +#endif +#endif + + bgthreads_.push_back(t); + } + + // Add to priority queue + queue_.push_back(BGItem()); + queue_.back().function = function; + queue_.back().arg = arg; + queue_len_.store(queue_.size(), std::memory_order_relaxed); + + // always wake up at least one waiting thread. 
+ PthreadCall("signal", pthread_cond_signal(&bgsignal_)); + + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + } + + unsigned int GetQueueLen() const { + return queue_len_.load(std::memory_order_relaxed); + } + + private: + // Entry per Schedule() call + struct BGItem { void* arg; void (*function)(void*); }; + typedef std::deque BGQueue; + + pthread_mutex_t mu_; + pthread_cond_t bgsignal_; + int total_threads_limit_; + std::vector bgthreads_; + BGQueue queue_; + std::atomic_uint queue_len_; // Queue length. Used for stats reporting + bool exit_all_threads_; + }; + + std::vector thread_pools_; + + pthread_mutex_t mu_; + std::vector threads_to_join_; + +}; + +PosixEnv::PosixEnv() : checkedDiskForMmap_(false), + forceMmapOff(false), + page_size_(getpagesize()), + thread_pools_(Priority::TOTAL) { + PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); +} + +void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) { + assert(pri >= Priority::LOW && pri <= Priority::HIGH); + thread_pools_[pri].Schedule(function, arg); +} + +unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const { + assert(pri >= Priority::LOW && pri <= Priority::HIGH); + return thread_pools_[pri].GetQueueLen(); +} + +namespace { +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; +} +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast(arg); + state->user_function(state->arg); + delete state; + return nullptr; +} + +void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + PthreadCall("start thread", + pthread_create(&t, nullptr, &StartThreadWrapper, state)); + PthreadCall("lock", pthread_mutex_lock(&mu_)); + threads_to_join_.push_back(t); + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void PosixEnv::WaitForJoin() { + for (const auto tid : threads_to_join_) { + 
pthread_join(tid, nullptr); + } + threads_to_join_.clear(); +} + +} // namespace + +std::string Env::GenerateUniqueId() { + std::string uuid_file = "/proc/sys/kernel/random/uuid"; + if (FileExists(uuid_file)) { + std::string uuid; + Status s = ReadFileToString(this, uuid_file, &uuid); + if (s.ok()) { + return uuid; + } + } + // Could not read uuid_file - generate uuid using "nanos-random" + Random64 r(time(nullptr)); + uint64_t random_uuid_portion = + r.Uniform(std::numeric_limits::max()); + uint64_t nanos_uuid_portion = NowNanos(); + char uuid2[200]; + snprintf(uuid2, + 200, + "%lx-%lx", + (unsigned long)nanos_uuid_portion, + (unsigned long)random_uuid_portion); + return uuid2; +} + +Env* Env::Default() { + static PosixEnv default_env; + return &default_env; +} + +} // namespace rocksdb diff --git a/util/env_test.cc b/util/env_test.cc new file mode 100644 index 00000000..0a83037c --- /dev/null +++ b/util/env_test.cc @@ -0,0 +1,545 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + +#include +#include + +#ifdef OS_LINUX +#include +#include +#endif + +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/log_buffer.h" +#include "util/mutexlock.h" +#include "util/testharness.h" + +namespace rocksdb { + +static const int kDelayMicros = 100000; + +class EnvPosixTest { + private: + port::Mutex mu_; + std::string events_; + + public: + Env* env_; + EnvPosixTest() : env_(Env::Default()) { } +}; + +static void SetBool(void* ptr) { + reinterpret_cast(ptr)->NoBarrier_Store(ptr); +} + +TEST(EnvPosixTest, RunImmediately) { + port::AtomicPointer called (nullptr); + env_->Schedule(&SetBool, &called); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(called.NoBarrier_Load() != nullptr); +} + +TEST(EnvPosixTest, RunMany) { + port::AtomicPointer last_id (nullptr); + + struct CB { + port::AtomicPointer* last_id_ptr; // Pointer to shared slot + uintptr_t id; // Order# for the execution of this callback + + CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { } + + static void Run(void* v) { + CB* cb = reinterpret_cast(v); + void* cur = cb->last_id_ptr->NoBarrier_Load(); + ASSERT_EQ(cb->id-1, reinterpret_cast(cur)); + cb->last_id_ptr->Release_Store(reinterpret_cast(cb->id)); + } + }; + + // Schedule in different order than start time + CB cb1(&last_id, 1); + CB cb2(&last_id, 2); + CB cb3(&last_id, 3); + CB cb4(&last_id, 4); + env_->Schedule(&CB::Run, &cb1); + env_->Schedule(&CB::Run, &cb2); + env_->Schedule(&CB::Run, &cb3); + env_->Schedule(&CB::Run, &cb4); + + Env::Default()->SleepForMicroseconds(kDelayMicros); + void* cur = last_id.Acquire_Load(); + ASSERT_EQ(4U, reinterpret_cast(cur)); +} + +struct State { + port::Mutex mu; + int val; + int num_running; +}; + +static void ThreadBody(void* arg) { + State* s = reinterpret_cast(arg); + s->mu.Lock(); + s->val += 1; + s->num_running -= 1; + s->mu.Unlock(); +} + +TEST(EnvPosixTest, StartThread) { + State state; + state.val = 0; + 
state.num_running = 3; + for (int i = 0; i < 3; i++) { + env_->StartThread(&ThreadBody, &state); + } + while (true) { + state.mu.Lock(); + int num = state.num_running; + state.mu.Unlock(); + if (num == 0) { + break; + } + Env::Default()->SleepForMicroseconds(kDelayMicros); + } + ASSERT_EQ(state.val, 3); +} + +TEST(EnvPosixTest, TwoPools) { + + class CB { + public: + CB(const std::string& pool_name, int pool_size) + : mu_(), + num_running_(0), + num_finished_(0), + pool_size_(pool_size), + pool_name_(pool_name) { } + + static void Run(void* v) { + CB* cb = reinterpret_cast(v); + cb->Run(); + } + + void Run() { + { + MutexLock l(&mu_); + num_running_++; + std::cout << "Pool " << pool_name_ << ": " + << num_running_ << " running threads.\n"; + // make sure we don't have more than pool_size_ jobs running. + ASSERT_LE(num_running_, pool_size_); + } + + // sleep for 1 sec + Env::Default()->SleepForMicroseconds(1000000); + + { + MutexLock l(&mu_); + num_running_--; + num_finished_++; + } + } + + int NumFinished() { + MutexLock l(&mu_); + return num_finished_; + } + + private: + port::Mutex mu_; + int num_running_; + int num_finished_; + int pool_size_; + std::string pool_name_; + }; + + const int kLowPoolSize = 2; + const int kHighPoolSize = 4; + const int kJobs = 8; + + CB low_pool_job("low", kLowPoolSize); + CB high_pool_job("high", kHighPoolSize); + + env_->SetBackgroundThreads(kLowPoolSize); + env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH); + + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // schedule same number of jobs in each pool + for (int i = 0; i < kJobs; i++) { + env_->Schedule(&CB::Run, &low_pool_job); + env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH); + } + // Wait a short while for the jobs to be dispatched. 
+ Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen()); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ((unsigned int)(kJobs - kHighPoolSize), + env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // wait for all jobs to finish + while (low_pool_job.NumFinished() < kJobs || + high_pool_job.NumFinished() < kJobs) { + env_->SleepForMicroseconds(kDelayMicros); + } + + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); +} + +bool IsSingleVarint(const std::string& s) { + Slice slice(s); + + uint64_t v; + if (!GetVarint64(&slice, &v)) { + return false; + } + + return slice.size() == 0; +} + +#ifdef OS_LINUX +// To make sure the Env::GetUniqueId() related tests work correctly, The files +// should be stored in regular storage like "hard disk" or "flash device". +// Otherwise we cannot get the correct id. +// +// The following function act as the replacement of test::TmpDir() that may be +// customized by user to be on a storage that doesn't work with GetUniqueId(). +// +// TODO(kailiu) This function still assumes /tmp/ reside in regular +// storage system. +bool IsUniqueIDValid(const std::string& s) { + return !s.empty() && !IsSingleVarint(s); +} + +const size_t MAX_ID_SIZE = 100; +char temp_id[MAX_ID_SIZE]; + +std::string GetOnDiskTestDir() { + char base[100]; + snprintf(base, sizeof(base), "/tmp/rocksdbtest-%d", + static_cast(geteuid())); + // Directory may already exist + Env::Default()->CreateDirIfMissing(base); + + return base; +} + +// Only works in linux platforms +TEST(EnvPosixTest, RandomAccessUniqueID) { + // Create file. 
+ const EnvOptions soptions; + std::string fname = GetOnDiskTestDir() + "/" + "testfile"; + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + + unique_ptr file; + + // Get Unique ID + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id1(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id1)); + + // Get Unique ID again + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id2(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id2)); + + // Get Unique ID again after waiting some time. + env_->SleepForMicroseconds(1000000); + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + std::string unique_id3(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id3)); + + // Check IDs are the same. 
+ ASSERT_EQ(unique_id1, unique_id2); + ASSERT_EQ(unique_id2, unique_id3); + + // Delete the file + env_->DeleteFile(fname); +} + +// only works in linux platforms +#ifdef ROCKSDB_FALLOCATE_PRESENT +TEST(EnvPosixTest, AllocateTest) { + std::string fname = GetOnDiskTestDir() + "/preallocate_testfile"; + EnvOptions soptions; + soptions.use_mmap_writes = false; + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + + // allocate 100 MB + size_t kPreallocateSize = 100 * 1024 * 1024; + size_t kBlockSize = 512; + size_t kPageSize = 4096; + std::string data = "test"; + wfile->SetPreallocationBlockSize(kPreallocateSize); + ASSERT_OK(wfile->Append(Slice(data))); + ASSERT_OK(wfile->Flush()); + + struct stat f_stat; + stat(fname.c_str(), &f_stat); + ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); + // verify that blocks are preallocated + ASSERT_EQ((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks); + + // close the file, should deallocate the blocks + wfile.reset(); + + stat(fname.c_str(), &f_stat); + ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); + // verify that preallocated blocks were deallocated on file close + size_t data_blocks_pages = ((data.size() + kPageSize - 1) / kPageSize); + ASSERT_EQ((unsigned int)(data_blocks_pages * kPageSize / kBlockSize), f_stat.st_blocks); +} +#endif + +// Returns true if any of the strings in ss are the prefix of another string. +bool HasPrefix(const std::unordered_set& ss) { + for (const std::string& s: ss) { + if (s.empty()) { + return true; + } + for (size_t i = 1; i < s.size(); ++i) { + if (ss.count(s.substr(0, i)) != 0) { + return true; + } + } + } + return false; +} + +// Only works in linux platforms +TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) { + // Check whether a bunch of concurrently existing files have unique IDs. 
+ const EnvOptions soptions; + + // Create the files + std::vector fnames; + for (int i = 0; i < 1000; ++i) { + fnames.push_back(GetOnDiskTestDir() + "/" + "testfile" + std::to_string(i)); + + // Create file. + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions)); + } + + // Collect and check whether the IDs are unique. + std::unordered_set ids; + for (const std::string fname: fnames) { + unique_ptr file; + std::string unique_id; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + unique_id = std::string(temp_id, id_size); + ASSERT_TRUE(IsUniqueIDValid(unique_id)); + + ASSERT_TRUE(ids.count(unique_id) == 0); + ids.insert(unique_id); + } + + // Delete the files + for (const std::string fname: fnames) { + ASSERT_OK(env_->DeleteFile(fname)); + } + + ASSERT_TRUE(!HasPrefix(ids)); +} + +// Only works in linux platforms +TEST(EnvPosixTest, RandomAccessUniqueIDDeletes) { + const EnvOptions soptions; + + std::string fname = GetOnDiskTestDir() + "/" + "testfile"; + + // Check that after file is deleted we don't get same ID again in a new file. + std::unordered_set ids; + for (int i = 0; i < 1000; ++i) { + // Create file. 
+ { + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + } + + // Get Unique ID + std::string unique_id; + { + unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); + ASSERT_TRUE(id_size > 0); + unique_id = std::string(temp_id, id_size); + } + + ASSERT_TRUE(IsUniqueIDValid(unique_id)); + ASSERT_TRUE(ids.count(unique_id) == 0); + ids.insert(unique_id); + + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); + } + + ASSERT_TRUE(!HasPrefix(ids)); +} + +// Only works in linux platforms +TEST(EnvPosixTest, InvalidateCache) { + const EnvOptions soptions; + std::string fname = test::TmpDir() + "/" + "testfile"; + + // Create file. + { + unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + ASSERT_OK(wfile.get()->Append(Slice("Hello world"))); + ASSERT_OK(wfile.get()->InvalidateCache(0, 0)); + ASSERT_OK(wfile.get()->Close()); + } + + // Random Read + { + unique_ptr file; + char scratch[100]; + Slice result; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file.get()->Read(0, 11, &result, scratch)); + ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0); + ASSERT_OK(file.get()->InvalidateCache(0, 11)); + ASSERT_OK(file.get()->InvalidateCache(0, 0)); + } + + // Sequential Read + { + unique_ptr file; + char scratch[100]; + Slice result; + ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions)); + ASSERT_OK(file.get()->Read(11, &result, scratch)); + ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0); + ASSERT_OK(file.get()->InvalidateCache(0, 11)); + ASSERT_OK(file.get()->InvalidateCache(0, 0)); + } + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); +} +#endif + +TEST(EnvPosixTest, PosixRandomRWFileTest) { + EnvOptions soptions; + soptions.use_mmap_writes = soptions.use_mmap_reads = false; + std::string fname = test::TmpDir() + "/" + "testfile"; + + unique_ptr file; + 
ASSERT_OK(env_->NewRandomRWFile(fname, &file, soptions)); + // If you run the unit test on tmpfs, then tmpfs might not + // support fallocate. It is still better to trigger that + // code-path instead of eliminating it completely. + file.get()->Allocate(0, 10*1024*1024); + ASSERT_OK(file.get()->Write(100, Slice("Hello world"))); + ASSERT_OK(file.get()->Write(105, Slice("Hello world"))); + ASSERT_OK(file.get()->Sync()); + ASSERT_OK(file.get()->Fsync()); + char scratch[100]; + Slice result; + ASSERT_OK(file.get()->Read(100, 16, &result, scratch)); + ASSERT_EQ(result.compare("HelloHello world"), 0); + ASSERT_OK(file.get()->Close()); +} + +class TestLogger : public Logger { + public: + virtual void Logv(const char* format, va_list ap) override { + log_count++; + + char new_format[550]; + std::fill_n(new_format, sizeof(new_format), '2'); + { + va_list backup_ap; + va_copy(backup_ap, ap); + int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap); + // 48 bytes for extra information + bytes allocated + + if (new_format[0] == '[') { + // "[DEBUG] " + ASSERT_TRUE(n <= 56 + (512 - static_cast(sizeof(struct timeval)))); + } else { + ASSERT_TRUE(n <= 48 + (512 - static_cast(sizeof(struct timeval)))); + } + va_end(backup_ap); + } + + for (size_t i = 0; i < sizeof(new_format); i++) { + if (new_format[i] == 'x') { + char_x_count++; + } else if (new_format[i] == '\0') { + char_0_count++; + } + } + } + int log_count; + int char_x_count; + int char_0_count; +}; + +TEST(EnvPosixTest, LogBufferTest) { + TestLogger test_logger; + test_logger.SetInfoLogLevel(INFO); + test_logger.log_count = 0; + test_logger.char_x_count = 0; + test_logger.char_0_count = 0; + LogBuffer log_buffer(INFO, &test_logger); + LogBuffer log_buffer_debug(DEBUG, &test_logger); + + char bytes200[200]; + std::fill_n(bytes200, sizeof(bytes200), '1'); + bytes200[sizeof(bytes200) - 1] = '\0'; + char bytes600[600]; + std::fill_n(bytes600, sizeof(bytes600), '1'); + bytes600[sizeof(bytes600) - 1] = '\0'; 
+ char bytes9000[9000]; + std::fill_n(bytes9000, sizeof(bytes9000), '1'); + bytes9000[sizeof(bytes9000) - 1] = '\0'; + + LogToBuffer(&log_buffer, "x%sx", bytes200); + LogToBuffer(&log_buffer, "x%sx", bytes600); + LogToBuffer(&log_buffer, "x%sx%sx%sx", bytes200, bytes200, bytes200); + LogToBuffer(&log_buffer, "x%sx%sx", bytes200, bytes600); + LogToBuffer(&log_buffer, "x%sx%sx", bytes600, bytes9000); + + LogToBuffer(&log_buffer_debug, "x%sx", bytes200); + test_logger.SetInfoLogLevel(DEBUG); + LogToBuffer(&log_buffer_debug, "x%sx%sx%sx", bytes600, bytes9000, bytes200); + + ASSERT_EQ(0, test_logger.log_count); + log_buffer.FlushBufferToLog(); + log_buffer_debug.FlushBufferToLog(); + ASSERT_EQ(6, test_logger.log_count); + ASSERT_EQ(6, test_logger.char_0_count); + ASSERT_EQ(10, test_logger.char_x_count); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/filelock_test.cc b/util/filelock_test.cc new file mode 100644 index 00000000..a9e30a5d --- /dev/null +++ b/util/filelock_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "rocksdb/status.h" +#include "rocksdb/env.h" + +#include +#include "util/coding.h" +#include "util/testharness.h" + +namespace rocksdb { + +class LockTest { + public: + static LockTest* current_; + std::string file_; + rocksdb::Env* env_; + + LockTest() : file_(test::TmpDir() + "/db_testlock_file"), + env_(rocksdb::Env::Default()) { + current_ = this; + } + + ~LockTest() { + } + + Status LockFile(FileLock** db_lock) { + return env_->LockFile(file_, db_lock); + } + + Status UnlockFile(FileLock* db_lock) { + return env_->UnlockFile(db_lock); + } +}; +LockTest* LockTest::current_; + +TEST(LockTest, LockBySameThread) { + FileLock* lock1; + FileLock* lock2; + + // acquire a lock on a file + ASSERT_OK(LockFile(&lock1)); + + // re-acquire the lock on the same file. This should fail. + ASSERT_TRUE(LockFile(&lock2).IsIOError()); + + // release the lock + ASSERT_OK(UnlockFile(lock1)); + +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/filter_policy.cc b/util/filter_policy.cc new file mode 100644 index 00000000..e950b75f --- /dev/null +++ b/util/filter_policy.cc @@ -0,0 +1,16 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +FilterPolicy::~FilterPolicy() { } + +} // namespace rocksdb diff --git a/util/hash.cc b/util/hash.cc new file mode 100644 index 00000000..e38c186c --- /dev/null +++ b/util/hash.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // Similar to murmur hash + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = seed ^ (n * m); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + case 3: + h += data[2] << 16; + // fall through + case 2: + h += data[1] << 8; + // fall through + case 1: + h += data[0]; + h *= m; + h ^= (h >> r); + break; + } + return h; +} + +} // namespace rocksdb diff --git a/util/hash.h b/util/hash.h new file mode 100644 index 00000000..c9eb659a --- /dev/null +++ b/util/hash.h @@ -0,0 +1,20 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// Simple hash function used for internal data structures + +#pragma once +#include +#include + +namespace rocksdb { + +extern uint32_t Hash(const char* data, size_t n, uint32_t seed); + +} diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc new file mode 100644 index 00000000..441f5c99 --- /dev/null +++ b/util/hash_linklist_rep.cc @@ -0,0 +1,486 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "util/hash_linklist_rep.h" + +#include "rocksdb/memtablerep.h" +#include "util/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "port/port.h" +#include "port/atomic_pointer.h" +#include "util/murmurhash.h" +#include "db/memtable.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { + +typedef const char* Key; + +struct Node { + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + Node* Next() { + // Use an 'acquire load' so that we observe a fully initialized + // version of the returned Node. + return reinterpret_cast(next_.Acquire_Load()); + } + void SetNext(Node* x) { + // Use a 'release store' so that anybody who reads through this + // pointer observes a fully initialized version of the inserted node. + next_.Release_Store(x); + } + // No-barrier variants that can be safely used in a few locations. 
+ Node* NoBarrier_Next() { + return reinterpret_cast(next_.NoBarrier_Load()); + } + + void NoBarrier_SetNext(Node* x) { + next_.NoBarrier_Store(x); + } + + private: + port::AtomicPointer next_; + public: + char key[0]; +}; + +class HashLinkListRep : public MemTableRep { + public: + HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size); + + virtual KeyHandle Allocate(const size_t len, char** buf) override; + + virtual void Insert(KeyHandle handle) override; + + virtual bool Contains(const char* key) const override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override; + + virtual ~HashLinkListRep(); + + virtual MemTableRep::Iterator* GetIterator() override; + + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; + + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) + override; + + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + + private: + friend class DynamicIterator; + typedef SkipList FullList; + + size_t bucket_size_; + + // Maps slices (which are transformed user keys) to buckets of keys sharing + // the same transform. + port::AtomicPointer* buckets_; + + // The user-supplied transform whose domain is the user keys. 
+ const SliceTransform* transform_; + + const MemTableRep::KeyComparator& compare_; + + bool BucketContains(Node* head, const Slice& key) const; + + Slice GetPrefix(const Slice& internal_key) const { + return transform_->Transform(ExtractUserKey(internal_key)); + } + + size_t GetHash(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + } + + Node* GetBucket(size_t i) const { + return static_cast(buckets_[i].Acquire_Load()); + } + + Node* GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + + bool Equal(const Slice& a, const Key& b) const { + return (compare_(b, a) == 0); + } + + + bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + + bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, internal_key) < 0); + } + + bool KeyIsAfterNode(const Key& key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) < 0); + } + + + Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; + + class FullListIterator : public MemTableRep::Iterator { + public: + explicit FullListIterator(FullList* list, Arena* arena) + : iter_(list), full_list_(list), arena_(arena) {} + + virtual ~FullListIterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. 
+ // REQUIRES: Valid() + virtual void Prev() { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + iter_.SeekToFirst(); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + iter_.SeekToLast(); + } + private: + FullList::Iterator iter_; + // To destruct with the iterator. + std::unique_ptr full_list_; + std::unique_ptr arena_; + std::string tmp_; // For passing to EncodeKey + }; + + class Iterator : public MemTableRep::Iterator { + public: + explicit Iterator(const HashLinkListRep* const hash_link_list_rep, + Node* head) : + hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) { + } + + virtual ~Iterator() { + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return node_ != nullptr; + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return node_->key; + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + node_ = node_->Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + // Prefix iterator does not support total order. 
+ // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, + internal_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + protected: + void Reset(Node* head) { + head_ = head; + node_ = nullptr; + } + private: + friend class HashLinkListRep; + const HashLinkListRep* const hash_link_list_rep_; + Node* head_; + Node* node_; + std::string tmp_; // For passing to EncodeKey + + virtual void SeekToHead() { + node_ = head_; + } + }; + + class DynamicIterator : public HashLinkListRep::Iterator { + public: + explicit DynamicIterator(HashLinkListRep& memtable_rep) + : HashLinkListRep::Iterator(&memtable_rep, nullptr), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.GetPrefix(k); + Reset(memtable_rep_.GetBucket(transformed)); + HashLinkListRep::Iterator::Seek(k, memtable_key); + } + + private: + // the underlying memtable + const HashLinkListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. 
+ public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const Slice& user_key, const char* memtable_key) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; +}; + +HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, + size_t bucket_size) + : MemTableRep(arena), + bucket_size_(bucket_size), + transform_(transform), + compare_(compare) { + char* mem = arena_->AllocateAligned( + sizeof(port::AtomicPointer) * bucket_size); + + buckets_ = new (mem) port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +HashLinkListRep::~HashLinkListRep() { +} + +KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { + char* mem = arena_->AllocateAligned(sizeof(Node) + len); + Node* x = new (mem) Node(); + *buf = x->key; + return static_cast(x); +} + +void HashLinkListRep::Insert(KeyHandle handle) { + Node* x = static_cast(handle); + assert(!Contains(x->key)); + Slice internal_key = GetLengthPrefixedSlice(x->key); + auto transformed = GetPrefix(internal_key); + auto& bucket = buckets_[GetHash(transformed)]; + Node* head = static_cast(bucket.Acquire_Load()); + + if (!head) { + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(nullptr); + bucket.Release_Store(static_cast(x)); + return; + } + + Node* cur = head; + Node* prev = nullptr; + while (true) { + if (cur == nullptr) { + break; + } + Node* next = cur->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. 
+ assert((cur == head) || (next == nullptr) || + KeyIsAfterNode(next->key, cur)); + if (KeyIsAfterNode(internal_key, cur)) { + // Keep searching in this list + prev = cur; + cur = next; + } else { + break; + } + } + + // Our data structure does not allow duplicate insertion + assert(cur == nullptr || !Equal(x->key, cur->key)); + + // NoBarrier_SetNext() suffices since we will add a barrier when + // we publish a pointer to "x" in prev[i]. + x->NoBarrier_SetNext(cur); + + if (prev) { + prev->SetNext(x); + } else { + bucket.Release_Store(static_cast(x)); + } +} + +bool HashLinkListRep::Contains(const char* key) const { + Slice internal_key = GetLengthPrefixedSlice(key); + + auto transformed = GetPrefix(internal_key); + auto bucket = GetBucket(transformed); + if (bucket == nullptr) { + return false; + } + return BucketContains(bucket, internal_key); +} + +size_t HashLinkListRep::ApproximateMemoryUsage() { + // Memory is always allocated from the arena. + return 0; +} + +void HashLinkListRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + auto transformed = transform_->Transform(k.user_key()); + auto bucket = GetBucket(transformed); + if (bucket != nullptr) { + Iterator iter(this, bucket); + for (iter.Seek(k.internal_key(), nullptr); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } +} + +MemTableRep::Iterator* HashLinkListRep::GetIterator() { + // allocate a new arena of similar size to the one currently in use + Arena* new_arena = new Arena(arena_->BlockSize()); + auto list = new FullList(compare_, new_arena); + for (size_t i = 0; i < bucket_size_; ++i) { + auto bucket = GetBucket(i); + if (bucket != nullptr) { + Iterator itr(this, bucket); + for (itr.SeekToHead(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + } + } + } + return new FullListIterator(list, new_arena); +} + +MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator( + const Slice& prefix) { + auto 
bucket = GetBucket(prefix); + if (bucket == nullptr) { + return new EmptyIterator(); + } + return new Iterator(this, bucket); +} + +MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) { + return GetPrefixIterator(transform_->Transform(slice)); +} + +MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() { + return new DynamicIterator(*this); +} + +bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const { + Node* x = FindGreaterOrEqualInBucket(head, user_key); + return (x != nullptr && Equal(user_key, x->key)); +} + +Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, + const Slice& key) const { + Node* x = head; + while (true) { + if (x == nullptr) { + return x; + } + Node* next = x->Next(); + // Make sure the lists are sorted. + // If x points to head_ or next points nullptr, it is trivially satisfied. + assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); + if (KeyIsAfterNode(key, x)) { + // Keep searching in this list + x = next; + } else { + break; + } + } + return x; +} + +} // anon namespace + +MemTableRep* HashLinkListRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform) { + return new HashLinkListRep(compare, arena, transform, bucket_count_); +} + +MemTableRepFactory* NewHashLinkListRepFactory(size_t bucket_count) { + return new HashLinkListRepFactory(bucket_count); +} + +} // namespace rocksdb diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h new file mode 100644 index 00000000..11fb7467 --- /dev/null +++ b/util/hash_linklist_rep.h @@ -0,0 +1,34 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory(size_t bucket_count) + : bucket_count_(bucket_count) { } + + virtual ~HashLinkListRepFactory() {} + + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform) override; + + virtual const char* Name() const override { + return "HashLinkListRepFactory"; + } + + private: + const size_t bucket_count_; +}; + +} diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc new file mode 100644 index 00000000..230fae95 --- /dev/null +++ b/util/hash_skiplist_rep.cc @@ -0,0 +1,341 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// + +#include "util/hash_skiplist_rep.h" + +#include "rocksdb/memtablerep.h" +#include "util/arena.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "port/port.h" +#include "port/atomic_pointer.h" +#include "util/murmurhash.h" +#include "db/memtable.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { + +class HashSkipListRep : public MemTableRep { + public: + HashSkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, size_t bucket_size, + int32_t skiplist_height, int32_t skiplist_branching_factor); + + virtual void Insert(KeyHandle handle) override; + + virtual bool Contains(const char* key) const override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override; + + virtual ~HashSkipListRep(); + + virtual MemTableRep::Iterator* GetIterator() override; + + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; + + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) + override; + + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + + private: + friend class DynamicIterator; + typedef SkipList Bucket; + + size_t bucket_size_; + + const int32_t skiplist_height_; + const int32_t skiplist_branching_factor_; + + // Maps slices (which are transformed user keys) to buckets of keys sharing + // the same transform. + port::AtomicPointer* buckets_; + + // The user-supplied transform whose domain is the user keys. 
+ const SliceTransform* transform_; + + const MemTableRep::KeyComparator& compare_; + // immutable after construction + Arena* const arena_; + + inline size_t GetHash(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + } + inline Bucket* GetBucket(size_t i) const { + return static_cast(buckets_[i].Acquire_Load()); + } + inline Bucket* GetBucket(const Slice& slice) const { + return GetBucket(GetHash(slice)); + } + // Get a bucket from buckets_. If the bucket hasn't been initialized yet, + // initialize it before returning. + Bucket* GetInitializedBucket(const Slice& transformed); + + class Iterator : public MemTableRep::Iterator { + public: + explicit Iterator(Bucket* list, bool own_list = true, + Arena* arena = nullptr) + : list_(list), iter_(list), own_list_(own_list), arena_(arena) {} + + virtual ~Iterator() { + // if we own the list, we should also delete it + if (own_list_) { + assert(list_ != nullptr); + delete list_; + } + } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const { + return list_ != nullptr && iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const { + assert(Valid()); + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() { + assert(Valid()); + iter_.Next(); + } + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() { + assert(Valid()); + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& internal_key, const char* memtable_key) { + if (list_ != nullptr) { + const char* encoded_key = + (memtable_key != nullptr) ? + memtable_key : EncodeKey(&tmp_, internal_key); + iter_.Seek(encoded_key); + } + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. 
+ virtual void SeekToFirst() { + if (list_ != nullptr) { + iter_.SeekToFirst(); + } + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + if (list_ != nullptr) { + iter_.SeekToLast(); + } + } + protected: + void Reset(Bucket* list) { + if (own_list_) { + assert(list_ != nullptr); + delete list_; + } + list_ = list; + iter_.SetList(list); + own_list_ = false; + } + private: + // if list_ is nullptr, we should NEVER call any methods on iter_ + // if list_ is nullptr, this Iterator is not Valid() + Bucket* list_; + Bucket::Iterator iter_; + // here we track if we own list_. If we own it, we are also + // responsible for it's cleaning. This is a poor man's shared_ptr + bool own_list_; + std::unique_ptr arena_; + std::string tmp_; // For passing to EncodeKey + }; + + class DynamicIterator : public HashSkipListRep::Iterator { + public: + explicit DynamicIterator(const HashSkipListRep& memtable_rep) + : HashSkipListRep::Iterator(nullptr, false), + memtable_rep_(memtable_rep) {} + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& k, const char* memtable_key) { + auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k)); + Reset(memtable_rep_.GetBucket(transformed)); + HashSkipListRep::Iterator::Seek(k, memtable_key); + } + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() { + // Prefix iterator does not support total order. + // We simply set the iterator to invalid state + Reset(nullptr); + } + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() { + // Prefix iterator does not support total order. 
+ // We simply set the iterator to invalid state + Reset(nullptr); + } + private: + // the underlying memtable + const HashSkipListRep& memtable_rep_; + }; + + class EmptyIterator : public MemTableRep::Iterator { + // This is used when there wasn't a bucket. It is cheaper than + // instantiating an empty bucket over which to iterate. + public: + EmptyIterator() { } + virtual bool Valid() const { + return false; + } + virtual const char* key() const { + assert(false); + return nullptr; + } + virtual void Next() { } + virtual void Prev() { } + virtual void Seek(const Slice& internal_key, + const char* memtable_key) { } + virtual void SeekToFirst() { } + virtual void SeekToLast() { } + private: + }; +}; + +HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, + Arena* arena, const SliceTransform* transform, + size_t bucket_size, int32_t skiplist_height, + int32_t skiplist_branching_factor) + : MemTableRep(arena), + bucket_size_(bucket_size), + skiplist_height_(skiplist_height), + skiplist_branching_factor_(skiplist_branching_factor), + transform_(transform), + compare_(compare), + arena_(arena) { + buckets_ = new port::AtomicPointer[bucket_size]; + + for (size_t i = 0; i < bucket_size_; ++i) { + buckets_[i].NoBarrier_Store(nullptr); + } +} + +HashSkipListRep::~HashSkipListRep() { + delete[] buckets_; +} + +HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( + const Slice& transformed) { + size_t hash = GetHash(transformed); + auto bucket = GetBucket(hash); + if (bucket == nullptr) { + auto addr = arena_->AllocateAligned(sizeof(Bucket)); + bucket = new (addr) Bucket(compare_, arena_, skiplist_height_, + skiplist_branching_factor_); + buckets_[hash].Release_Store(static_cast(bucket)); + } + return bucket; +} + +void HashSkipListRep::Insert(KeyHandle handle) { + auto* key = static_cast(handle); + assert(!Contains(key)); + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetInitializedBucket(transformed); + 
bucket->Insert(key); +} + +bool HashSkipListRep::Contains(const char* key) const { + auto transformed = transform_->Transform(UserKey(key)); + auto bucket = GetBucket(transformed); + if (bucket == nullptr) { + return false; + } + return bucket->Contains(key); +} + +size_t HashSkipListRep::ApproximateMemoryUsage() { + return sizeof(buckets_); +} + +void HashSkipListRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + auto transformed = transform_->Transform(k.user_key()); + auto bucket = GetBucket(transformed); + if (bucket != nullptr) { + Bucket::Iterator iter(bucket); + for (iter.Seek(k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } +} + +MemTableRep::Iterator* HashSkipListRep::GetIterator() { + // allocate a new arena of similar size to the one currently in use + Arena* new_arena = new Arena(arena_->BlockSize()); + auto list = new Bucket(compare_, new_arena); + for (size_t i = 0; i < bucket_size_; ++i) { + auto bucket = GetBucket(i); + if (bucket != nullptr) { + Bucket::Iterator itr(bucket); + for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { + list->Insert(itr.key()); + } + } + } + return new Iterator(list, true, new_arena); +} + +MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) { + auto bucket = GetBucket(prefix); + if (bucket == nullptr) { + return new EmptyIterator(); + } + return new Iterator(bucket, false); +} + +MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) { + return GetPrefixIterator(transform_->Transform(slice)); +} + +MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() { + return new DynamicIterator(*this); +} + +} // anon namespace + +MemTableRep* HashSkipListRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform) { + return new HashSkipListRep(compare, arena, transform, bucket_count_, + 
skiplist_height_, skiplist_branching_factor_); +} + +MemTableRepFactory* NewHashSkipListRepFactory( + size_t bucket_count, int32_t skiplist_height, + int32_t skiplist_branching_factor) { + return new HashSkipListRepFactory(bucket_count, skiplist_height, + skiplist_branching_factor); +} + +} // namespace rocksdb diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h new file mode 100644 index 00000000..abf4a68c --- /dev/null +++ b/util/hash_skiplist_rep.h @@ -0,0 +1,41 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/slice_transform.h" +#include "rocksdb/memtablerep.h" + +namespace rocksdb { + +class HashSkipListRepFactory : public MemTableRepFactory { + public: + explicit HashSkipListRepFactory( + size_t bucket_count, + int32_t skiplist_height, + int32_t skiplist_branching_factor) + : bucket_count_(bucket_count), + skiplist_height_(skiplist_height), + skiplist_branching_factor_(skiplist_branching_factor) { } + + virtual ~HashSkipListRepFactory() {} + + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform) override; + + virtual const char* Name() const override { + return "HashSkipListRepFactory"; + } + + private: + const size_t bucket_count_; + const int32_t skiplist_height_; + const int32_t skiplist_branching_factor_; +}; + +} diff --git a/util/histogram.cc b/util/histogram.cc new file mode 100644 index 00000000..968769ce --- /dev/null +++ b/util/histogram.cc @@ -0,0 +1,198 @@ +// 
Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/histogram.h" + +#include +#include +#include +#include "port/port.h" + +namespace rocksdb { + +HistogramBucketMapper::HistogramBucketMapper() + : + // Add newer bucket index here. + // Should be alwyas added in sorted order. + // If you change this, you also need to change + // size of array buckets_ in HistogramImpl + bucketValues_( + {1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 12, 14, + 16, 18, 20, 25, 30, 35, + 40, 45, 50, 60, 70, 80, + 90, 100, 120, 140, 160, 180, + 200, 250, 300, 350, 400, 450, + 500, 600, 700, 800, 900, 1000, + 1200, 1400, 1600, 1800, 2000, 2500, + 3000, 3500, 4000, 4500, 5000, 6000, + 7000, 8000, 9000, 10000, 12000, 14000, + 16000, 18000, 20000, 25000, 30000, 35000, + 40000, 45000, 50000, 60000, 70000, 80000, + 90000, 100000, 120000, 140000, 160000, 180000, + 200000, 250000, 300000, 350000, 400000, 450000, + 500000, 600000, 700000, 800000, 900000, 1000000, + 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, + 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, + 7000000, 8000000, 9000000, 10000000, 12000000, 14000000, + 16000000, 18000000, 20000000, 25000000, 30000000, 35000000, + 40000000, 45000000, 50000000, 60000000, 70000000, 80000000, + 90000000, 100000000, 120000000, 140000000, 160000000, 180000000, + 200000000, 250000000, 300000000, 350000000, 400000000, 450000000, + 500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}), + maxBucketValue_(bucketValues_.back()), + minBucketValue_(bucketValues_.front()) { + 
for (size_t i =0; i < bucketValues_.size(); ++i) { + valueIndexMap_[bucketValues_[i]] = i; + } +} + +const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { + if (value >= maxBucketValue_) { + return bucketValues_.size() - 1; + } else if ( value >= minBucketValue_ ) { + std::map::const_iterator lowerBound = + valueIndexMap_.lower_bound(value); + if (lowerBound != valueIndexMap_.end()) { + return lowerBound->second; + } else { + return 0; + } + } else { + return 0; + } +} + +namespace { + const HistogramBucketMapper bucketMapper; +} + +void HistogramImpl::Clear() { + min_ = bucketMapper.LastValue(); + max_ = 0; + num_ = 0; + sum_ = 0; + sum_squares_ = 0; + memset(buckets_, 0, sizeof buckets_); +} + +bool HistogramImpl::Empty() { return sum_squares_ == 0; } + +void HistogramImpl::Add(uint64_t value) { + const size_t index = bucketMapper.IndexForValue(value); + buckets_[index] += 1; + if (min_ > value) min_ = value; + if (max_ < value) max_ = value; + num_++; + sum_ += value; + sum_squares_ += (value * value); +} + +void HistogramImpl::Merge(const HistogramImpl& other) { + if (other.min_ < min_) min_ = other.min_; + if (other.max_ > max_) max_ = other.max_; + num_ += other.num_; + sum_ += other.sum_; + sum_squares_ += other.sum_squares_; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + buckets_[b] += other.buckets_[b]; + } +} + +double HistogramImpl::Median() const { + return Percentile(50.0); +} + +double HistogramImpl::Percentile(double p) const { + double threshold = num_ * (p / 100.0); + double sum = 0; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + sum += buckets_[b]; + if (sum >= threshold) { + // Scale linearly within this bucket + double left_point = (b == 0) ? 
0 : bucketMapper.BucketLimit(b-1); + double right_point = bucketMapper.BucketLimit(b); + double left_sum = sum - buckets_[b]; + double right_sum = sum; + double pos = 0; + double right_left_diff = right_sum - left_sum; + if (right_left_diff != 0) { + pos = (threshold - left_sum) / (right_sum - left_sum); + } + double r = left_point + (right_point - left_point) * pos; + if (r < min_) r = min_; + if (r > max_) r = max_; + return r; + } + } + return max_; +} + +double HistogramImpl::Average() const { + if (num_ == 0.0) return 0; + return sum_ / num_; +} + +double HistogramImpl::StandardDeviation() const { + if (num_ == 0.0) return 0; + double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); + return sqrt(variance); +} + +std::string HistogramImpl::ToString() const { + std::string r; + char buf[200]; + snprintf(buf, sizeof(buf), + "Count: %.0f Average: %.4f StdDev: %.2f\n", + num_, Average(), StandardDeviation()); + r.append(buf); + snprintf(buf, sizeof(buf), + "Min: %.4f Median: %.4f Max: %.4f\n", + (num_ == 0.0 ? 0.0 : min_), Median(), max_); + r.append(buf); + snprintf(buf, sizeof(buf), + "Percentiles: " + "P50: %.2f P75: %.2f P99: %.2f P99.9: %.2f P99.99: %.2f\n", + Percentile(50), Percentile(75), Percentile(99), Percentile(99.9), + Percentile(99.99)); + r.append(buf); + r.append("------------------------------------------------------\n"); + const double mult = 100.0 / num_; + double sum = 0; + for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { + if (buckets_[b] <= 0.0) continue; + sum += buckets_[b]; + snprintf(buf, sizeof(buf), + "[ %7lu, %7lu ) %8lu %7.3f%% %7.3f%% ", + // left + (unsigned long)((b == 0) ? 0 : bucketMapper.BucketLimit(b-1)), + (unsigned long)bucketMapper.BucketLimit(b), // right + (unsigned long)buckets_[b], // count + (mult * buckets_[b]), // percentage + (mult * sum)); // cumulative percentage + r.append(buf); + + // Add hash marks based on percentage; 20 marks for 100%. 
+ int marks = static_cast(20*(buckets_[b] / num_) + 0.5); + r.append(marks, '#'); + r.push_back('\n'); + } + return r; +} + +void HistogramImpl::Data(HistogramData * const data) const { + assert(data); + data->median = Median(); + data->percentile95 = Percentile(95); + data->percentile99 = Percentile(99); + data->average = Average(); + data->standard_deviation = StandardDeviation(); +} + +} // namespace rocksdb diff --git a/util/histogram.h b/util/histogram.h new file mode 100644 index 00000000..d95588dc --- /dev/null +++ b/util/histogram.h @@ -0,0 +1,79 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/statistics.h" + +#include +#include +#include +#include + +namespace rocksdb { + +class HistogramBucketMapper { + public: + + HistogramBucketMapper(); + + // converts a value to the bucket index. + const size_t IndexForValue(const uint64_t value) const; + // number of buckets required. 
+ + const size_t BucketCount() const { + return bucketValues_.size(); + } + + uint64_t LastValue() const { + return maxBucketValue_; + } + + uint64_t FirstValue() const { + return minBucketValue_; + } + + uint64_t BucketLimit(const uint64_t bucketNumber) const { + assert(bucketNumber < BucketCount()); + return bucketValues_[bucketNumber]; + } + + private: + const std::vector bucketValues_; + const uint64_t maxBucketValue_; + const uint64_t minBucketValue_; + std::map valueIndexMap_; +}; + +class HistogramImpl { + public: + virtual void Clear(); + virtual bool Empty(); + virtual void Add(uint64_t value); + void Merge(const HistogramImpl& other); + + virtual std::string ToString() const; + + virtual double Median() const; + virtual double Percentile(double p) const; + virtual double Average() const; + virtual double StandardDeviation() const; + virtual void Data(HistogramData * const data) const; + + private: + // To be able to use HistogramImpl as thread local variable, its constructor + // has to be static. That's why we're using manually values from BucketMapper + double min_ = 1000000000; // this is BucketMapper:LastValue() + double max_ = 0; + double num_ = 0; + double sum_ = 0; + double sum_squares_ = 0; + uint64_t buckets_[138] = {0}; // this is BucketMapper::BucketCount() +}; + +} // namespace rocksdb diff --git a/util/histogram_test.cc b/util/histogram_test.cc new file mode 100644 index 00000000..065f9579 --- /dev/null +++ b/util/histogram_test.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "util/histogram.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class HistogramTest { }; + +TEST(HistogramTest, BasicOperation) { + + HistogramImpl histogram; + for (uint64_t i = 1; i <= 100; i++) { + histogram.Add(i); + } + + { + double median = histogram.Median(); + // ASSERT_LE(median, 50); + ASSERT_GT(median, 0); + } + + { + double percentile100 = histogram.Percentile(100.0); + ASSERT_LE(percentile100, 100.0); + ASSERT_GT(percentile100, 0.0); + double percentile99 = histogram.Percentile(99.0); + double percentile85 = histogram.Percentile(85.0); + ASSERT_LE(percentile99, 99.0); + ASSERT_TRUE(percentile99 >= percentile85); + } + + ASSERT_EQ(histogram.Average(), 50.5); // avg is accurately calculated. +} + +TEST(HistogramTest, EmptyHistogram) { + HistogramImpl histogram; + ASSERT_EQ(histogram.Median(), 0.0); + ASSERT_EQ(histogram.Percentile(85.0), 0.0); + ASSERT_EQ(histogram.Average(), 0.0); +} + +TEST(HistogramTest, ClearHistogram) { + HistogramImpl histogram; + for (uint64_t i = 1; i <= 100; i++) { + histogram.Add(i); + } + histogram.Clear(); + ASSERT_EQ(histogram.Median(), 0); + ASSERT_EQ(histogram.Percentile(85.0), 0); + ASSERT_EQ(histogram.Average(), 0); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc new file mode 100644 index 00000000..30288e72 --- /dev/null +++ b/util/ldb_cmd.cc @@ -0,0 +1,1782 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "util/ldb_cmd.h" + +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/log_reader.h" +#include "db/filename.h" +#include "db/write_batch_internal.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" + +#include +#include +#include +#include +#include + +namespace rocksdb { + +using namespace std; + +const string LDBCommand::ARG_DB = "db"; +const string LDBCommand::ARG_HEX = "hex"; +const string LDBCommand::ARG_KEY_HEX = "key_hex"; +const string LDBCommand::ARG_VALUE_HEX = "value_hex"; +const string LDBCommand::ARG_TTL = "ttl"; +const string LDBCommand::ARG_TTL_START = "start_time"; +const string LDBCommand::ARG_TTL_END = "end_time"; +const string LDBCommand::ARG_TIMESTAMP = "timestamp"; +const string LDBCommand::ARG_FROM = "from"; +const string LDBCommand::ARG_TO = "to"; +const string LDBCommand::ARG_MAX_KEYS = "max_keys"; +const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; +const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; +const string LDBCommand::ARG_BLOCK_SIZE = "block_size"; +const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; +const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; +const string LDBCommand::ARG_FILE_SIZE = "file_size"; +const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; + +const char* LDBCommand::DELIM = " ==> "; + +LDBCommand* LDBCommand::InitFromCmdLineArgs( + int argc, + char** argv, + const Options& options +) { + vector args; + for (int i = 1; i < argc; i++) { + args.push_back(argv[i]); + } + return InitFromCmdLineArgs(args, options); +} + +/** + * Parse the command-line arguments and create the appropriate LDBCommand2 + * instance. + * The command line arguments must be in the following format: + * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] .. + * COMMAND ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] .. + * This is similar to the command line format used by HBaseClientTool. + * Command name is not included in args. 
+ * Returns nullptr if the command-line cannot be parsed. + */ +LDBCommand* LDBCommand::InitFromCmdLineArgs( + const vector& args, + const Options& options +) { + // --x=y command line arguments are added as x->y map entries. + map option_map; + + // Command-line arguments of the form --hex end up in this array as hex + vector flags; + + // Everything other than option_map and flags. Represents commands + // and their parameters. For eg: put key1 value1 go into this vector. + vector cmdTokens; + + const string OPTION_PREFIX = "--"; + + for (const auto& arg : args) { + if (arg[0] == '-' && arg[1] == '-'){ + vector splits = stringSplit(arg, '='); + if (splits.size() == 2) { + string optionKey = splits[0].substr(OPTION_PREFIX.size()); + option_map[optionKey] = splits[1]; + } else { + string optionKey = splits[0].substr(OPTION_PREFIX.size()); + flags.push_back(optionKey); + } + } else { + cmdTokens.push_back(arg); + } + } + + if (cmdTokens.size() < 1) { + fprintf(stderr, "Command not specified!"); + return nullptr; + } + + string cmd = cmdTokens[0]; + vector cmdParams(cmdTokens.begin()+1, cmdTokens.end()); + LDBCommand* command = LDBCommand::SelectCommand( + cmd, + cmdParams, + option_map, + flags + ); + + if (command) { + command->SetOptions(options); + } + return command; +} + +LDBCommand* LDBCommand::SelectCommand( + const std::string& cmd, + const vector& cmdParams, + const map& option_map, + const vector& flags + ) { + + if (cmd == GetCommand::Name()) { + return new GetCommand(cmdParams, option_map, flags); + } else if (cmd == PutCommand::Name()) { + return new PutCommand(cmdParams, option_map, flags); + } else if (cmd == BatchPutCommand::Name()) { + return new BatchPutCommand(cmdParams, option_map, flags); + } else if (cmd == ScanCommand::Name()) { + return new ScanCommand(cmdParams, option_map, flags); + } else if (cmd == DeleteCommand::Name()) { + return new DeleteCommand(cmdParams, option_map, flags); + } else if (cmd == ApproxSizeCommand::Name()) { + return 
new ApproxSizeCommand(cmdParams, option_map, flags); + } else if (cmd == DBQuerierCommand::Name()) { + return new DBQuerierCommand(cmdParams, option_map, flags); + } else if (cmd == CompactorCommand::Name()) { + return new CompactorCommand(cmdParams, option_map, flags); + } else if (cmd == WALDumperCommand::Name()) { + return new WALDumperCommand(cmdParams, option_map, flags); + } else if (cmd == ReduceDBLevelsCommand::Name()) { + return new ReduceDBLevelsCommand(cmdParams, option_map, flags); + } else if (cmd == ChangeCompactionStyleCommand::Name()) { + return new ChangeCompactionStyleCommand(cmdParams, option_map, flags); + } else if (cmd == DBDumperCommand::Name()) { + return new DBDumperCommand(cmdParams, option_map, flags); + } else if (cmd == DBLoaderCommand::Name()) { + return new DBLoaderCommand(cmdParams, option_map, flags); + } else if (cmd == ManifestDumpCommand::Name()) { + return new ManifestDumpCommand(cmdParams, option_map, flags); + } else if (cmd == InternalDumpCommand::Name()) { + return new InternalDumpCommand(cmdParams, option_map, flags); + } else if (cmd == CheckConsistencyCommand::Name()) { + return new CheckConsistencyCommand(cmdParams, option_map, flags); + } + return nullptr; +} + + +/** + * Parses the specific integer option and fills in the value. + * Returns true if the option is found. + * Returns false if the option is not found or if there is an error parsing the + * value. If there is an error, the specified exec_state is also + * updated. 
+ */ +bool LDBCommand::ParseIntOption(const map& options, + const string& option, int& value, + LDBCommandExecuteResult& exec_state) { + + map::const_iterator itr = option_map_.find(option); + if (itr != option_map_.end()) { + try { + value = stoi(itr->second); + return true; + } catch(const invalid_argument&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has an invalid value."); + } catch(const out_of_range&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has a value out-of-range."); + } + } + return false; +} + +/** + * Parses the specified option and fills in the value. + * Returns true if the option is found. + * Returns false otherwise. + */ +bool LDBCommand::ParseStringOption(const map& options, + const string& option, string* value) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { + *value = itr->second; + return true; + } + return false; +} + +Options LDBCommand::PrepareOptionsForOpenDB() { + + Options opt = options_; + opt.create_if_missing = false; + + map::const_iterator itr; + + int bits; + if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { + if (bits > 0) { + opt.filter_policy = NewBloomFilterPolicy(bits); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS + + " must be > 0."); + } + } + + int block_size; + if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { + if (block_size > 0) { + opt.block_size = block_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE + + " must be > 0."); + } + } + + itr = option_map_.find(ARG_AUTO_COMPACTION); + if (itr != option_map_.end()) { + opt.disable_auto_compactions = ! 
StringToBool(itr->second); + } + + itr = option_map_.find(ARG_COMPRESSION_TYPE); + if (itr != option_map_.end()) { + string comp = itr->second; + if (comp == "no") { + opt.compression = kNoCompression; + } else if (comp == "snappy") { + opt.compression = kSnappyCompression; + } else if (comp == "zlib") { + opt.compression = kZlibCompression; + } else if (comp == "bzip2") { + opt.compression = kBZip2Compression; + } else if (comp == "lz4") { + opt.compression = kLZ4Compression; + } else if (comp == "lz4hc") { + opt.compression = kLZ4HCCompression; + } else { + // Unknown compression. + exec_state_ = LDBCommandExecuteResult::FAILED( + "Unknown compression level: " + comp); + } + } + + int write_buffer_size; + if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, + exec_state_)) { + if (write_buffer_size > 0) { + opt.write_buffer_size = write_buffer_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_WRITE_BUFFER_SIZE + + " must be > 0."); + } + } + + int file_size; + if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) { + if (file_size > 0) { + opt.target_file_size_base = file_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FILE_SIZE + + " must be > 0."); + } + } + + return opt; +} + +bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex) { + size_t pos = line.find(DELIM); + if (pos != string::npos) { + *key = line.substr(0, pos); + *value = line.substr(pos + strlen(DELIM)); + if (is_key_hex) { + *key = HexToString(*key); + } + if (is_value_hex) { + *value = HexToString(*value); + } + return true; + } else { + return false; + } +} + +/** + * Make sure that ONLY the command-line options and flags expected by this + * command are specified on the command-line. Extraneous options are usually + * the result of user error. + * Returns true if all checks pass. Else returns false, and prints an + * appropriate error msg to stderr. 
+ */ +bool LDBCommand::ValidateCmdLineOptions() { + + for (map::const_iterator itr = option_map_.begin(); + itr != option_map_.end(); itr++) { + if (find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), itr->first) == + valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str()); + return false; + } + } + + for (vector::const_iterator itr = flags_.begin(); + itr != flags_.end(); itr++) { + if (find(valid_cmd_line_options_.begin(), + valid_cmd_line_options_.end(), *itr) == + valid_cmd_line_options_.end()) { + fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str()); + return false; + } + } + + if (!NoDBOpen() && option_map_.find(ARG_DB) == option_map_.end()) { + fprintf(stderr, "%s must be specified\n", ARG_DB.c_str()); + return false; + } + + return true; +} + +CompactorCommand::CompactorCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_TTL})), + null_from_(true), null_to_(true) { + + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + null_from_ = false; + from_ = itr->second; + } + + itr = options.find(ARG_TO); + if (itr != options.end()) { + null_to_ = false; + to_ = itr->second; + } + + if (is_key_hex_) { + if (!null_from_) { + from_ = HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } +} + +void CompactorCommand::Help(string& ret) { + ret.append(" "); + ret.append(CompactorCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void CompactorCommand::DoCommand() { + + Slice* begin = nullptr; + Slice* end = nullptr; + if (!null_from_) { + begin = new Slice(from_); + } + if (!null_to_) { + end = new Slice(to_); + } + + db_->CompactRange(begin, end); + exec_state_ = LDBCommandExecuteResult::SUCCEED(""); + + delete begin; + delete end; +} + +const string 
DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; +const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; +const string DBLoaderCommand::ARG_COMPACT = "compact"; + +DBLoaderCommand::DBLoaderCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_CREATE_IF_MISSING, + ARG_DISABLE_WAL, ARG_BULK_LOAD, + ARG_COMPACT})), + create_if_missing_(false), disable_wal_(false), bulk_load_(false), + compact_(false) { + + create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING); + disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL); + bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD); + compact_ = IsFlagPresent(flags, ARG_COMPACT); +} + +void DBLoaderCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBLoaderCommand::Name()); + ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret.append(" [--" + ARG_DISABLE_WAL + "]"); + ret.append(" [--" + ARG_BULK_LOAD + "]"); + ret.append(" [--" + ARG_COMPACT + "]"); + ret.append("\n"); +} + +Options DBLoaderCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = create_if_missing_; + if (bulk_load_) { + opt.PrepareForBulkLoad(); + } + return opt; +} + +void DBLoaderCommand::DoCommand() { + if (!db_) { + return; + } + + WriteOptions write_options; + if (disable_wal_) { + write_options.disableWAL = true; + } + + int bad_lines = 0; + string line; + while (getline(cin, line, '\n')) { + string key; + string value; + if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) { + db_->Put(write_options, Slice(key), Slice(value)); + } else if (0 == line.find("Keys in range:")) { + // ignore this line + } else if (0 == line.find("Created bg thread 0x")) { + // ignore this line + } else { + bad_lines ++; + } + } + + if (bad_lines > 0) { + cout << "Warning: " << bad_lines << " bad lines ignored." 
<< endl; + } + if (compact_) { + db_->CompactRange(nullptr, nullptr); + } +} + +// ---------------------------------------------------------------------------- + +const string ManifestDumpCommand::ARG_VERBOSE = "verbose"; +const string ManifestDumpCommand::ARG_PATH = "path"; + +void ManifestDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(ManifestDumpCommand::Name()); + ret.append(" [--" + ARG_VERBOSE + "]"); + ret.append(" [--" + ARG_PATH + "=]"); + ret.append("\n"); +} + +ManifestDumpCommand::ManifestDumpCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})), + verbose_(false), + path_("") +{ + verbose_ = IsFlagPresent(flags, ARG_VERBOSE); + + map::const_iterator itr = options.find(ARG_PATH); + if (itr != options.end()) { + path_ = itr->second; + if (path_.empty()) { + exec_state_ = LDBCommandExecuteResult::FAILED("--path: missing pathname"); + } + } +} + +void ManifestDumpCommand::DoCommand() { + + std::string manifestfile; + + if (!path_.empty()) { + manifestfile = path_; + } else { + bool found = false; + // We need to find the manifest file by searching the directory + // containing the db for files of the form MANIFEST_[0-9]+ + DIR* d = opendir(db_path_.c_str()); + if (d == nullptr) { + exec_state_ = LDBCommandExecuteResult::FAILED( + db_path_ + " is not a directory"); + return; + } + struct dirent* entry; + while ((entry = readdir(d)) != nullptr) { + unsigned int match; + unsigned long long num; + if (sscanf(entry->d_name, + "MANIFEST-%ln%ln", + (unsigned long*)&num, + (unsigned long*)&match) + && match == strlen(entry->d_name)) { + if (!found) { + manifestfile = db_path_ + "/" + std::string(entry->d_name); + found = true; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Multiple MANIFEST files found; use --path to select one"); + return; + } + } + } + closedir(d); + } + + if (verbose_) { + printf("Processing 
Manifest file %s\n", manifestfile.c_str()); + } + + Options options; + EnvOptions sopt; + std::string file(manifestfile); + std::string dbname("dummy"); + TableCache* tc = new TableCache(dbname, &options, sopt, 10); + const InternalKeyComparator* cmp = + new InternalKeyComparator(options.comparator); + + VersionSet* versions = new VersionSet(dbname, &options, sopt, tc, cmp); + Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_); + if (!s.ok()) { + printf("Error in processing file %s %s\n", manifestfile.c_str(), + s.ToString().c_str()); + } + if (verbose_) { + printf("Processing Manifest file %s done\n", manifestfile.c_str()); + } +} + +// ---------------------------------------------------------------------------- + +string ReadableTime(int unixtime) { + char time_buffer [80]; + time_t rawtime = unixtime; + struct tm * timeinfo = localtime(&rawtime); + strftime(time_buffer, 80, "%c", timeinfo); + return string(time_buffer); +} + +// This function only called when it's the sane case of >1 buckets in time-range +// Also called only when timekv falls between ttl_start and ttl_end provided +void IncBucketCounts(vector& bucket_counts, int ttl_start, + int time_range, int bucket_size, int timekv, int num_buckets) { + assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 && + timekv < (ttl_start + time_range) && num_buckets > 1); + int bucket = (timekv - ttl_start) / bucket_size; + bucket_counts[bucket]++; +} + +void PrintBucketCounts(const vector& bucket_counts, int ttl_start, + int ttl_end, int bucket_size, int num_buckets) { + int time_point = ttl_start; + for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { + fprintf(stdout, "Keys in range %s to %s : %lu\n", + ReadableTime(time_point).c_str(), + ReadableTime(time_point + bucket_size).c_str(), + (unsigned long)bucket_counts[i]); + } + fprintf(stdout, "Keys in range %s to %s : %lu\n", + ReadableTime(time_point).c_str(), + ReadableTime(ttl_end).c_str(), + (unsigned 
long)bucket_counts[num_buckets - 1]); +} + +const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; +const string InternalDumpCommand::ARG_STATS = "stats"; +const string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; + +InternalDumpCommand::InternalDumpCommand(const vector& params, + const map& options, + const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_MAX_KEYS, + ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, + ARG_INPUT_KEY_HEX})), + has_from_(false), + has_to_(false), + max_keys_(-1), + delim_("."), + count_only_(false), + count_delim_(false), + print_stats_(false), + is_input_key_hex_(false) { + + has_from_ = ParseStringOption(options, ARG_FROM, &from_); + has_to_ = ParseStringOption(options, ARG_TO, &to_); + + ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); + map::const_iterator itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + // fprintf(stdout,"delim = %c\n",delim_[0]); + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_="."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + is_input_key_hex_ = IsFlagPresent(flags, ARG_INPUT_KEY_HEX); + + if (is_input_key_hex_) { + if (has_from_) { + from_ = HexToString(from_); + } + if (has_to_) { + to_ = HexToString(to_); + } + } +} + +void InternalDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(InternalDumpCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_INPUT_KEY_HEX + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append("\n"); +} + +void InternalDumpCommand::DoCommand() { + if (!db_) { + return; 
+ } + + if (print_stats_) { + string stats; + if (db_->GetProperty("rocksdb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Cast as DBImpl to get internal iterator + DBImpl* idb = dynamic_cast(db_); + if (!idb) { + exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl"); + return; + } + string rtype1,rtype2,row,val; + rtype2 = ""; + uint64_t c=0; + uint64_t s1=0,s2=0; + // Setup internal key iterator + auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" + + st.ToString()); + } + + if (has_from_) { + InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + + long long count = 0; + for (; iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + fprintf(stderr, "Internal Key [%s] parse error!\n", + iter->key().ToString(true /* in hex*/).data()); + // TODO: add error counter + continue; + } + + // If end marker was specified, we stop before it + if (has_to_ && options_.comparator->Compare(ikey.user_key, to_) >= 0) { + break; + } + + ++count; + int k; + if (count_delim_) { + rtype1 = ""; + s1=0; + row = iter->key().ToString(); + val = iter->value().ToString(); + for(k=0;row[k]!='\x01' && row[k]!='\0';k++) + s1++; + for(k=0;val[k]!='\x01' && val[k]!='\0';k++) + s1++; + for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long)c,(long long)s2); + c=1; + s2=s1; + rtype2 = rtype1; + } else { + c++; + s2+=s1; + rtype2=rtype1; + } + } + + if (!count_only_ && !count_delim_) { + string key = ikey.DebugString(is_key_hex_); + string value = iter->value().ToString(is_value_hex_); + std::cout << key << " => " << value << "\n"; + } + + // 
Terminate if maximum number of keys have been dumped + if (max_keys_ > 0 && count >= max_keys_) break; + } + if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n", rtype2.c_str(), + (long long)c,(long long)s2); + } else + fprintf(stdout, "Internal keys in range: %lld\n", (long long) count); +} + + +const string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; +const string DBDumperCommand::ARG_COUNT_DELIM = "count_delim"; +const string DBDumperCommand::ARG_STATS = "stats"; +const string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; + +DBDumperCommand::DBDumperCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_FROM, ARG_TO, + ARG_MAX_KEYS, ARG_COUNT_ONLY, + ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START, + ARG_TTL_END, ARG_TTL_BUCKET, + ARG_TIMESTAMP})), + null_from_(true), + null_to_(true), + max_keys_(-1), + count_only_(false), + count_delim_(false), + print_stats_(false) { + + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + null_from_ = false; + from_ = itr->second; + } + + itr = options.find(ARG_TO); + if (itr != options.end()) { + null_to_ = false; + to_ = itr->second; + } + + itr = options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { + max_keys_ = stoi(itr->second); + } catch(const invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has an invalid value"); + } catch(const out_of_range&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has a value out-of-range"); + } + } + itr = options.find(ARG_COUNT_DELIM); + if (itr != options.end()) { + delim_ = itr->second; + count_delim_ = true; + } else { + count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); + delim_="."; + } + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + + if (is_key_hex_) { + if (!null_from_) { + from_ 
= HexToString(from_); + } + if (!null_to_) { + to_ = HexToString(to_); + } + } +} + +void DBDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBDumperCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append(" [--" + ARG_TTL_BUCKET + "=]"); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); +} + +void DBDumperCommand::DoCommand() { + if (!db_) { + return; + } + // Parse command line args + uint64_t count = 0; + if (print_stats_) { + string stats; + if (db_->GetProperty("rocksdb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Setup key iterator + Iterator* iter = db_->NewIterator(ReadOptions()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error." 
+ + st.ToString()); + } + + if (!null_from_) { + iter->Seek(from_); + } else { + iter->SeekToFirst(); + } + + int max_keys = max_keys_; + int ttl_start; + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + ttl_start = DBWithTTL::kMinTimestamp; // TTL introduction time + } + int ttl_end; + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + ttl_end = DBWithTTL::kMaxTimestamp; // Max time allowed by TTL feature + } + if (ttl_end < ttl_start) { + fprintf(stderr, "Error: End time can't be less than start time\n"); + delete iter; + return; + } + int time_range = ttl_end - ttl_start; + int bucket_size; + if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) || + bucket_size <= 0) { + bucket_size = time_range; // Will have just 1 bucket by default + } + //cretaing variables for row count of each type + string rtype1,rtype2,row,val; + rtype2 = ""; + uint64_t c=0; + uint64_t s1=0,s2=0; + + // At this point, bucket_size=0 => time_range=0 + uint64_t num_buckets = (bucket_size >= time_range) ? 
1 : + ((time_range + bucket_size - 1) / bucket_size); + vector bucket_counts(num_buckets, 0); + if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { + fprintf(stdout, "Dumping key-values from %s to %s\n", + ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + } + + for (; iter->Valid(); iter->Next()) { + int rawtime = 0; + // If end marker was specified, we stop before it + if (!null_to_ && (iter->key().ToString() >= to_)) + break; + // Terminate if maximum number of keys have been dumped + if (max_keys == 0) + break; + if (is_db_ttl_) { + TtlIterator* it_ttl = dynamic_cast(iter); + assert(it_ttl); + rawtime = it_ttl->timestamp(); + if (rawtime < ttl_start || rawtime >= ttl_end) { + continue; + } + } + if (max_keys > 0) { + --max_keys; + } + if (is_db_ttl_ && num_buckets > 1) { + IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size, + rawtime, num_buckets); + } + ++count; + if (count_delim_) { + rtype1 = ""; + row = iter->key().ToString(); + val = iter->value().ToString(); + s1 = row.size()+val.size(); + for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); + c=1; + s2=s1; + rtype2 = rtype1; + } else { + c++; + s2+=s1; + rtype2=rtype1; + } + + } + + + + if (!count_only_ && !count_delim_) { + if (is_db_ttl_ && timestamp_) { + fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + } + string str = PrintKeyValue(iter->key().ToString(), + iter->value().ToString(), is_key_hex_, + is_value_hex_); + fprintf(stdout, "%s\n", str.c_str()); + } + } + + if (num_buckets > 1 && is_db_ttl_) { + PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size, + num_buckets); + } else if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); + } else { + fprintf(stdout, "Keys in range: %lld\n", (long long) count); + } 
+ // Clean up + delete iter; +} + +const string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; +const string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; + +ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})), + old_levels_(1 << 16), + new_levels_(-1), + print_old_levels_(false) { + + + ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_); + print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS); + + if(new_levels_ <= 0) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n"); + } +} + +vector ReduceDBLevelsCommand::PrepareArgs(const string& db_path, + int new_levels, bool print_old_level) { + vector ret; + ret.push_back("reduce_levels"); + ret.push_back("--" + ARG_DB + "=" + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels)); + if(print_old_level) { + ret.push_back("--" + ARG_PRINT_OLD_LEVELS); + } + return ret; +} + +void ReduceDBLevelsCommand::Help(string& ret) { + ret.append(" "); + ret.append(ReduceDBLevelsCommand::Name()); + ret.append(" --" + ARG_NEW_LEVELS + "="); + ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); + ret.append("\n"); +} + +Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.num_levels = old_levels_; + opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1); + // Disable size compaction + opt.max_bytes_for_level_base = 1ULL << 50; + opt.max_bytes_for_level_multiplier = 1; + opt.max_mem_compaction_level = 0; + return opt; +} + +Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, + int* levels) { + EnvOptions soptions; + TableCache tc(db_path_, &opt, soptions, 10); + const InternalKeyComparator cmp(opt.comparator); + VersionSet versions(db_path_, &opt, 
soptions, &tc, &cmp); + // We rely the VersionSet::Recover to tell us the internal data structures + // in the db. And the Recover() should never do any change + // (like LogAndApply) to the manifest file. + Status st = versions.Recover(); + if (!st.ok()) { + return st; + } + int max = -1; + for (int i = 0; i < versions.NumberLevels(); i++) { + if (versions.current()->NumLevelFiles(i)) { + max = i; + } + } + + *levels = max + 1; + return st; +} + +void ReduceDBLevelsCommand::DoCommand() { + if (new_levels_ <= 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Invalid number of levels.\n"); + return; + } + + Status st; + Options opt = PrepareOptionsForOpenDB(); + int old_level_num = -1; + st = GetOldNumOfLevels(opt, &old_level_num); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + return; + } + + if (print_old_levels_) { + fprintf(stdout, "The old number of levels in use is %d\n", old_level_num); + } + + if (old_level_num <= new_levels_) { + return; + } + + old_levels_ = old_level_num; + + OpenDB(); + if (!db_) { + return; + } + // Compact the whole DB to put all files to the highest level. 
+ fprintf(stdout, "Compacting the db...\n"); + db_->CompactRange(nullptr, nullptr); + CloseDB(); + + EnvOptions soptions; + + st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + return; + } +} + +const string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = + "old_compaction_style"; +const string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = + "new_compaction_style"; + +ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( + const vector& params, const map& options, + const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_OLD_COMPACTION_STYLE, + ARG_NEW_COMPACTION_STYLE})), + old_compaction_style_(-1), + new_compaction_style_(-1) { + + ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_, + exec_state_); + if (old_compaction_style_ != kCompactionStyleLevel && + old_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " + + "style. Check ldb help for proper compaction style value.\n"); + return; + } + + ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_, + exec_state_); + if (new_compaction_style_ != kCompactionStyleLevel && + new_compaction_style_ != kCompactionStyleUniversal) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " + + "style. Check ldb help for proper compaction style value.\n"); + return; + } + + if (new_compaction_style_ == old_compaction_style_) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Old compaction style is the same as new compaction style. 
" + "Nothing to do.\n"); + return; + } + + if (old_compaction_style_ == kCompactionStyleUniversal && + new_compaction_style_ == kCompactionStyleLevel) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Convert from universal compaction to level compaction. " + "Nothing to do.\n"); + return; + } +} + +void ChangeCompactionStyleCommand::Help(string& ret) { + ret.append(" "); + ret.append(ChangeCompactionStyleCommand::Name()); + ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "="); + ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "="); + ret.append("\n"); +} + +Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + + if (old_compaction_style_ == kCompactionStyleLevel && + new_compaction_style_ == kCompactionStyleUniversal) { + // In order to convert from level compaction to universal compaction, we + // need to compact all data into a single file and move it to level 0. + opt.disable_auto_compactions = true; + opt.target_file_size_base = INT_MAX; + opt.target_file_size_multiplier = 1; + opt.max_bytes_for_level_base = INT_MAX; + opt.max_bytes_for_level_multiplier = 1; + } + + return opt; +} + +void ChangeCompactionStyleCommand::DoCommand() { + // print db stats before we have made any change + std::string property; + std::string files_per_level; + for (int i = 0; i < db_->NumberLevels(); i++) { + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? 
"," : ""), property.c_str()); + files_per_level += buf; + } + fprintf(stdout, "files per level before compaction: %s\n", + files_per_level.c_str()); + + // manual compact into a single file and move the file to level 0 + db_->CompactRange(nullptr, nullptr, + true /* reduce level */, + 0 /* reduce to level 0 */); + + // verify compaction result + files_per_level = ""; + int num_files = 0; + for (int i = 0; i < db_->NumberLevels(); i++) { + db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + &property); + + // format print string + char buf[100]; + snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); + files_per_level += buf; + + num_files = atoi(property.c_str()); + + // level 0 should have only 1 file + if (i == 0 && num_files != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " + "level 0 after compaction is " + std::to_string(num_files) + + ", not 1.\n"); + return; + } + // other levels should have no file + if (i > 0 && num_files != 0) { + exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " + "level " + std::to_string(i) + " after compaction is " + + std::to_string(num_files) + ", not 0.\n"); + return; + } + } + + fprintf(stdout, "files per level after compaction: %s\n", + files_per_level.c_str()); +} + +class InMemoryHandler : public WriteBatch::Handler { + public: + InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) { + print_values_ = print_values; + } + + void commonPutMerge(const Slice& key, const Slice& value) { + string k = LDBCommand::StringToHex(key.ToString()); + if (print_values_) { + string v = LDBCommand::StringToHex(value.ToString()); + row_ << k << " : "; + row_ << v << " "; + } else { + row_ << k << " "; + } + } + + virtual void Put(const Slice& key, const Slice& value) { + row_ << "PUT : "; + commonPutMerge(key, value); + } + + virtual void Merge(const Slice& key, const Slice& value) { + row_ << "MERGE : "; + commonPutMerge(key, value); + } 
+ + virtual void Delete(const Slice& key) { + row_ <<",DELETE : "; + row_ << LDBCommand::StringToHex(key.ToString()) << " "; + } + + virtual ~InMemoryHandler() { }; + + private: + stringstream & row_; + bool print_values_; +}; + +const string WALDumperCommand::ARG_WAL_FILE = "walfile"; +const string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; +const string WALDumperCommand::ARG_PRINT_HEADER = "header"; + +WALDumperCommand::WALDumperCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions( + {ARG_WAL_FILE, ARG_PRINT_HEADER, ARG_PRINT_VALUE})), + print_header_(false), print_values_(false) { + + wal_file_.clear(); + + map::const_iterator itr = options.find(ARG_WAL_FILE); + if (itr != options.end()) { + wal_file_ = itr->second; + } + + + print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER); + print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE); + if (wal_file_.empty()) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Argument " + ARG_WAL_FILE + " must be specified."); + } +} + +void WALDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(WALDumperCommand::Name()); + ret.append(" --" + ARG_WAL_FILE + "="); + ret.append(" [--" + ARG_PRINT_HEADER + "] "); + ret.append(" [--" + ARG_PRINT_VALUE + "] "); + ret.append("\n"); +} + +void WALDumperCommand::DoCommand() { + struct StdErrReporter : public log::Reader::Reporter { + virtual void Corruption(size_t bytes, const Status& s) { + cerr<<"Corruption detected in log file "< file; + Env* env_ = Env::Default(); + EnvOptions soptions; + Status status = env_->NewSequentialFile(wal_file_, &file, soptions); + if (!status.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + + status.ToString()); + } else { + StdErrReporter reporter; + log::Reader reader(move(file), &reporter, true, 0); + string scratch; + WriteBatch batch; + Slice record; + stringstream row; + if (print_header_) { + 
cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)"; + if (print_values_) { + cout << " : value "; + } + cout << "\n"; + } + while(reader.ReadRecord(&record, &scratch)) { + row.str(""); + if (record.size() < 12) { + reporter.Corruption( + record.size(), Status::Corruption("log record too small")); + } else { + WriteBatchInternal::SetContents(&batch, record); + row<& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX, + ARG_KEY_HEX, + ARG_VALUE_HEX})) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " must be specified for the get command"); + } else { + key_ = params.at(0); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } +} + +void GetCommand::Help(string& ret) { + ret.append(" "); + ret.append(GetCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void GetCommand::DoCommand() { + string value; + Status st = db_->Get(ReadOptions(), key_, &value); + if (st.ok()) { + fprintf(stdout, "%s\n", + (is_value_hex_ ? 
StringToHex(value) : value).c_str()); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + + +ApproxSizeCommand::ApproxSizeCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO})) { + + if (options.find(ARG_FROM) != options.end()) { + start_key_ = options.find(ARG_FROM)->second; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FROM + + " must be specified for approxsize command"); + return; + } + + if (options.find(ARG_TO) != options.end()) { + end_key_ = options.find(ARG_TO)->second; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_TO + + " must be specified for approxsize command"); + return; + } + + if (is_key_hex_) { + start_key_ = HexToString(start_key_); + end_key_ = HexToString(end_key_); + } +} + +void ApproxSizeCommand::Help(string& ret) { + ret.append(" "); + ret.append(ApproxSizeCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); +} + +void ApproxSizeCommand::DoCommand() { + + Range ranges[1]; + ranges[0] = Range(start_key_, end_key_); + uint64_t sizes[1]; + db_->GetApproximateSizes(ranges, 1, sizes); + fprintf(stdout, "%lu\n", (unsigned long)sizes[0]); + /* Weird that GetApproximateSizes() returns void, although documentation + * says that it returns a Status object. 
+ if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } + */ +} + + +BatchPutCommand::BatchPutCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { + + if (params.size() < 2) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "At least one pair must be specified batchput."); + } else if (params.size() % 2 != 0) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "Equal number of s and s must be specified for batchput."); + } else { + for (size_t i = 0; i < params.size(); i += 2) { + string key = params.at(i); + string value = params.at(i+1); + key_values_.push_back(pair( + is_key_hex_ ? HexToString(key) : key, + is_value_hex_ ? HexToString(value) : value)); + } + } +} + +void BatchPutCommand::Help(string& ret) { + ret.append(" "); + ret.append(BatchPutCommand::Name()); + ret.append(" [ ] [..]"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void BatchPutCommand::DoCommand() { + WriteBatch batch; + + for (vector>::const_iterator itr + = key_values_.begin(); itr != key_values_.end(); itr++) { + batch.Put(itr->first, itr->second); + } + Status st = db_->Write(WriteOptions(), &batch); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +Options BatchPutCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); + return opt; +} + + +ScanCommand::ScanCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO, + ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, + ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})), + start_key_specified_(false), + end_key_specified_(false), + max_keys_scanned_(-1) { 
+ + map::const_iterator itr = options.find(ARG_FROM); + if (itr != options.end()) { + start_key_ = itr->second; + if (is_key_hex_) { + start_key_ = HexToString(start_key_); + } + start_key_specified_ = true; + } + itr = options.find(ARG_TO); + if (itr != options.end()) { + end_key_ = itr->second; + if (is_key_hex_) { + end_key_ = HexToString(end_key_); + } + end_key_specified_ = true; + } + + itr = options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { + max_keys_scanned_ = stoi(itr->second); + } catch(const invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has an invalid value"); + } catch(const out_of_range&) { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + + " has a value out-of-range"); + } + } +} + +void ScanCommand::Help(string& ret) { + ret.append(" "); + ret.append(ScanCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=q] "); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); +} + +void ScanCommand::DoCommand() { + + int num_keys_scanned = 0; + Iterator* it = db_->NewIterator(ReadOptions()); + if (start_key_specified_) { + it->Seek(start_key_); + } else { + it->SeekToFirst(); + } + int ttl_start; + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + ttl_start = DBWithTTL::kMinTimestamp; // TTL introduction time + } + int ttl_end; + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + ttl_end = DBWithTTL::kMaxTimestamp; // Max time allowed by TTL feature + } + if (ttl_end < ttl_start) { + fprintf(stderr, "Error: End time can't be less than start time\n"); + delete it; + return; + } + if (is_db_ttl_ && timestamp_) { + fprintf(stdout, "Scanning key-values from %s to %s\n", + ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + } + 
for ( ; + it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); + it->Next()) { + string key = it->key().ToString(); + if (is_db_ttl_) { + TtlIterator* it_ttl = dynamic_cast(it); + assert(it_ttl); + int rawtime = it_ttl->timestamp(); + if (rawtime < ttl_start || rawtime >= ttl_end) { + continue; + } + if (timestamp_) { + fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + } + } + string value = it->value().ToString(); + fprintf(stdout, "%s : %s\n", + (is_key_hex_ ? StringToHex(key) : key).c_str(), + (is_value_hex_ ? StringToHex(value) : value).c_str() + ); + num_keys_scanned++; + if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) { + break; + } + } + if (!it->status().ok()) { // Check for any errors found during the scan + exec_state_ = LDBCommandExecuteResult::FAILED(it->status().ToString()); + } + delete it; +} + + +DeleteCommand::DeleteCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "KEY must be specified for the delete command"); + } else { + key_ = params.at(0); + if (is_key_hex_) { + key_ = HexToString(key_); + } + } +} + +void DeleteCommand::Help(string& ret) { + ret.append(" "); + ret.append(DeleteCommand::Name() + " "); + ret.append("\n"); +} + +void DeleteCommand::DoCommand() { + Status st = db_->Delete(WriteOptions(), key_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + + +PutCommand::PutCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { + + if (params.size() != 2) { + exec_state_ = LDBCommandExecuteResult::FAILED( + " and must be specified for the put command"); + } else { 
+ key_ = params.at(0); + value_ = params.at(1); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } + + if (is_value_hex_) { + value_ = HexToString(value_); + } +} + +void PutCommand::Help(string& ret) { + ret.append(" "); + ret.append(PutCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void PutCommand::DoCommand() { + Status st = db_->Put(WriteOptions(), key_, value_); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +Options PutCommand::PrepareOptionsForOpenDB() { + Options opt = LDBCommand::PrepareOptionsForOpenDB(); + opt.create_if_missing = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); + return opt; +} + + +const char* DBQuerierCommand::HELP_CMD = "help"; +const char* DBQuerierCommand::GET_CMD = "get"; +const char* DBQuerierCommand::PUT_CMD = "put"; +const char* DBQuerierCommand::DELETE_CMD = "delete"; + +DBQuerierCommand::DBQuerierCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX})) { + +} + +void DBQuerierCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBQuerierCommand::Name()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); + ret.append(" Starts a REPL shell. 
Type help for list of available " + "commands."); + ret.append("\n"); +} + +void DBQuerierCommand::DoCommand() { + if (!db_) { + return; + } + + ReadOptions read_options; + WriteOptions write_options; + + string line; + string key; + string value; + while (getline(cin, line, '\n')) { + + // Parse line into vector + vector tokens; + size_t pos = 0; + while (true) { + size_t pos2 = line.find(' ', pos); + if (pos2 == string::npos) { + break; + } + tokens.push_back(line.substr(pos, pos2-pos)); + pos = pos2 + 1; + } + tokens.push_back(line.substr(pos)); + + const string& cmd = tokens[0]; + + if (cmd == HELP_CMD) { + fprintf(stdout, + "get \n" + "put \n" + "delete \n"); + } else if (cmd == DELETE_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + db_->Delete(write_options, Slice(key)); + fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str()); + } else if (cmd == PUT_CMD && tokens.size() == 3) { + key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); + value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]); + db_->Put(write_options, Slice(key), Slice(value)); + fprintf(stdout, "Successfully put %s %s\n", + tokens[1].c_str(), tokens[2].c_str()); + } else if (cmd == GET_CMD && tokens.size() == 2) { + key = (is_key_hex_ ? 
HexToString(tokens[1]) : tokens[1]); + if (db_->Get(read_options, Slice(key), &value).ok()) { + fprintf(stdout, "%s\n", PrintKeyValue(key, value, + is_key_hex_, is_value_hex_).c_str()); + } else { + fprintf(stdout, "Not found %s\n", tokens[1].c_str()); + } + } else { + fprintf(stdout, "Unknown command %s\n", line.c_str()); + } + } +} + +CheckConsistencyCommand::CheckConsistencyCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({})) { +} + +void CheckConsistencyCommand::Help(string& ret) { + ret.append(" "); + ret.append(CheckConsistencyCommand::Name()); + ret.append("\n"); +} + +void CheckConsistencyCommand::DoCommand() { + Options opt = PrepareOptionsForOpenDB(); + opt.paranoid_checks = true; + if (!exec_state_.IsNotStarted()) { + return; + } + DB* db; + Status st = DB::OpenForReadOnly(opt, db_path_, &db, false); + delete db; + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); + } +} + +} // namespace rocksdb diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h new file mode 100644 index 00000000..17343bb1 --- /dev/null +++ b/util/ldb_cmd.h @@ -0,0 +1,705 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include +#include +#include +#include +#include +#include + +#include "db/version_set.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "util/logging.h" +#include "util/ldb_cmd_execute_result.h" +#include "util/string_util.h" +#include "utilities/utility_db.h" +#include "utilities/ttl/db_ttl.h" + +using std::string; +using std::map; +using std::vector; +using std::ostringstream; + +namespace rocksdb { + +class LDBCommand { +public: + + // Command-line arguments + static const string ARG_DB; + static const string ARG_HEX; + static const string ARG_KEY_HEX; + static const string ARG_VALUE_HEX; + static const string ARG_TTL; + static const string ARG_TTL_START; + static const string ARG_TTL_END; + static const string ARG_TIMESTAMP; + static const string ARG_FROM; + static const string ARG_TO; + static const string ARG_MAX_KEYS; + static const string ARG_BLOOM_BITS; + static const string ARG_COMPRESSION_TYPE; + static const string ARG_BLOCK_SIZE; + static const string ARG_AUTO_COMPACTION; + static const string ARG_WRITE_BUFFER_SIZE; + static const string ARG_FILE_SIZE; + static const string ARG_CREATE_IF_MISSING; + + static LDBCommand* InitFromCmdLineArgs( + const vector& args, + const Options& options = Options() + ); + + static LDBCommand* InitFromCmdLineArgs( + int argc, + char** argv, + const Options& options = Options() + ); + + bool ValidateCmdLineOptions(); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void SetOptions(Options options) { + options_ = options; + } + + virtual bool NoDBOpen() { + return false; + } + + virtual ~LDBCommand() { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + /* Run the command, and return the execute result. 
*/ + void Run() { + if (!exec_state_.IsNotStarted()) { + return; + } + + if (db_ == nullptr && !NoDBOpen()) { + OpenDB(); + if (!exec_state_.IsNotStarted()) { + return; + } + } + + DoCommand(); + if (exec_state_.IsNotStarted()) { + exec_state_ = LDBCommandExecuteResult::SUCCEED(""); + } + + if (db_ != nullptr) { + CloseDB (); + } + } + + virtual void DoCommand() = 0; + + LDBCommandExecuteResult GetExecuteState() { + return exec_state_; + } + + void ClearPreviousRunState() { + exec_state_.Reset(); + } + + static string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; + } + + static string StringToHex(const string& str) { + string result = "0x"; + char buf[10]; + for (size_t i = 0; i < str.length(); i++) { + snprintf(buf, 10, "%02X", (unsigned char)str[i]); + result += buf; + } + return result; + } + + static const char* DELIM; + +protected: + + LDBCommandExecuteResult exec_state_; + string db_path_; + DB* db_; + StackableDB* sdb_; + + /** + * true implies that this command can work if the db is opened in read-only + * mode. + */ + bool is_read_only_; + + /** If true, the key is input/output as hex in get/put/scan/delete etc. */ + bool is_key_hex_; + + /** If true, the value is input/output as hex in get/put/scan/delete etc. */ + bool is_value_hex_; + + /** If true, the value is treated as timestamp suffixed */ + bool is_db_ttl_; + + // If true, the kvs are output with their insert/modify timestamp in a ttl db + bool timestamp_; + + /** + * Map of options passed on the command-line. + */ + const map option_map_; + + /** + * Flags passed on the command-line. 
+ */ + const vector flags_; + + /** List of command-line options valid for this command */ + const vector valid_cmd_line_options_; + + bool ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex); + + LDBCommand(const map& options, const vector& flags, + bool is_read_only, const vector& valid_cmd_line_options) : + db_(nullptr), + is_read_only_(is_read_only), + is_key_hex_(false), + is_value_hex_(false), + is_db_ttl_(false), + timestamp_(false), + option_map_(options), + flags_(flags), + valid_cmd_line_options_(valid_cmd_line_options) { + + map::const_iterator itr = options.find(ARG_DB); + if (itr != options.end()) { + db_path_ = itr->second; + } + + is_key_hex_ = IsKeyHex(options, flags); + is_value_hex_ = IsValueHex(options, flags); + is_db_ttl_ = IsFlagPresent(flags, ARG_TTL); + timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP); + } + + void OpenDB() { + Options opt = PrepareOptionsForOpenDB(); + if (!exec_state_.IsNotStarted()) { + return; + } + // Open the DB. + Status st; + if (is_db_ttl_) { + if (is_read_only_) { + st = UtilityDB::OpenTtlDB(opt, db_path_, &sdb_, 0, true); + } else { + st = UtilityDB::OpenTtlDB(opt, db_path_, &sdb_); + } + db_ = sdb_; + } else if (is_read_only_) { + st = DB::OpenForReadOnly(opt, db_path_, &db_); + } else { + st = DB::Open(opt, db_path_, &db_); + } + if (!st.ok()) { + string msg = st.ToString(); + exec_state_ = LDBCommandExecuteResult::FAILED(msg); + } + + options_ = opt; + } + + void CloseDB () { + if (db_ != nullptr) { + delete db_; + db_ = nullptr; + } + } + + static string PrintKeyValue(const string& key, const string& value, + bool is_key_hex, bool is_value_hex) { + string result; + result.append(is_key_hex ? StringToHex(key) : key); + result.append(DELIM); + result.append(is_value_hex ? 
StringToHex(value) : value); + return result; + } + + static string PrintKeyValue(const string& key, const string& value, + bool is_hex) { + return PrintKeyValue(key, value, is_hex, is_hex); + } + + /** + * Return true if the specified flag is present in the specified flags vector + */ + static bool IsFlagPresent(const vector& flags, const string& flag) { + return (std::find(flags.begin(), flags.end(), flag) != flags.end()); + } + + static string HelpRangeCmdArgs() { + ostringstream str_stream; + str_stream << " "; + str_stream << "[--" << ARG_FROM << "] "; + str_stream << "[--" << ARG_TO << "] "; + return str_stream.str(); + } + + /** + * A helper function that returns a list of command line options + * used by this command. It includes the common options and the ones + * passed in. + */ + vector BuildCmdLineOptions(vector options) { + vector ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, + ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE, + ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE}; + ret.insert(ret.end(), options.begin(), options.end()); + return ret; + } + + bool ParseIntOption(const map& options, const string& option, + int& value, LDBCommandExecuteResult& exec_state); + + bool ParseStringOption(const map& options, + const string& option, string* value); + + Options options_; + +private: + + /** + * Interpret command line options and flags to determine if the key + * should be input/output in hex. + */ + bool IsKeyHex(const map& options, + const vector& flags) { + return (IsFlagPresent(flags, ARG_HEX) || + IsFlagPresent(flags, ARG_KEY_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_KEY_HEX, false)); + } + + /** + * Interpret command line options and flags to determine if the value + * should be input/output in hex. 
+ */ + bool IsValueHex(const map& options, + const vector& flags) { + return (IsFlagPresent(flags, ARG_HEX) || + IsFlagPresent(flags, ARG_VALUE_HEX) || + ParseBooleanOption(options, ARG_HEX, false) || + ParseBooleanOption(options, ARG_VALUE_HEX, false)); + } + + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const map& options, + const string& option, bool default_val) { + + map::const_iterator itr = options.find(option); + if (itr != options.end()) { + string option_val = itr->second; + return StringToBool(itr->second); + } + return default_val; + } + + /** + * Converts val to a boolean. + * val must be either true or false (case insensitive). + * Otherwise an exception is thrown. + */ + bool StringToBool(string val) { + std::transform(val.begin(), val.end(), val.begin(), ::tolower); + if (val == "true") { + return true; + } else if (val == "false") { + return false; + } else { + throw "Invalid value for boolean argument"; + } + } + + static LDBCommand* SelectCommand( + const string& cmd, + const vector& cmdParams, + const map& option_map, + const vector& flags + ); + +}; + +class CompactorCommand: public LDBCommand { +public: + static string Name() { return "compact"; } + + CompactorCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool null_from_; + string from_; + bool null_to_; + string to_; +}; + +class DBDumperCommand: public LDBCommand { +public: + static string Name() { return "dump"; } + + DBDumperCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool null_from_; + string from_; + bool null_to_; + string to_; + int max_keys_; + string delim_; + 
bool count_only_; + bool count_delim_; + bool print_stats_; + + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_TTL_BUCKET; +}; + +class InternalDumpCommand: public LDBCommand { +public: + static string Name() { return "idump"; } + + InternalDumpCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool has_from_; + string from_; + bool has_to_; + string to_; + int max_keys_; + string delim_; + bool count_only_; + bool count_delim_; + bool print_stats_; + bool is_input_key_hex_; + + static const string ARG_DELIM; + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_INPUT_KEY_HEX; +}; + +class DBLoaderCommand: public LDBCommand { +public: + static string Name() { return "load"; } + + DBLoaderCommand(string& db_name, vector& args); + + DBLoaderCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual Options PrepareOptionsForOpenDB(); + +private: + bool create_if_missing_; + bool disable_wal_; + bool bulk_load_; + bool compact_; + + static const string ARG_DISABLE_WAL; + static const string ARG_BULK_LOAD; + static const string ARG_COMPACT; +}; + +class ManifestDumpCommand: public LDBCommand { +public: + static string Name() { return "manifest_dump"; } + + ManifestDumpCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + +private: + bool verbose_; + string path_; + + static const string ARG_VERBOSE; + static const string ARG_PATH; +}; + +class ReduceDBLevelsCommand : public LDBCommand { +public: + static string Name() { return "reduce_levels"; } + + ReduceDBLevelsCommand(const 
vector& params, + const map& options, const vector& flags); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& msg); + + static vector PrepareArgs(const string& db_path, int new_levels, + bool print_old_level = false); + +private: + int old_levels_; + int new_levels_; + bool print_old_levels_; + + static const string ARG_NEW_LEVELS; + static const string ARG_PRINT_OLD_LEVELS; + + Status GetOldNumOfLevels(Options& opt, int* levels); +}; + +class ChangeCompactionStyleCommand : public LDBCommand { +public: + static string Name() { return "change_compaction_style"; } + + ChangeCompactionStyleCommand(const vector& params, + const map& options, const vector& flags); + + virtual Options PrepareOptionsForOpenDB(); + + virtual void DoCommand(); + + static void Help(string& msg); + +private: + int old_compaction_style_; + int new_compaction_style_; + + static const string ARG_OLD_COMPACTION_STYLE; + static const string ARG_NEW_COMPACTION_STYLE; +}; + +class WALDumperCommand : public LDBCommand { +public: + static string Name() { return "dump_wal"; } + + WALDumperCommand(const vector& params, + const map& options, const vector& flags); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& ret); + virtual void DoCommand(); + +private: + bool print_header_; + string wal_file_; + bool print_values_; + + static const string ARG_WAL_FILE; + static const string ARG_PRINT_HEADER; + static const string ARG_PRINT_VALUE; +}; + + +class GetCommand : public LDBCommand { +public: + static string Name() { return "get"; } + + GetCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string key_; +}; + +class ApproxSizeCommand : public LDBCommand { +public: + static string Name() { return "approxsize"; } + + ApproxSizeCommand(const vector& params, + const map& options, 
const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string start_key_; + string end_key_; +}; + +class BatchPutCommand : public LDBCommand { +public: + static string Name() { return "batchput"; } + + BatchPutCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + + virtual Options PrepareOptionsForOpenDB(); + +private: + /** + * The key-values to be inserted. + */ + vector> key_values_; +}; + +class ScanCommand : public LDBCommand { +public: + static string Name() { return "scan"; } + + ScanCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string start_key_; + string end_key_; + bool start_key_specified_; + bool end_key_specified_; + int max_keys_scanned_; +}; + +class DeleteCommand : public LDBCommand { +public: + static string Name() { return "delete"; } + + DeleteCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + +private: + string key_; +}; + +class PutCommand : public LDBCommand { +public: + static string Name() { return "put"; } + + PutCommand(const vector& params, const map& options, + const vector& flags); + + virtual void DoCommand(); + + static void Help(string& ret); + + virtual Options PrepareOptionsForOpenDB(); + +private: + string key_; + string value_; +}; + +/** + * Command that starts up a REPL shell that allows + * get/put/delete. 
+ */ +class DBQuerierCommand: public LDBCommand { +public: + static string Name() { return "query"; } + + DBQuerierCommand(const vector& params, + const map& options, const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + static const char* HELP_CMD; + static const char* GET_CMD; + static const char* PUT_CMD; + static const char* DELETE_CMD; +}; + +class CheckConsistencyCommand : public LDBCommand { +public: + static string Name() { return "checkconsistency"; } + + CheckConsistencyCommand(const vector& params, + const map& options, const vector& flags); + + virtual void DoCommand(); + + virtual bool NoDBOpen() { + return true; + } + + static void Help(string& ret); +}; + +} // namespace rocksdb diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h new file mode 100644 index 00000000..b9121b2b --- /dev/null +++ b/util/ldb_cmd_execute_result.h @@ -0,0 +1,76 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once + +namespace rocksdb { + +class LDBCommandExecuteResult { +public: + enum State { + EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2, + }; + + LDBCommandExecuteResult() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + LDBCommandExecuteResult(State state, std::string& msg) { + state_ = state; + message_ = msg; + } + + std::string ToString() { + std::string ret; + switch (state_) { + case EXEC_SUCCEED: + break; + case EXEC_FAILED: + ret.append("Failed: "); + break; + case EXEC_NOT_STARTED: + ret.append("Not started: "); + } + if (!message_.empty()) { + ret.append(message_); + } + return ret; + } + + void Reset() { + state_ = EXEC_NOT_STARTED; + message_ = ""; + } + + bool IsSucceed() { + return state_ == EXEC_SUCCEED; + } + + bool IsNotStarted() { + return state_ == EXEC_NOT_STARTED; + } + + bool IsFailed() { + return state_ == EXEC_FAILED; + } + + static LDBCommandExecuteResult SUCCEED(std::string msg) { + return LDBCommandExecuteResult(EXEC_SUCCEED, msg); + } + + static LDBCommandExecuteResult FAILED(std::string msg) { + return LDBCommandExecuteResult(EXEC_FAILED, msg); + } + +private: + State state_; + std::string message_; + + bool operator==(const LDBCommandExecuteResult&); + bool operator!=(const LDBCommandExecuteResult&); +}; + +} diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc new file mode 100644 index 00000000..10d5a1fa --- /dev/null +++ b/util/ldb_tool.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include "rocksdb/ldb_tool.h" +#include "util/ldb_cmd.h" + +namespace rocksdb { + +class LDBCommandRunner { +public: + + static void PrintHelp(const char* exec_name) { + string ret; + + ret.append("ldb - LevelDB Tool"); + ret.append("\n\n"); + ret.append("commands MUST specify --" + LDBCommand::ARG_DB + + "= when necessary\n"); + ret.append("\n"); + ret.append("The following optional parameters control if keys/values are " + "input/output as hex or as plain strings:\n"); + ret.append(" --" + LDBCommand::ARG_KEY_HEX + + " : Keys are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_VALUE_HEX + + " : Values are input/output as hex\n"); + ret.append(" --" + LDBCommand::ARG_HEX + + " : Both keys and values are input/output as hex\n"); + ret.append("\n"); + + ret.append("The following optional parameters control the database " + "internals:\n"); + ret.append(" --" + LDBCommand::ARG_TTL + + " with 'put','get','scan','dump','query','batchput'" + " : DB supports ttl and value is internally timestamp-suffixed\n"); + ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=\n"); + ret.append(" --" + LDBCommand::ARG_COMPRESSION_TYPE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_AUTO_COMPACTION + "=\n"); + ret.append(" --" + LDBCommand::ARG_WRITE_BUFFER_SIZE + + "=\n"); + ret.append(" --" + LDBCommand::ARG_FILE_SIZE + "=\n"); + + ret.append("\n\n"); + ret.append("Data Access Commands:\n"); + PutCommand::Help(ret); + GetCommand::Help(ret); + BatchPutCommand::Help(ret); + ScanCommand::Help(ret); + DeleteCommand::Help(ret); + DBQuerierCommand::Help(ret); + ApproxSizeCommand::Help(ret); + CheckConsistencyCommand::Help(ret); + + ret.append("\n\n"); + ret.append("Admin Commands:\n"); + WALDumperCommand::Help(ret); + CompactorCommand::Help(ret); + ReduceDBLevelsCommand::Help(ret); + ChangeCompactionStyleCommand::Help(ret); + DBDumperCommand::Help(ret); + DBLoaderCommand::Help(ret); + 
ManifestDumpCommand::Help(ret); + InternalDumpCommand::Help(ret); + + fprintf(stderr, "%s\n", ret.c_str()); + } + + static void RunCommand(int argc, char** argv, Options options) { + if (argc <= 2) { + PrintHelp(argv[0]); + exit(1); + } + + LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options); + if (cmdObj == nullptr) { + fprintf(stderr, "Unknown command\n"); + PrintHelp(argv[0]); + exit(1); + } + + if (!cmdObj->ValidateCmdLineOptions()) { + exit(1); + } + + cmdObj->Run(); + LDBCommandExecuteResult ret = cmdObj->GetExecuteState(); + fprintf(stderr, "%s\n", ret.ToString().c_str()); + delete cmdObj; + + exit(ret.IsFailed()); + } + +}; + + +void LDBTool::Run(int argc, char** argv, Options options) { + LDBCommandRunner::RunCommand(argc, argv, options); +} +} // namespace rocksdb + diff --git a/util/log_buffer.cc b/util/log_buffer.cc new file mode 100644 index 00000000..726c0144 --- /dev/null +++ b/util/log_buffer.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "util/log_buffer.h" + +#include + +namespace rocksdb { + +LogBuffer::LogBuffer(const InfoLogLevel log_level, + Logger*info_log) + : log_level_(log_level), info_log_(info_log) {} + +void LogBuffer::AddLogToBuffer(const char* format, va_list ap) { + if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) { + // Skip the level because of its level. 
+ return; + } + + const size_t kLogSizeLimit = 512; + char* alloc_mem = arena_.AllocateAligned(kLogSizeLimit); + BufferedLog* buffered_log = new (alloc_mem) BufferedLog(); + char* p = buffered_log->message; + char* limit = alloc_mem + kLogSizeLimit - 1; + + // store the time + gettimeofday(&(buffered_log->now_tv), nullptr); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + auto n = vsnprintf(p, limit - p, format, backup_ap); + assert(n >= 0); + p += n; + va_end(backup_ap); + } + + if (p > limit) { + p = limit; + } + + // Add '\0' to the end + *p = '\0'; + + logs_.push_back(buffered_log); +} + +void LogBuffer::FlushBufferToLog() { + for (BufferedLog* log : logs_) { + const time_t seconds = log->now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + Log(log_level_, info_log_, + "(Original Log Time %04d/%02d/%02d-%02d:%02d:%02d.%06d) %s", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, + t.tm_sec, static_cast(log->now_tv.tv_usec), log->message); + } + logs_.clear(); +} + +void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) { + if (log_buffer != nullptr) { + va_list ap; + va_start(ap, format); + log_buffer->AddLogToBuffer(format, ap); + va_end(ap); + } +} + +} // namespace rocksdb diff --git a/util/log_buffer.h b/util/log_buffer.h new file mode 100644 index 00000000..8ebe92e0 --- /dev/null +++ b/util/log_buffer.h @@ -0,0 +1,48 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "rocksdb/env.h" +#include "util/arena.h" +#include "util/autovector.h" + +namespace rocksdb { + +class Logger; + +// A class to buffer info log entries and flush them in the end. 
+class LogBuffer { + public: + // log_level: the log level for all the logs + // info_log: logger to write the logs to + LogBuffer(const InfoLogLevel log_level, Logger* info_log); + + // Add a log entry to the buffer. + void AddLogToBuffer(const char* format, va_list ap); + + size_t IsEmpty() const { return logs_.empty(); } + + // Flush all buffered log to the info log. + void FlushBufferToLog(); + + private: + // One log entry with its timestamp + struct BufferedLog { + struct timeval now_tv; // Timestamp of the log + char message[1]; // Beginning of log message + }; + + const InfoLogLevel log_level_; + Logger* info_log_; + Arena arena_; + autovector logs_; +}; + +// Add log to the LogBuffer for a delayed info logging. It can be used when +// we want to add some logs inside a mutex. +extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...); + +} // namespace rocksdb diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc new file mode 100644 index 00000000..642d4e83 --- /dev/null +++ b/util/log_write_bench.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include + +#include "rocksdb/env.h" +#include "util/histogram.h" +#include "util/testharness.h" +#include "util/testutil.h" + +// A simple benchmark to simulate transactional logs + +DEFINE_int32(num_records, 6000, "Size of each record."); +DEFINE_int32(record_size, 249, "Size of each record."); +DEFINE_int32(record_interval, 10000, "Interval between records (microSec)"); +DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions"); +DEFINE_bool(enable_sync, false, "sync after each write."); + +namespace rocksdb { +void RunBenchmark() { + std::string file_name = test::TmpDir() + "/log_write_bench.log"; + Env* env = Env::Default(); + EnvOptions env_options; + env_options.use_mmap_writes = false; + env_options.bytes_per_sync = FLAGS_bytes_per_sync; + unique_ptr file; + env->NewWritableFile(file_name, &file, env_options); + + std::string record; + record.assign('X', FLAGS_record_size); + + HistogramImpl hist; + + uint64_t start_time = env->NowMicros(); + for (int i = 0; i < FLAGS_num_records; i++) { + uint64_t start_nanos = env->NowNanos(); + file->Append(record); + file->Flush(); + if (FLAGS_enable_sync) { + file->Sync(); + } + hist.Add(env->NowNanos() - start_nanos); + + if (i % 1000 == 1) { + fprintf(stderr, "Wrote %d records...\n", i); + } + + int time_to_sleep = + (i + 1) * FLAGS_record_interval - (env->NowMicros() - start_time); + if (time_to_sleep > 0) { + env->SleepForMicroseconds(time_to_sleep); + } + } + + fprintf(stderr, "Distribution of latency of append+flush: \n%s", + hist.ToString().c_str()); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + google::ParseCommandLineFlags(&argc, &argv, true); + + rocksdb::RunBenchmark(); + return 0; +} diff --git a/util/logging.cc b/util/logging.cc new file mode 100644 index 00000000..69734134 --- /dev/null +++ b/util/logging.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "util/logging.h"

// NOTE(review): the next four header names were stripped by HTML extraction
// (likely <errno.h>, <stdarg.h>, <stdio.h>, <string.h> or similar) — confirm
// against the upstream file before building.
#include
#include
#include
#include
#include "rocksdb/env.h"
#include "rocksdb/slice.h"

namespace rocksdb {

// Appends the decimal (base-10) representation of "num" to *str.
void AppendNumberTo(std::string* str, uint64_t num) {
  char buf[30];  // 30 chars comfortably holds any 64-bit value plus NUL
  snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num);
  str->append(buf);
}

// Appends "value" to *str, copying printable ASCII bytes through unchanged
// and escaping every other byte as "\xNN".
void AppendEscapedStringTo(std::string* str, const Slice& value) {
  for (size_t i = 0; i < value.size(); i++) {
    char c = value[i];
    if (c >= ' ' && c <= '~') {
      // Printable ASCII range: emit as-is.
      str->push_back(c);
    } else {
      char buf[10];
      // Mask with 0xff so a (possibly signed) char prints as an unsigned
      // byte value. NOTE(review): the cast's template argument was lost in
      // extraction (presumably static_cast<unsigned int>) — confirm upstream.
      snprintf(buf, sizeof(buf), "\\x%02x",
               static_cast(c) & 0xff);
      str->append(buf);
    }
  }
}

// Returns the decimal representation of "num" as a new string.
std::string NumberToString(uint64_t num) {
  std::string r;
  AppendNumberTo(&r, num);
  return r;
}

// Returns "value" with non-printable bytes escaped (see
// AppendEscapedStringTo) as a new string.
std::string EscapeString(const Slice& value) {
  std::string r;
  AppendEscapedStringTo(&r, value);
  return r;
}

// If *in starts with "c", consumes that character and returns true;
// otherwise leaves *in unchanged and returns false.
bool ConsumeChar(Slice* in, char c) {
  if (!in->empty() && (*in)[0] == c) {
    in->remove_prefix(1);
    return true;
  } else {
    return false;
  }
}

// Parses a leading run of decimal digits from *in into *val, consuming the
// digits. Returns false on overflow (uint64_t) or if *in does not start
// with a digit.
bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
  uint64_t v = 0;
  int digits = 0;  // how many digits were consumed; 0 => failure
  while (!in->empty()) {
    char c = (*in)[0];
    if (c >= '0' && c <= '9') {
      ++digits;
      const unsigned int delta = (c - '0');
      // NOTE(review): the cast's template argument was lost in extraction
      // (presumably static_cast<uint64_t>) — confirm upstream.
      static const uint64_t kMaxUint64 = ~static_cast(0);
      // Reject the digit if v*10 + delta would exceed kMaxUint64.
      if (v > kMaxUint64/10 ||
          (v == kMaxUint64/10 && delta > kMaxUint64%10)) {
        // Overflow
        return false;
      }
      v = (v * 10) + delta;
      in->remove_prefix(1);
    } else {
      break;  // first non-digit ends the number
    }
  }
  *val = v;
  return (digits > 0);
}

}  // namespace rocksdb
diff --git a/util/logging.h b/util/logging.h new file mode 100644 index 00000000..411c83be --- /dev/null +++ b/util/logging.h @@ -0,0 +1,48 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Must not be included from any .h files to avoid polluting the namespace +// with macros. + +#pragma once +#include +#include +#include +#include "port/port.h" + +namespace rocksdb { + +class Slice; +class WritableFile; + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Return a human-readable printout of "num" +extern std::string NumberToString(uint64_t num); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// If *in starts with "c", advances *in past the first character and +// returns true. Otherwise, returns false. +extern bool ConsumeChar(Slice* in, char c); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. 
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +} // namespace rocksdb diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc new file mode 100644 index 00000000..dd615f05 --- /dev/null +++ b/util/manual_compaction_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Test for issue 178: a manual compaction causes deleted data to reappear. +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" +#include "rocksdb/write_batch.h" +#include "util/testharness.h" + +using namespace rocksdb; + +namespace { + +const int kNumKeys = 1100000; + +std::string Key1(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "my_key_%d", i); + return buf; +} + +std::string Key2(int i) { + return Key1(i) + "_xxx"; +} + +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old run. 
+ dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} + +TEST(ManualCompactionTest, Test) { + + // Open database. Disable compression since it affects the creation + // of layers and the code below is trying to test against a very + // specific scenario. 
+ rocksdb::DB* db; + rocksdb::Options db_options; + db_options.create_if_missing = true; + db_options.compression = rocksdb::kNoCompression; + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); + + // create first key range + rocksdb::WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key1(i), "value for range 1 key"); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // create second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key2(i), "value for range 2 key"); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // delete second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Delete(Key2(i)); + } + ASSERT_OK(db->Write(rocksdb::WriteOptions(), &batch)); + + // compact database + std::string start_key = Key1(0); + std::string end_key = Key1(kNumKeys - 1); + rocksdb::Slice least(start_key.data(), start_key.size()); + rocksdb::Slice greatest(end_key.data(), end_key.size()); + + // commenting out the line below causes the example to work correctly + db->CompactRange(&least, &greatest); + + // count the keys + rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions()); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + delete iter; + ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; + + // close database + delete db; + DestroyDB(dbname_, rocksdb::Options()); +} + +} // anonymous namespace + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/murmurhash.cc b/util/murmurhash.cc new file mode 100644 index 00000000..d9d8b706 --- /dev/null +++ b/util/murmurhash.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. +*/ +#include "murmurhash.h" + +#if defined(__x86_64__) + +// ------------------------------------------------------------------- +// +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. +// +// 64-bit hash for 64-bit platforms + +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; + case 6: h ^= ((uint64_t)data2[5]) << 40; + case 5: h ^= ((uint64_t)data2[4]) << 32; + case 4: h ^= ((uint64_t)data2[3]) << 24; + case 3: h ^= ((uint64_t)data2[2]) << 16; + case 2: h ^= ((uint64_t)data2[1]) << 8; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. 

// 32-bit MurmurHash2 (Austin Appleby). Assumes unaligned 4-byte loads are
// safe and sizeof(int) == 4; output is endian-dependent (see the caveats
// noted above).
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
{
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.

  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value

  unsigned int h = seed ^ len;

  // Mix 4 bytes at a time into the hash

  const unsigned char * data = (const unsigned char *)key;

  while(len >= 4)
  {
    unsigned int k = *(unsigned int *)data;  // unaligned read (see caveats)

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array.
  // The case fall-throughs below are deliberate: each remaining byte is
  // folded into h before the final multiply.

  switch(len)
  {
  case 3: h ^= data[2] << 16;  // fall through
  case 2: h ^= data[1] << 8;   // fall through
  case 1: h ^= data[0];
      h *= m;
  };

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.

  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}

#else

// -------------------------------------------------------------------
//
// Same as MurmurHash2, but endian- and alignment-neutral.
// Half the speed though, alas.

unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed )
{
  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  unsigned int h = seed ^ len;

  const unsigned char * data = (const unsigned char *)key;

  while(len >= 4)
  {
    unsigned int k;

    // Assemble each 4-byte word little-endian, byte by byte, so the result
    // matches on any platform and needs no aligned loads.
    k = data[0];
    k |= data[1] << 8;
    k |= data[2] << 16;
    k |= data[3] << 24;

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Tail bytes; deliberate fall-throughs as in MurmurHash2 above.
  switch(len)
  {
  case 3: h ^= data[2] << 16;  // fall through
  case 2: h ^= data[1] << 8;   // fall through
  case 1: h ^= data[0];
      h *= m;
  };

  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}

#endif
diff --git a/util/murmurhash.h b/util/murmurhash.h new file mode 100644 index 00000000..faa86556 --- /dev/null +++ b/util/murmurhash.h @@ -0,0 +1,42 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash is + under the MIT license. +*/ +#pragma once +#include +#include "rocksdb/slice.h" + +#if defined(__x86_64__) +#define MURMUR_HASH MurmurHash64A +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash64A +typedef uint64_t murmur_t; + +#elif defined(__i386__) +#define MURMUR_HASH MurmurHash2 +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHash2 +typedef unsigned int murmur_t; + +#else +#define MURMUR_HASH MurmurHashNeutral2 +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); +#define MurmurHash MurmurHashNeutral2 +typedef unsigned int murmur_t; +#endif + +// Allow slice to be hashable by murmur hash. +namespace rocksdb { +struct murmur_hash { + size_t operator()(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0); + } +}; +} // rocksdb diff --git a/util/mutexlock.h b/util/mutexlock.h new file mode 100644 index 00000000..0f4e5c8b --- /dev/null +++ b/util/mutexlock.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 

#pragma once
#include "port/port.h"

namespace rocksdb {

// Helper class that locks a mutex on construction and unlocks the mutex when
// the destructor of the MutexLock object is invoked.
//
// Typical usage:
//
//   void MyClass::MyMethod() {
//     MutexLock l(&mu_);   // mu_ is an instance variable
//     ... some complex code, possibly with multiple return paths ...
//   }

class MutexLock {
 public:
  explicit MutexLock(port::Mutex *mu) : mu_(mu) {
    this->mu_->Lock();
  }
  ~MutexLock() { this->mu_->Unlock(); }

 private:
  port::Mutex *const mu_;
  // No copying allowed (declared but not defined; pre-C++11 idiom for
  // "= delete")
  MutexLock(const MutexLock&);
  void operator=(const MutexLock&);
};

//
// Acquire a ReadLock on the specified RWMutex.
// The Lock will be automatically released when the
// object goes out of scope.
//
class ReadLock {
 public:
  explicit ReadLock(port::RWMutex *mu) : mu_(mu) {
    this->mu_->ReadLock();
  }
  ~ReadLock() { this->mu_->Unlock(); }

 private:
  port::RWMutex *const mu_;
  // No copying allowed
  ReadLock(const ReadLock&);
  void operator=(const ReadLock&);
};


//
// Acquire a WriteLock on the specified RWMutex.
// The Lock will be automatically released when the
// object goes out of scope.
//
class WriteLock {
 public:
  explicit WriteLock(port::RWMutex *mu) : mu_(mu) {
    this->mu_->WriteLock();
  }
  ~WriteLock() { this->mu_->Unlock(); }

 private:
  port::RWMutex *const mu_;
  // No copying allowed
  WriteLock(const WriteLock&);
  void operator=(const WriteLock&);
};

}  // namespace rocksdb
diff --git a/util/options.cc b/util/options.cc new file mode 100644 index 00000000..5c5bab55 --- /dev/null +++ b/util/options.cc @@ -0,0 +1,361 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/options.h" + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block_based_table_factory.h" + +namespace rocksdb { + +Options::Options() + : comparator(BytewiseComparator()), + merge_operator(nullptr), + compaction_filter(nullptr), + compaction_filter_factory(std::shared_ptr( + new DefaultCompactionFilterFactory())), + compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(true), + env(Env::Default()), + info_log(nullptr), + info_log_level(INFO), + write_buffer_size(4 << 20), + max_write_buffer_number(2), + min_write_buffer_number_to_merge(1), + max_open_files(5000), + block_cache(nullptr), + block_cache_compressed(nullptr), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(nullptr), + prefix_extractor(nullptr), + whole_key_filtering(true), + num_levels(7), + level0_file_num_compaction_trigger(4), + level0_slowdown_writes_trigger(20), + level0_stop_writes_trigger(24), + max_mem_compaction_level(2), + target_file_size_base(2 * 1048576), + target_file_size_multiplier(1), + max_bytes_for_level_base(10 * 1048576), + max_bytes_for_level_multiplier(10), + max_bytes_for_level_multiplier_additional(num_levels, 1), + expanded_compaction_factor(25), + source_compaction_factor(1), + max_grandparent_overlap_factor(10), + disableDataSync(false), + 
use_fsync(false), + db_stats_log_interval(1800), + db_log_dir(""), + wal_dir(""), + disable_seek_compaction(true), + delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL), + max_background_compactions(1), + max_background_flushes(1), + max_log_file_size(0), + log_file_time_to_roll(0), + keep_log_file_num(1000), + soft_rate_limit(0.0), + hard_rate_limit(0.0), + rate_limit_delay_max_milliseconds(1000), + max_manifest_file_size(std::numeric_limits::max()), + no_block_cache(false), + table_cache_numshardbits(4), + table_cache_remove_scan_count_limit(16), + arena_block_size(0), + disable_auto_compactions(false), + WAL_ttl_seconds(0), + WAL_size_limit_MB(0), + manifest_preallocation_size(4 * 1024 * 1024), + purge_redundant_kvs_while_flush(true), + allow_os_buffer(true), + allow_mmap_reads(false), + allow_mmap_writes(false), + is_fd_close_on_exec(true), + skip_log_error_on_recovery(false), + stats_dump_period_sec(3600), + block_size_deviation(10), + advise_random_on_open(true), + access_hint_on_compaction_start(NORMAL), + use_adaptive_mutex(false), + bytes_per_sync(0), + compaction_style(kCompactionStyleLevel), + verify_checksums_in_compaction(true), + filter_deletes(false), + max_sequential_skip_in_iterations(8), + memtable_factory(std::shared_ptr(new SkipListFactory)), + table_factory( + std::shared_ptr(new BlockBasedTableFactory())), + inplace_update_support(false), + inplace_update_num_locks(10000), + inplace_callback(nullptr), + memtable_prefix_bloom_bits(0), + memtable_prefix_bloom_probes(6), + bloom_locality(0), + max_successive_merges(0), + min_partial_merge_operands(2), + allow_thread_local(true) { + assert(memtable_factory.get() != nullptr); +} + +static const char* const access_hints[] = { + "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" +}; + +void +Options::Dump(Logger* log) const +{ + Log(log," Options.comparator: %s", comparator->Name()); + Log(log," Options.merge_operator: %s", + merge_operator? 
merge_operator->Name() : "None"); + Log(log," Options.compaction_filter: %s", + compaction_filter? compaction_filter->Name() : "None"); + Log(log," Options.compaction_filter_factory: %s", + compaction_filter_factory->Name()); + Log(log, " Options.compaction_filter_factory_v2: %s", + compaction_filter_factory_v2->Name()); + Log(log," Options.memtable_factory: %s", + memtable_factory->Name()); + Log(log," Options.table_factory: %s", table_factory->Name()); + Log(log," Options.error_if_exists: %d", error_if_exists); + Log(log," Options.create_if_missing: %d", create_if_missing); + Log(log," Options.paranoid_checks: %d", paranoid_checks); + Log(log," Options.env: %p", env); + Log(log," Options.info_log: %p", info_log.get()); + Log(log," Options.write_buffer_size: %zd", write_buffer_size); + Log(log," Options.max_write_buffer_number: %d", max_write_buffer_number); + Log(log," Options.max_open_files: %d", max_open_files); + Log(log," Options.block_cache: %p", block_cache.get()); + Log(log," Options.block_cache_compressed: %p", + block_cache_compressed.get()); + if (block_cache) { + Log(log," Options.block_cache_size: %zd", + block_cache->GetCapacity()); + } + if (block_cache_compressed) { + Log(log,"Options.block_cache_compressed_size: %zd", + block_cache_compressed->GetCapacity()); + } + Log(log," Options.block_size: %zd", block_size); + Log(log," Options.block_restart_interval: %d", block_restart_interval); + if (!compression_per_level.empty()) { + for (unsigned int i = 0; i < compression_per_level.size(); i++) { + Log(log," Options.compression[%d]: %d", + i, compression_per_level[i]); + } + } else { + Log(log," Options.compression: %d", compression); + } + Log(log," Options.filter_policy: %s", + filter_policy == nullptr ? "nullptr" : filter_policy->Name()); + Log(log," Options.prefix_extractor: %s", + prefix_extractor == nullptr ? 
"nullptr" : prefix_extractor->Name()); + Log(log," Options.whole_key_filtering: %d", whole_key_filtering); + Log(log," Options.num_levels: %d", num_levels); + Log(log," Options.disableDataSync: %d", disableDataSync); + Log(log," Options.use_fsync: %d", use_fsync); + Log(log," Options.max_log_file_size: %zu", max_log_file_size); + Log(log,"Options.max_manifest_file_size: %lu", + (unsigned long)max_manifest_file_size); + Log(log," Options.log_file_time_to_roll: %zu", log_file_time_to_roll); + Log(log," Options.keep_log_file_num: %zu", keep_log_file_num); + Log(log," Options.db_stats_log_interval: %d", + db_stats_log_interval); + Log(log," Options.allow_os_buffer: %d", allow_os_buffer); + Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads); + Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes); + Log(log," Options.min_write_buffer_number_to_merge: %d", + min_write_buffer_number_to_merge); + Log(log," Options.purge_redundant_kvs_while_flush: %d", + purge_redundant_kvs_while_flush); + Log(log," Options.compression_opts.window_bits: %d", + compression_opts.window_bits); + Log(log," Options.compression_opts.level: %d", + compression_opts.level); + Log(log," Options.compression_opts.strategy: %d", + compression_opts.strategy); + Log(log," Options.level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + Log(log," Options.level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + Log(log," Options.level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + Log(log," Options.max_mem_compaction_level: %d", + max_mem_compaction_level); + Log(log," Options.target_file_size_base: %d", + target_file_size_base); + Log(log," Options.target_file_size_multiplier: %d", + target_file_size_multiplier); + Log(log," Options.max_bytes_for_level_base: %lu", + (unsigned long)max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_multiplier: %d", + max_bytes_for_level_multiplier); + for (int i = 0; i < num_levels; i++) { 
+ Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", + i, max_bytes_for_level_multiplier_additional[i]); + } + Log(log," Options.max_sequential_skip_in_iterations: %lu", + (unsigned long)max_sequential_skip_in_iterations); + Log(log," Options.expanded_compaction_factor: %d", + expanded_compaction_factor); + Log(log," Options.source_compaction_factor: %d", + source_compaction_factor); + Log(log," Options.max_grandparent_overlap_factor: %d", + max_grandparent_overlap_factor); + Log(log," Options.db_log_dir: %s", + db_log_dir.c_str()); + Log(log," Options.wal_dir: %s", + wal_dir.c_str()); + Log(log," Options.disable_seek_compaction: %d", + disable_seek_compaction); + Log(log," Options.no_block_cache: %d", + no_block_cache); + Log(log," Options.table_cache_numshardbits: %d", + table_cache_numshardbits); + Log(log," Options.table_cache_remove_scan_count_limit: %d", + table_cache_remove_scan_count_limit); + Log(log," Options.arena_block_size: %zu", + arena_block_size); + Log(log," Options.delete_obsolete_files_period_micros: %lu", + (unsigned long)delete_obsolete_files_period_micros); + Log(log," Options.max_background_compactions: %d", + max_background_compactions); + Log(log," Options.max_background_flushes: %d", + max_background_flushes); + Log(log," Options.soft_rate_limit: %.2f", + soft_rate_limit); + Log(log," Options.hard_rate_limit: %.2f", + hard_rate_limit); + Log(log," Options.rate_limit_delay_max_milliseconds: %u", + rate_limit_delay_max_milliseconds); + Log(log," Options.disable_auto_compactions: %d", + disable_auto_compactions); + Log(log," Options.WAL_ttl_seconds: %lu", + (unsigned long)WAL_ttl_seconds); + Log(log," Options.WAL_size_limit_MB: %lu", + (unsigned long)WAL_size_limit_MB); + Log(log," Options.manifest_preallocation_size: %zu", + manifest_preallocation_size); + Log(log," Options.purge_redundant_kvs_while_flush: %d", + purge_redundant_kvs_while_flush); + Log(log," Options.allow_os_buffer: %d", + allow_os_buffer); + Log(log," 
Options.allow_mmap_reads: %d", + allow_mmap_reads); + Log(log," Options.allow_mmap_writes: %d", + allow_mmap_writes); + Log(log," Options.is_fd_close_on_exec: %d", + is_fd_close_on_exec); + Log(log," Options.skip_log_error_on_recovery: %d", + skip_log_error_on_recovery); + Log(log," Options.stats_dump_period_sec: %u", + stats_dump_period_sec); + Log(log," Options.block_size_deviation: %d", + block_size_deviation); + Log(log," Options.advise_random_on_open: %d", + advise_random_on_open); + Log(log," Options.access_hint_on_compaction_start: %s", + access_hints[access_hint_on_compaction_start]); + Log(log," Options.use_adaptive_mutex: %d", + use_adaptive_mutex); + Log(log," Options.bytes_per_sync: %lu", + (unsigned long)bytes_per_sync); + Log(log," Options.filter_deletes: %d", + filter_deletes); + Log(log, " Options.verify_checksums_in_compaction: %d", + verify_checksums_in_compaction); + Log(log," Options.compaction_style: %d", + compaction_style); + Log(log," Options.compaction_options_universal.size_ratio: %u", + compaction_options_universal.size_ratio); + Log(log,"Options.compaction_options_universal.min_merge_width: %u", + compaction_options_universal.min_merge_width); + Log(log,"Options.compaction_options_universal.max_merge_width: %u", + compaction_options_universal.max_merge_width); + Log(log,"Options.compaction_options_universal." 
+ "max_size_amplification_percent: %u", + compaction_options_universal.max_size_amplification_percent); + Log(log, + "Options.compaction_options_universal.compression_size_percent: %u", + compaction_options_universal.compression_size_percent); + std::string collector_names; + for (auto collector : table_properties_collectors) { + collector_names.append(collector->Name()); + collector_names.append("; "); + } + Log(log, " Options.table_properties_collectors: %s", + collector_names.c_str()); + Log(log, " Options.inplace_update_support: %d", + inplace_update_support); + Log(log, " Options.inplace_update_num_locks: %zd", + inplace_update_num_locks); + Log(log, " Options.min_partial_merge_operands: %u", + min_partial_merge_operands); + // TODO: easier config for bloom (maybe based on avg key/value size) + Log(log, " Options.memtable_prefix_bloom_bits: %d", + memtable_prefix_bloom_bits); + Log(log, " Options.memtable_prefix_bloom_probes: %d", + memtable_prefix_bloom_probes); + Log(log, " Options.max_successive_merges: %zd", + max_successive_merges); +} // Options::Dump + +// +// The goal of this method is to create a configuration that +// allows an application to write all files into L0 and +// then do a single compaction to output all files into L1. +Options* +Options::PrepareForBulkLoad() +{ + // never slowdown ingest. + level0_file_num_compaction_trigger = (1<<30); + level0_slowdown_writes_trigger = (1<<30); + level0_stop_writes_trigger = (1<<30); + + // no auto compactions please. The application should issue a + // manual compaction after all data is loaded into L0. + disable_auto_compactions = true; + disable_seek_compaction = true; + disableDataSync = true; + + // A manual compaction run should pick all files in L0 in + // a single compaction run. + source_compaction_factor = (1<<30); + + // It is better to have only 2 levels, otherwise a manual + // compaction would compact at every possible level, thereby + // increasing the total time needed for compactions. 
+ num_levels = 2; + + // Prevent a memtable flush to automatically promote files + // to L1. This is helpful so that all files that are + // input to the manual compaction are all at L0. + max_background_compactions = 2; + + // The compaction would create large files in L1. + target_file_size_base = 256 * 1024 * 1024; + return this; +} + +} // namespace rocksdb diff --git a/util/perf_context.cc b/util/perf_context.cc new file mode 100644 index 00000000..855e7c45 --- /dev/null +++ b/util/perf_context.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include +#include "util/perf_context_imp.h" + +namespace rocksdb { + +// by default, enable counts only +#if defined(IOS_CROSS_COMPILE) +PerfLevel perf_level = kEnableCount; +#else +__thread PerfLevel perf_level = kEnableCount; +#endif + +void SetPerfLevel(PerfLevel level) { perf_level = level; } + +void PerfContext::Reset() { + user_key_comparison_count = 0; + block_cache_hit_count = 0; + block_read_count = 0; + block_read_byte = 0; + block_read_time = 0; + block_checksum_time = 0; + block_decompress_time = 0; + internal_key_skipped_count = 0; + internal_delete_skipped_count = 0; + write_wal_time = 0; + + get_snapshot_time = 0; + get_from_memtable_time = 0; + get_from_memtable_count = 0; + get_post_process_time = 0; + get_from_output_files_time = 0; + seek_child_seek_time = 0; + seek_child_seek_count = 0; + seek_min_heap_time = 0; + seek_internal_seek_time = 0; + find_next_user_entry_time = 0; + write_pre_and_post_process_time = 0; + write_memtable_time = 0; +} + +#define OUTPUT(counter) #counter << " = " << counter << ", " + +std::string PerfContext::ToString() const { + std::ostringstream ss; + ss << OUTPUT(user_key_comparison_count) + << 
OUTPUT(block_cache_hit_count) + << OUTPUT(block_read_count) + << OUTPUT(block_read_byte) + << OUTPUT(block_read_time) + << OUTPUT(block_checksum_time) + << OUTPUT(block_decompress_time) + << OUTPUT(internal_key_skipped_count) + << OUTPUT(internal_delete_skipped_count) + << OUTPUT(write_wal_time) + << OUTPUT(get_snapshot_time) + << OUTPUT(get_from_memtable_time) + << OUTPUT(get_from_memtable_count) + << OUTPUT(get_post_process_time) + << OUTPUT(get_from_output_files_time) + << OUTPUT(seek_child_seek_time) + << OUTPUT(seek_child_seek_count) + << OUTPUT(seek_min_heap_time) + << OUTPUT(seek_internal_seek_time) + << OUTPUT(find_next_user_entry_time) + << OUTPUT(write_pre_and_post_process_time) + << OUTPUT(write_memtable_time); + return ss.str(); +} + +#if defined(IOS_CROSS_COMPILE) +PerfContext perf_context; +#else +__thread PerfContext perf_context; +#endif +} diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h new file mode 100644 index 00000000..ac044ca0 --- /dev/null +++ b/util/perf_context_imp.h @@ -0,0 +1,40 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include "rocksdb/perf_context.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +// TODO(icanadi): when calling perf_context is macro-ed (TODO ljin), make it +// noop in case IOS_CROSS_COMPILE +#if defined(IOS_CROSS_COMPILE) +extern enum PerfLevel perf_level; +#else +extern __thread PerfLevel perf_level; +#endif + +inline void StartPerfTimer(StopWatchNano* timer) { + if (perf_level >= PerfLevel::kEnableTime) { + timer->Start(); + } +} + +inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) { + if (perf_level >= PerfLevel::kEnableCount) { + *count += delta; + } +} + +inline void BumpPerfTime(uint64_t* time, + StopWatchNano* timer, + bool reset = true) { + if (perf_level >= PerfLevel::kEnableTime) { + *time += timer->ElapsedNanos(reset); + } +} + +} diff --git a/util/posix_logger.h b/util/posix_logger.h new file mode 100644 index 00000000..a1086973 --- /dev/null +++ b/util/posix_logger.h @@ -0,0 +1,161 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. 
+ +#pragma once +#include +#include +#include +#include +#include +#include +#ifdef OS_LINUX +#include +#endif +#include "rocksdb/env.h" +#include + +namespace rocksdb { + +const int kDebugLogChunkSize = 128 * 1024; + +class PosixLogger : public Logger { + private: + FILE* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + std::atomic_size_t log_size_; + int fd_; + const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + bool flush_pending_; + public: + PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env, + const InfoLogLevel log_level = InfoLogLevel::ERROR) + : Logger(log_level), + file_(f), + gettid_(gettid), + log_size_(0), + fd_(fileno(f)), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + virtual ~PosixLogger() { + fclose(file_); + } + virtual void Flush() { + if (flush_pending_) { + flush_pending_ = false; + fflush(file_); + } + last_flush_micros_ = env_->NowMicros(); + } + virtual void Logv(const char* format, va_list ap) { + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + +#ifdef ROCKSDB_FALLOCATE_PRESENT + // If this write would cross a boundary of kDebugLogChunkSize + // space, pre-allocate more space to avoid overly large + // allocations from filesystem allocsize options. 
+ const size_t log_size = log_size_; + const int last_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize); + const int desired_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size + write_size) / + kDebugLogChunkSize); + if (last_allocation_chunk != desired_allocation_chunk) { + fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0, + desired_allocation_chunk * kDebugLogChunkSize); + } +#endif + + size_t sz = fwrite(base, 1, write_size, file_); + flush_pending_ = true; + assert(sz == write_size); + if (sz > 0) { + log_size_ += write_size; + } + uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + fflush(file_); + last_flush_micros_ = now_micros; + } + if (base != buffer) { + delete[] base; + } + break; + } + } + size_t GetLogFileSize() const { + return log_size_; + } +}; + +} // namespace rocksdb diff --git a/util/random.h b/util/random.h new file mode 100644 index 00000000..e5b33150 --- /dev/null +++ b/util/random.h @@ -0,0 +1,90 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +namespace rocksdb { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. 
+class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { } + uint32_t Next() { + static const uint32_t M = 2147483647L; // 2^31-1 + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } +}; + +// A simple 64bit random number generator based on std::mt19937_64 +class Random64 { + private: + std::mt19937_64 generator_; + + public: + explicit Random64(uint64_t s) : generator_(s) { } + + // Generates the next random number + uint64_t Next() { return generator_(); } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint64_t Uniform(uint64_t n) { + return std::uniform_int_distribution(0, n - 1)(generator_); + } + + // Randomly returns true ~"1/n" of the time, and false otherwise. 
+ // REQUIRES: n > 0 + bool OneIn(uint64_t n) { return Uniform(n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint64_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } +}; + +} // namespace rocksdb diff --git a/util/signal_test.cc b/util/signal_test.cc new file mode 100644 index 00000000..bffc298d --- /dev/null +++ b/util/signal_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "util/stack_trace.h" +#include + +void f0() { + char *p = nullptr; + *p = 10; /* SIGSEGV here!! */ +} + +void f1() { + f0(); +} + +void f2() { + f1(); +} + +void f3() { + f2(); +} + +int main() { + rocksdb::InstallStackTraceHandler(); + + f3(); + + return 0; +} diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc new file mode 100644 index 00000000..93f7134c --- /dev/null +++ b/util/skiplistrep.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "rocksdb/memtablerep.h" +#include "db/memtable.h" +#include "db/skiplist.h" + +namespace rocksdb { +namespace { +class SkipListRep : public MemTableRep { + SkipList skip_list_; +public: + explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena) + : MemTableRep(arena), skip_list_(compare, arena) { + } + + // Insert key into the list. 
+ // REQUIRES: nothing that compares equal to key is currently in the list. + virtual void Insert(KeyHandle handle) override { + skip_list_.Insert(static_cast(handle)); + } + + // Returns true iff an entry that compares equal to key is in the list. + virtual bool Contains(const char* key) const override { + return skip_list_.Contains(key); + } + + virtual size_t ApproximateMemoryUsage() override { + // All memory is allocated through arena; nothing to report here + return 0; + } + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override { + SkipListRep::Iterator iter(&skip_list_); + Slice dummy_slice; + for (iter.Seek(dummy_slice, k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } + + virtual ~SkipListRep() override { } + + // Iteration over the contents of a skip list + class Iterator : public MemTableRep::Iterator { + SkipList::Iterator iter_; + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. + explicit Iterator( + const SkipList* list + ) : iter_(list) { } + + virtual ~Iterator() override { } + + // Returns true iff the iterator is positioned at a valid node. + virtual bool Valid() const override { + return iter_.Valid(); + } + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const override { + return iter_.key(); + } + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() override { + iter_.Next(); + } + + // Advances to the previous position. 
+ // REQUIRES: Valid() + virtual void Prev() override { + iter_.Prev(); + } + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& user_key, const char* memtable_key) + override { + if (memtable_key != nullptr) { + iter_.Seek(memtable_key); + } else { + iter_.Seek(EncodeKey(&tmp_, user_key)); + } + } + + // Position at the first entry in list. + // Final state of iterator is Valid() iff list is not empty. + virtual void SeekToFirst() override { + iter_.SeekToFirst(); + } + + // Position at the last entry in list. + // Final state of iterator is Valid() iff list is not empty. + virtual void SeekToLast() override { + iter_.SeekToLast(); + } + protected: + std::string tmp_; // For passing to EncodeKey + }; + + // Unhide default implementations of GetIterator + using MemTableRep::GetIterator; + + virtual MemTableRep::Iterator* GetIterator() override { + return new SkipListRep::Iterator(&skip_list_); + } +}; +} + +MemTableRep* SkipListFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform*) { + return new SkipListRep(compare, arena); +} + +} // namespace rocksdb diff --git a/util/slice.cc b/util/slice.cc new file mode 100644 index 00000000..55f561f0 --- /dev/null +++ b/util/slice.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/slice_transform.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +namespace { + +class FixedPrefixTransform : public SliceTransform { + private: + size_t prefix_len_; + + public: + explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { } + + virtual const char* Name() const { + return "rocksdb.FixedPrefix"; + } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), prefix_len_); + } + + virtual bool InDomain(const Slice& src) const { + return (src.size() >= prefix_len_); + } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() == prefix_len_); + } +}; + +class NoopTransform : public SliceTransform { + public: + explicit NoopTransform() { } + + virtual const char* Name() const { + return "rocksdb.Noop"; + } + + virtual Slice Transform(const Slice& src) const { + return src; + } + + virtual bool InDomain(const Slice& src) const { + return true; + } + + virtual bool InRange(const Slice& dst) const { + return true; + } +}; + +} + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +const SliceTransform* NewNoopTransform() { + return new NoopTransform; +} + +} // namespace rocksdb diff --git a/util/stack_trace.h b/util/stack_trace.h new file mode 100644 index 00000000..3b06e1df --- /dev/null +++ b/util/stack_trace.h @@ -0,0 +1,17 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +namespace rocksdb { + +// Install a signal handler to print callstack on the following signals: +// SIGILL SIGSEGV SIGBUS SIGABRT +// Currently supports linux only. No-op otherwise. 
+void InstallStackTraceHandler(); + +// Prints stack, skips skip_first_frames frames +void PrintStack(int first_frames_to_skip = 0); + +} // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc new file mode 100644 index 00000000..4fc24001 --- /dev/null +++ b/util/statistics.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "util/statistics.h" +#include "rocksdb/statistics.h" +#include +#include + +namespace rocksdb { + +std::shared_ptr CreateDBStatistics() { + return std::make_shared(); +} + +StatisticsImpl::StatisticsImpl() {} + +StatisticsImpl::~StatisticsImpl() {} + +long StatisticsImpl::getTickerCount(Tickers tickerType) { + assert(tickerType < TICKER_ENUM_MAX); + return tickers_[tickerType].value; +} + +void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + tickers_[tickerType].value = count; +} + +void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + tickers_[tickerType].value += count; +} + +void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + histograms_[histogramType].Add(value); +} + +void StatisticsImpl::histogramData(Histograms histogramType, + HistogramData* const data) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + histograms_[histogramType].Data(data); +} + +namespace { + +// a buffer size used for temp string buffers +const int kBufferSize = 200; + +std::string HistogramToString ( + Statistics* dbstats, + const Histograms& histogram_type, + const std::string& name) { + + char buffer[kBufferSize]; + HistogramData histogramData; + dbstats->histogramData(histogram_type, 
&histogramData); + snprintf( + buffer, + kBufferSize, + "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n", + name.c_str(), + histogramData.median, + histogramData.percentile95, + histogramData.percentile99 + ); + return std::string(buffer); +}; + +std::string TickerToString(Statistics* dbstats, const Tickers& ticker, + const std::string& name) { + char buffer[kBufferSize]; + snprintf(buffer, kBufferSize, "%s COUNT : %ld\n", + name.c_str(), dbstats->getTickerCount(ticker)); + return std::string(buffer); +}; +} // namespace + +std::string Statistics::ToString() { + std::string res; + res.reserve(20000); + for (const auto& t : TickersNameMap) { + res.append(TickerToString(this, t.first, t.second)); + } + for (const auto& h : HistogramsNameMap) { + res.append(HistogramToString(this, h.first, h.second)); + } + res.shrink_to_fit(); + return res; +} + +} // namespace rocksdb diff --git a/util/statistics.h b/util/statistics.h new file mode 100644 index 00000000..d57a1dd4 --- /dev/null +++ b/util/statistics.h @@ -0,0 +1,66 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include "rocksdb/statistics.h" +#include "util/histogram.h" +#include "util/mutexlock.h" +#include "port/likely.h" + +#include +#include + + +namespace rocksdb { + +class StatisticsImpl : public Statistics { + public: + StatisticsImpl(); + virtual ~StatisticsImpl(); + + virtual long getTickerCount(Tickers tickerType); + virtual void setTickerCount(Tickers tickerType, uint64_t count); + virtual void recordTick(Tickers tickerType, uint64_t count); + virtual void measureTime(Histograms histogramType, uint64_t value); + virtual void histogramData(Histograms histogramType, + HistogramData* const data); + + private: + struct Ticker { + Ticker() : value(uint_fast64_t()) {} + + std::atomic_uint_fast64_t value; + // Pad the structure to make it size of 64 bytes. A plain array of + // std::atomic_uint_fast64_t results in huge performance degradataion + // due to false sharing. + char padding[64 - sizeof(std::atomic_uint_fast64_t)]; + }; + + Ticker tickers_[TICKER_ENUM_MAX] __attribute__((aligned(64))); + HistogramImpl histograms_[HISTOGRAM_ENUM_MAX] __attribute__((aligned(64))); +}; + +// Utility functions +inline void MeasureTime(Statistics* statistics, Histograms histogramType, + uint64_t value) { + if (statistics) { + statistics->measureTime(histogramType, value); + } +} + +inline void RecordTick(Statistics* statistics, Tickers ticker, + uint64_t count = 1) { + if (statistics) { + statistics->recordTick(ticker, count); + } +} + +inline void SetTickerCount(Statistics* statistics, Tickers ticker, + uint64_t count) { + if (statistics) { + statistics->setTickerCount(ticker, count); + } +} +} diff --git a/util/stats_logger.h b/util/stats_logger.h new file mode 100644 index 00000000..f0b45404 --- /dev/null +++ b/util/stats_logger.h @@ -0,0 +1,26 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +namespace rocksdb { + +class StatsLogger { + + public: + + virtual void Log_Deploy_Stats(const std::string& db_version, + const std::string& machine_info, + const std::string& data_dir, + const uint64_t data_size, + const uint32_t file_number, + const std::string& data_size_per_level, + const std::string& file_number_per_level, + const int64_t& ts_unix) = 0; + virtual ~StatsLogger() {} + +}; + +} diff --git a/util/status.cc b/util/status.cc new file mode 100644 index 00000000..2a5f05a4 --- /dev/null +++ b/util/status.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "port/port.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 4]; + memcpy(result, state, size + 4); + return result; +} + +Status::Status(Code code, const Slice& msg, const Slice& msg2) : + code_(code) { + assert(code != kOk); + const uint32_t len1 = msg.size(); + const uint32_t len2 = msg2.size(); + const uint32_t size = len1 + (len2 ? 
(2 + len2) : 0); + char* result = new char[size + 4]; + memcpy(result, &size, sizeof(size)); + memcpy(result + 4, msg.data(), len1); + if (len2) { + result[4 + len1] = ':'; + result[5 + len1] = ' '; + memcpy(result + 6 + len1, msg2.data(), len2); + } + state_ = result; +} + +std::string Status::ToString() const { + char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge in progress: "; + break; + case kIncomplete: + type = "Result incomplete: "; + break; + case kShutdownInProgress: + type = "Shutdown in progress: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(code())); + type = tmp; + break; + } + std::string result(type); + if (state_ != nullptr) { + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(state_ + 4, length); + } + return result; +} + +} // namespace rocksdb diff --git a/util/stl_wrappers.h b/util/stl_wrappers.h new file mode 100644 index 00000000..b4c14b4b --- /dev/null +++ b/util/stl_wrappers.h @@ -0,0 +1,32 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once + +#include "util/murmurhash.h" +#include "util/coding.h" + +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice.h" + +namespace rocksdb { +namespace stl_wrappers { + class Base { + protected: + const MemTableRep::KeyComparator& compare_; + explicit Base(const MemTableRep::KeyComparator& compare) + : compare_(compare) { } + }; + + struct Compare : private Base { + explicit Compare(const MemTableRep::KeyComparator& compare) + : Base(compare) { } + inline bool operator()(const char* a, const char* b) const { + return compare_(a, b) < 0; + } + }; + +} +} diff --git a/util/stop_watch.h b/util/stop_watch.h new file mode 100644 index 00000000..48e1b01c --- /dev/null +++ b/util/stop_watch.h @@ -0,0 +1,67 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +#include "rocksdb/env.h" +#include "util/statistics.h" + +namespace rocksdb { +// Auto-scoped. +// Records the statistic into the corresponding histogram. +class StopWatch { + public: + explicit StopWatch( + Env * const env, + Statistics* statistics = nullptr, + const Histograms histogram_name = DB_GET, + bool auto_start = true) : + env_(env), + start_time_((!auto_start && !statistics) ? 
0 : env->NowMicros()), + statistics_(statistics), + histogram_name_(histogram_name) {} + + + + uint64_t ElapsedMicros() { + return env_->NowMicros() - start_time_; + } + + ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); } + + private: + Env* const env_; + const uint64_t start_time_; + Statistics* statistics_; + const Histograms histogram_name_; + +}; + +// a nano second precision stopwatch +class StopWatchNano { + public: + explicit StopWatchNano(Env* const env, bool auto_start = false) + : env_(env), start_(0) { + if (auto_start) { + Start(); + } + } + + void Start() { start_ = env_->NowNanos(); } + + uint64_t ElapsedNanos(bool reset = false) { + auto now = env_->NowNanos(); + auto elapsed = now - start_; + if (reset) { + start_ = now; + } + return elapsed; + } + + private: + Env* const env_; + uint64_t start_; +}; + +} // namespace rocksdb diff --git a/util/string_util.cc b/util/string_util.cc new file mode 100644 index 00000000..33f84d9b --- /dev/null +++ b/util/string_util.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include +#include +#include + +namespace rocksdb { + +using namespace std; +using std::string; +using std::vector; +using std::stringstream; + +vector stringSplit(string arg, char delim) { + vector splits; + stringstream ss(arg); + string item; + while(getline(ss, item, delim)) { + splits.push_back(item); + } + return splits; +} +} diff --git a/util/string_util.h b/util/string_util.h new file mode 100644 index 00000000..7dfd68aa --- /dev/null +++ b/util/string_util.h @@ -0,0 +1,11 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once +namespace rocksdb { + +extern std::vector stringSplit(std::string arg, char delim); + +} diff --git a/util/sync_point.cc b/util/sync_point.cc new file mode 100644 index 00000000..5d0ac2dd --- /dev/null +++ b/util/sync_point.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "util/sync_point.h" + +namespace rocksdb { + +SyncPoint* SyncPoint::GetInstance() { + static SyncPoint sync_point; + return &sync_point; +} + +void SyncPoint::LoadDependency(const std::vector& dependencies) { + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } +} + +bool SyncPoint::PredecessorsAllCleared(const std::string& point) { + for (const auto& pred : predecessors_[point]) { + if (cleared_points_.count(pred) == 0) { + return false; + } + } + return true; +} + +void SyncPoint::EnableProcessing() { + std::unique_lock lock(mutex_); + enabled_ = true; +} + +void SyncPoint::DisableProcessing() { + std::unique_lock lock(mutex_); + enabled_ = false; +} + +void SyncPoint::ClearTrace() { + std::unique_lock lock(mutex_); + cleared_points_.clear(); +} + +void SyncPoint::Process(const std::string& point) { + std::unique_lock lock(mutex_); + + if (!enabled_) return; + + while (!PredecessorsAllCleared(point)) { + cv_.wait(lock); + } + + cleared_points_.insert(point); + 
cv_.notify_all(); +} + +} // namespace rocksdb diff --git a/util/sync_point.h b/util/sync_point.h new file mode 100644 index 00000000..3cc89237 --- /dev/null +++ b/util/sync_point.h @@ -0,0 +1,79 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +// This class provides facility to reproduce race conditions deterministically +// in unit tests. +// Developer could specify sync points in the codebase via TEST_SYNC_POINT. +// Each sync point represents a position in the execution stream of a thread. +// In the unit test, 'Happens After' relationship among sync points could be +// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of +// threads execution. +// Refer to (DBTest,TransactionLogIteratorRace), for an exmaple use case. + +class SyncPoint { + public: + static SyncPoint* GetInstance(); + + struct Dependency { + std::string predecessor; + std::string successor; + }; + // call once at the beginning of a test to setup the dependency between + // sync points + void LoadDependency(const std::vector& dependencies); + + // enable sync point processing (disabled on startup) + void EnableProcessing(); + + // disable sync point processing + void DisableProcessing(); + + // remove the execution trace of all sync points + void ClearTrace(); + + // triggered by TEST_SYNC_POINT, blocking execution until all predecessors + // are executed. + void Process(const std::string& point); + + // TODO: it might be useful to provide a function that blocks until all + // sync points are cleared. 
+ + private: + bool PredecessorsAllCleared(const std::string& point); + + // successor/predecessor map loaded from LoadDependency + std::unordered_map> successors_; + std::unordered_map> predecessors_; + + std::mutex mutex_; + std::condition_variable cv_; + // sync points that have been passed through + std::unordered_set cleared_points_; + bool enabled_ = false; +}; + +} // namespace rocksdb + +// Use TEST_SYNC_POINT to specify sync points inside code base. +// Sync points can have happens-after depedency on other sync points, +// configured at runtime via SyncPoint::LoadDependency. This could be +// utilized to re-produce race conditions between threads. +// See TransactionLogIteratorRace in db_test.cc for an example use case. +// TEST_SYNC_POINT is no op in release build. +#ifdef NDEBUG +#define TEST_SYNC_POINT(x) +#else +#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x) +#endif diff --git a/util/testharness.cc b/util/testharness.cc new file mode 100644 index 00000000..85716cda --- /dev/null +++ b/util/testharness.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/testharness.h" + +#include +#include +#include +#include + +namespace rocksdb { +namespace test { + +namespace { +struct Test { + const char* base; + const char* name; + void (*func)(); +}; +std::vector* tests; +} + +bool RegisterTest(const char* base, const char* name, void (*func)()) { + if (tests == nullptr) { + tests = new std::vector; + } + Test t; + t.base = base; + t.name = name; + t.func = func; + tests->push_back(t); + return true; +} + +int RunAllTests() { + const char* matcher = getenv("ROCKSDB_TESTS"); + + int num = 0; + if (tests != nullptr) { + for (unsigned int i = 0; i < tests->size(); i++) { + const Test& t = (*tests)[i]; + if (matcher != nullptr) { + std::string name = t.base; + name.push_back('.'); + name.append(t.name); + if (strstr(name.c_str(), matcher) == nullptr) { + continue; + } + } + fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); + (*t.func)(); + ++num; + } + } + fprintf(stderr, "==== PASSED %d tests\n", num); + return 0; +} + +std::string TmpDir() { + std::string dir; + Status s = Env::Default()->GetTestDirectory(&dir); + ASSERT_TRUE(s.ok()) << s.ToString(); + return dir; +} + +int RandomSeed() { + const char* env = getenv("TEST_RANDOM_SEED"); + int result = (env != nullptr ? atoi(env) : 301); + if (result <= 0) { + result = 301; + } + return result; +} + +} // namespace test +} // namespace rocksdb diff --git a/util/testharness.h b/util/testharness.h new file mode 100644 index 00000000..f1591781 --- /dev/null +++ b/util/testharness.h @@ -0,0 +1,142 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "util/random.h" +#include "util/stack_trace.h" + +namespace rocksdb { +namespace test { + +// Run some of the tests registered by the TEST() macro. If the +// environment variable "ROCKSDB_TESTS" is not set, runs all tests. +// Otherwise, runs only the tests whose name contains the value of +// "ROCKSDB_TESTS" as a substring. E.g., suppose the tests are: +// TEST(Foo, Hello) { ... } +// TEST(Foo, World) { ... } +// ROCKSDB_TESTS=Hello will run the first test +// ROCKSDB_TESTS=o will run both tests +// ROCKSDB_TESTS=Junk will run no tests +// +// Returns 0 if all tests pass. +// Dies or returns a non-zero value if some test fails. +extern int RunAllTests(); + +// Return the directory to use for temporary storage. +extern std::string TmpDir(); + +// Return a randomization seed for this run. Typically returns the +// same number on repeated invocations of this binary, but automated +// runs may be able to vary the seed. +extern int RandomSeed(); + +// An instance of Tester is allocated to hold temporary state during +// the execution of an assertion. +class Tester { + private: + bool ok_; + const char* fname_; + int line_; + std::stringstream ss_; + + public: + Tester(const char* f, int l) + : ok_(true), fname_(f), line_(l) { + } + + ~Tester() { + if (!ok_) { + fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); + PrintStack(2); + exit(1); + } + } + + Tester& Is(bool b, const char* msg) { + if (!b) { + ss_ << " Assertion failure " << msg; + ok_ = false; + } + return *this; + } + + Tester& IsOk(const Status& s) { + if (!s.ok()) { + ss_ << " " << s.ToString(); + ok_ = false; + } + return *this; + } + +#define BINARY_OP(name,op) \ + template \ + Tester& name(const X& x, const Y& y) { \ + if (! 
(x op y)) { \ + ss_ << " failed: " << x << (" " #op " ") << y; \ + ok_ = false; \ + } \ + return *this; \ + } + + BINARY_OP(IsEq, ==) + BINARY_OP(IsNe, !=) + BINARY_OP(IsGe, >=) + BINARY_OP(IsGt, >) + BINARY_OP(IsLe, <=) + BINARY_OP(IsLt, <) +#undef BINARY_OP + + // Attach the specified value to the error message if an error has occurred + template + Tester& operator<<(const V& value) { + if (!ok_) { + ss_ << " " << value; + } + return *this; + } +}; + +#define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c) +#define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) +#define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) +#define ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) +#define ASSERT_GT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) +#define ASSERT_LE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) +#define ASSERT_LT(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) + +#define TCONCAT(a,b) TCONCAT1(a,b) +#define TCONCAT1(a,b) a##b + +#define TEST(base,name) \ +class TCONCAT(_Test_,name) : public base { \ + public: \ + void _Run(); \ + static void _RunIt() { \ + TCONCAT(_Test_,name) t; \ + t._Run(); \ + } \ +}; \ +bool TCONCAT(_Test_ignored_,name) = \ + ::rocksdb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ +void TCONCAT(_Test_,name)::_Run() + +// Register the specified test. Typically not used directly, but +// invoked via the macro expansion of TEST. +extern bool RegisterTest(const char* base, const char* name, void (*func)()); + + +} // namespace test +} // namespace rocksdb diff --git a/util/testutil.cc b/util/testutil.cc new file mode 100644 index 00000000..13e781e6 --- /dev/null +++ b/util/testutil.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/testutil.h" + +#include "util/random.h" + +namespace rocksdb { +namespace test { + +Slice RandomString(Random* rnd, int len, std::string* dst) { + dst->resize(len); + for (int i = 0; i < len; i++) { + (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' + } + return Slice(*dst); +} + +std::string RandomKey(Random* rnd, int len) { + // Make sure to generate a wide variety of characters so we + // test the boundary conditions for short-key optimizations. + static const char kTestChars[] = { + '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff' + }; + std::string result; + for (int i = 0; i < len; i++) { + result += kTestChars[rnd->Uniform(sizeof(kTestChars))]; + } + return result; +} + + +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data; + RandomString(rnd, raw, &raw_data); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < (unsigned int)len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + +} // namespace test +} // namespace rocksdb diff --git a/util/testutil.h b/util/testutil.h new file mode 100644 index 00000000..4fc8c0f5 --- /dev/null +++ b/util/testutil.h @@ -0,0 +1,80 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "util/random.h" + +namespace rocksdb { +namespace test { + +// Store in *dst a random string of length "len" and return a Slice that +// references the generated data. +extern Slice RandomString(Random* rnd, int len, std::string* dst); + +// Return a random key with the specified length that may contain interesting +// characters (e.g. \x00, \xff, etc.). +extern std::string RandomKey(Random* rnd, int len); + +// Store in *dst a string of length "len" that will compress to +// "N*compressed_fraction" bytes and return a Slice that references +// the generated data. +extern Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst); + +// A wrapper that allows injection of errors. +class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + + ErrorEnv() : EnvWrapper(Env::Default()), + writable_file_error_(false), + num_writable_file_errors_(0) { } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; + +// An internal comparator that just forward comparing results from the +// user comparator in it. 
Can be used to test entities that have no dependency +// on internal key structure but consumes InternalKeyComparator, like +// BlockBasedTable. +class PlainInternalKeyComparator : public InternalKeyComparator { + public: + explicit PlainInternalKeyComparator(const Comparator* c) + : InternalKeyComparator(c) {} + + virtual ~PlainInternalKeyComparator() {} + + virtual int Compare(const Slice& a, const Slice& b) const override { + return user_comparator()->Compare(a, b); + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + user_comparator()->FindShortestSeparator(start, limit); + } + virtual void FindShortSuccessor(std::string* key) const override { + user_comparator()->FindShortSuccessor(key); + } +}; + +} // namespace test +} // namespace rocksdb diff --git a/util/thread_local.cc b/util/thread_local.cc new file mode 100644 index 00000000..1b4220b8 --- /dev/null +++ b/util/thread_local.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/thread_local.h" +#include "util/mutexlock.h" +#include "port/likely.h" + + +namespace rocksdb { + +std::unique_ptr ThreadLocalPtr::StaticMeta::inst_; +port::Mutex ThreadLocalPtr::StaticMeta::mutex_; +#if !defined(OS_MACOSX) +__thread ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr; +#endif + +ThreadLocalPtr::StaticMeta* ThreadLocalPtr::StaticMeta::Instance() { + if (UNLIKELY(inst_ == nullptr)) { + MutexLock l(&mutex_); + if (inst_ == nullptr) { + inst_.reset(new StaticMeta()); + } + } + return inst_.get(); +} + +void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { + auto* tls = static_cast(ptr); + assert(tls != nullptr); + + auto* inst = Instance(); + pthread_setspecific(inst->pthread_key_, nullptr); + + MutexLock l(&mutex_); + inst->RemoveThreadData(tls); + // Unref stored pointers of current thread from all instances + uint32_t id = 0; + for (auto& e : tls->entries) { + void* raw = e.ptr.load(std::memory_order_relaxed); + if (raw != nullptr) { + auto unref = inst->GetHandler(id); + if (unref != nullptr) { + unref(raw); + } + } + ++id; + } + // Delete thread local structure no matter if it is Mac platform + delete tls; +} + +ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) { + if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) { + throw std::runtime_error("pthread_key_create failed"); + } + head_.next = &head_; + head_.prev = &head_; +} + +void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadLocalPtr::ThreadData* d) { + mutex_.AssertHeld(); + d->next = &head_; + d->prev = head_.prev; + head_.prev->next = d; + head_.prev = d; +} + +void ThreadLocalPtr::StaticMeta::RemoveThreadData( + ThreadLocalPtr::ThreadData* d) { + mutex_.AssertHeld(); + d->next->prev = d->prev; + d->prev->next = d->next; + d->next = d->prev = d; +} + +ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() { +#if defined(OS_MACOSX) + // Make this local variable name look like a member variable so that we + // 
can share all the code below + ThreadData* tls_ = + static_cast(pthread_getspecific(Instance()->pthread_key_)); +#endif + + if (UNLIKELY(tls_ == nullptr)) { + auto* inst = Instance(); + tls_ = new ThreadData(); + { + // Register it in the global chain, needs to be done before thread exit + // handler registration + MutexLock l(&mutex_); + inst->AddThreadData(tls_); + } + // Even it is not OS_MACOSX, need to register value for pthread_key_ so that + // its exit handler will be triggered. + if (pthread_setspecific(inst->pthread_key_, tls_) != 0) { + { + MutexLock l(&mutex_); + inst->RemoveThreadData(tls_); + } + delete tls_; + throw std::runtime_error("pthread_setspecific failed"); + } + } + return tls_; +} + +void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + return nullptr; + } + return tls->entries[id].ptr.load(std::memory_order_relaxed); +} + +void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(&mutex_); + tls->entries.resize(id + 1); + } + tls->entries[id].ptr.store(ptr, std::memory_order_relaxed); +} + +void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(&mutex_); + tls->entries.resize(id + 1); + } + return tls->entries[id].ptr.exchange(ptr, std::memory_order_relaxed); +} + +bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, + void*& expected) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(&mutex_); + tls->entries.resize(id + 1); + } + return tls->entries[id].ptr.compare_exchange_strong(expected, ptr, + std::memory_order_relaxed, 
std::memory_order_relaxed); +} + +void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, + void* const replacement) { + MutexLock l(&mutex_); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = + t->entries[id].ptr.exchange(replacement, std::memory_order_relaxed); + if (ptr != nullptr) { + ptrs->push_back(ptr); + } + } + } +} + +void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) { + MutexLock l(&mutex_); + handler_map_[id] = handler; +} + +UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) { + mutex_.AssertHeld(); + auto iter = handler_map_.find(id); + if (iter == handler_map_.end()) { + return nullptr; + } + return iter->second; +} + +uint32_t ThreadLocalPtr::StaticMeta::GetId() { + MutexLock l(&mutex_); + if (free_instance_ids_.empty()) { + return next_instance_id_++; + } + + uint32_t id = free_instance_ids_.back(); + free_instance_ids_.pop_back(); + return id; +} + +uint32_t ThreadLocalPtr::StaticMeta::PeekId() const { + MutexLock l(&mutex_); + if (!free_instance_ids_.empty()) { + return free_instance_ids_.back(); + } + return next_instance_id_; +} + +void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { + // This id is not used, go through all thread local data and release + // corresponding value + MutexLock l(&mutex_); + auto unref = GetHandler(id); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = + t->entries[id].ptr.exchange(nullptr, std::memory_order_relaxed); + if (ptr != nullptr && unref != nullptr) { + unref(ptr); + } + } + } + handler_map_[id] = nullptr; + free_instance_ids_.push_back(id); +} + +ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler) + : id_(StaticMeta::Instance()->GetId()) { + if (handler != nullptr) { + StaticMeta::Instance()->SetHandler(id_, handler); + } +} + +ThreadLocalPtr::~ThreadLocalPtr() { + StaticMeta::Instance()->ReclaimId(id_); +} + +void* 
ThreadLocalPtr::Get() const { + return StaticMeta::Instance()->Get(id_); +} + +void ThreadLocalPtr::Reset(void* ptr) { + StaticMeta::Instance()->Reset(id_, ptr); +} + +void* ThreadLocalPtr::Swap(void* ptr) { + return StaticMeta::Instance()->Swap(id_, ptr); +} + +bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) { + return StaticMeta::Instance()->CompareAndSwap(id_, ptr, expected); +} + +void ThreadLocalPtr::Scrape(autovector* ptrs, void* const replacement) { + StaticMeta::Instance()->Scrape(id_, ptrs, replacement); +} + +} // namespace rocksdb diff --git a/util/thread_local.h b/util/thread_local.h new file mode 100644 index 00000000..d1434e3e --- /dev/null +++ b/util/thread_local.h @@ -0,0 +1,168 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include + +#include "util/autovector.h" +#include "port/port_posix.h" + +namespace rocksdb { + +// Cleanup function that will be called for a stored thread local +// pointer (if not NULL) when one of the following happens: +// (1) a thread terminates +// (2) a ThreadLocalPtr is destroyed +typedef void (*UnrefHandler)(void* ptr); + +// Thread local storage that only stores value of pointer type. The storage +// distinguish data coming from different thread and different ThreadLocalPtr +// instances. For example, if a regular thread_local variable A is declared +// in DBImpl, two DBImpl objects would share the same A. ThreadLocalPtr avoids +// the confliction. 
The total storage size equals to # of threads * # of +// ThreadLocalPtr instances. It is not efficient in terms of space, but it +// should serve most of our use cases well and keep code simple. +class ThreadLocalPtr { + public: + explicit ThreadLocalPtr(UnrefHandler handler = nullptr); + + ~ThreadLocalPtr(); + + // Return the current pointer stored in thread local + void* Get() const; + + // Set a new pointer value to the thread local storage. + void Reset(void* ptr); + + // Atomically swap the supplied ptr and return the previous value + void* Swap(void* ptr); + + // Atomically compare the stored value with expected. Set the new + // pointer value to thread local only if the comparision is true. + // Otherwise, expected returns the stored value. + // Return true on success, false on failure + bool CompareAndSwap(void* ptr, void*& expected); + + // Reset all thread local data to replacement, and return non-nullptr + // data for all existing threads + void Scrape(autovector* ptrs, void* const replacement); + + protected: + struct Entry { + Entry() : ptr(nullptr) {} + Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {} + std::atomic ptr; + }; + + // This is the structure that is declared as "thread_local" storage. + // The vector keep list of atomic pointer for all instances for "current" + // thread. The vector is indexed by an Id that is unique in process and + // associated with one ThreadLocalPtr instance. The Id is assigned by a + // global StaticMeta singleton. 
So if we instantiated 3 ThreadLocalPtr + // instances, each thread will have a ThreadData with a vector of size 3: + // --------------------------------------------------- + // | | instance 1 | instance 2 | instnace 3 | + // --------------------------------------------------- + // | thread 1 | void* | void* | void* | <- ThreadData + // --------------------------------------------------- + // | thread 2 | void* | void* | void* | <- ThreadData + // --------------------------------------------------- + // | thread 3 | void* | void* | void* | <- ThreadData + // --------------------------------------------------- + struct ThreadData { + ThreadData() : entries() {} + std::vector entries; + ThreadData* next; + ThreadData* prev; + }; + + class StaticMeta { + public: + static StaticMeta* Instance(); + + // Return the next available Id + uint32_t GetId(); + // Return the next availabe Id without claiming it + uint32_t PeekId() const; + // Return the given Id back to the free pool. This also triggers + // UnrefHandler for associated pointer value (if not NULL) for all threads. + void ReclaimId(uint32_t id); + + // Return the pointer value for the given id for the current thread. + void* Get(uint32_t id) const; + // Reset the pointer value for the given id for the current thread. + // It triggers UnrefHanlder if the id has existing pointer value. + void Reset(uint32_t id, void* ptr); + // Atomically swap the supplied ptr and return the previous value + void* Swap(uint32_t id, void* ptr); + // Atomically compare and swap the provided value only if it equals + // to expected value. 
+ bool CompareAndSwap(uint32_t id, void* ptr, void*& expected); + // Reset all thread local data to replacement, and return non-nullptr + // data for all existing threads + void Scrape(uint32_t id, autovector* ptrs, void* const replacement); + + // Register the UnrefHandler for id + void SetHandler(uint32_t id, UnrefHandler handler); + + private: + StaticMeta(); + + // Get UnrefHandler for id with acquiring mutex + // REQUIRES: mutex locked + UnrefHandler GetHandler(uint32_t id); + + // Triggered before a thread terminates + static void OnThreadExit(void* ptr); + + // Add current thread's ThreadData to the global chain + // REQUIRES: mutex locked + void AddThreadData(ThreadData* d); + + // Remove current thread's ThreadData from the global chain + // REQUIRES: mutex locked + void RemoveThreadData(ThreadData* d); + + static ThreadData* GetThreadLocal(); + + // Singleton instance + static std::unique_ptr inst_; + + uint32_t next_instance_id_; + // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed + // frequently. This also prevents it from blowing up the vector space. + autovector free_instance_ids_; + // Chain all thread local structure together. This is necessary since + // when one ThreadLocalPtr gets destroyed, we need to loop over each + // thread's version of pointer corresponding to that instance and + // call UnrefHandler for it. + ThreadData head_; + + std::unordered_map handler_map_; + + // protect inst, next_instance_id_, free_instance_ids_, head_, + // ThreadData.entries + static port::Mutex mutex_; +#if !defined(OS_MACOSX) + // Thread local storage + static __thread ThreadData* tls_; +#endif + // Used to make thread exit trigger possible if !defined(OS_MACOSX). + // Otherwise, used to retrieve thread data. 
+ pthread_key_t pthread_key_; + }; + + const uint32_t id_; +}; + +} // namespace rocksdb diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc new file mode 100644 index 00000000..d273947a --- /dev/null +++ b/util/thread_local_test.cc @@ -0,0 +1,472 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "rocksdb/env.h" +#include "port/port_posix.h" +#include "util/autovector.h" +#include "util/thread_local.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class ThreadLocalTest { + public: + ThreadLocalTest() : env_(Env::Default()) {} + + Env* env_; +}; + +namespace { + +struct Params { + Params(port::Mutex* m, port::CondVar* c, int* unref, int n, + UnrefHandler handler = nullptr) + : mu(m), + cv(c), + unref(unref), + total(n), + started(0), + completed(0), + doWrite(false), + tls1(handler), + tls2(nullptr) {} + + port::Mutex* mu; + port::CondVar* cv; + int* unref; + int total; + int started; + int completed; + bool doWrite; + ThreadLocalPtr tls1; + ThreadLocalPtr* tls2; +}; + +class IDChecker : public ThreadLocalPtr { + public: + static uint32_t PeekId() { return StaticMeta::Instance()->PeekId(); } +}; + +} // anonymous namespace + +TEST(ThreadLocalTest, UniqueIdTest) { + port::Mutex mu; + port::CondVar cv(&mu); + + ASSERT_EQ(IDChecker::PeekId(), 0u); + // New ThreadLocal instance bumps id by 1 + { + // Id used 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Id used 1 + Params p2(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // Id used 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // Id used 3 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 4u); + } + // 
id 3, 2, 1, 0 are in the free queue in order + ASSERT_EQ(IDChecker::PeekId(), 0u); + + // pick up 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 1u); + // pick up 1 + Params* p2 = new Params(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 2u); + // pick up 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // return up 1 + delete p2; + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Now we have 3, 1 in queue + // pick up 1 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), 3u); + // pick up 3 + Params p5(&mu, &cv, nullptr, 1u); + // next new id + ASSERT_EQ(IDChecker::PeekId(), 4u); + // After exit, id sequence in queue: + // 3, 1, 2, 0 +} + +TEST(ThreadLocalTest, SequentialReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + ASSERT_EQ(IDChecker::PeekId(), 0u); + + port::Mutex mu; + port::CondVar cv(&mu); + Params p(&mu, &cv, nullptr, 1); + ThreadLocalPtr tls2; + p.tls2 = &tls2; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + p.tls1.Reset(reinterpret_cast(1)); + ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(1)); + p.tls1.Reset(reinterpret_cast(2)); + ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(2)); + + ASSERT_TRUE(p.tls2->Get() == nullptr); + p.tls2->Reset(reinterpret_cast(1)); + ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(1)); + p.tls2->Reset(reinterpret_cast(2)); + ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(2)); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + p.mu->Unlock(); + }; + + for (int iter = 0; iter < 1024; ++iter) { + ASSERT_EQ(IDChecker::PeekId(), 1u); + // Another new thread, read/write should not see value from previous thread + env_->StartThread(func, static_cast(&p)); + mu.Lock(); + while (p.completed != iter + 1) { + cv.Wait(); + } + mu.Unlock(); + ASSERT_EQ(IDChecker::PeekId(), 1u); + } +} + +TEST(ThreadLocalTest, ConcurrentReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + 
ASSERT_EQ(IDChecker::PeekId(), 0u); + + ThreadLocalPtr tls2; + port::Mutex mu1; + port::CondVar cv1(&mu1); + Params p1(&mu1, &cv1, nullptr, 16); + p1.tls2 = &tls2; + + port::Mutex mu2; + port::CondVar cv2(&mu2); + Params p2(&mu2, &cv2, nullptr, 16); + p2.doWrite = true; + p2.tls2 = &tls2; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + int own = ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + // Let write threads write a different value from the read threads + if (p.doWrite) { + own += 8192; + } + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + auto* env = Env::Default(); + auto start = env->NowMicros(); + + p.tls1.Reset(reinterpret_cast(own)); + p.tls2->Reset(reinterpret_cast(own + 1)); + // Loop for 1 second + while (env->NowMicros() - start < 1000 * 1000) { + for (int iter = 0; iter < 100000; ++iter) { + ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(own)); + ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(own + 1)); + if (p.doWrite) { + p.tls1.Reset(reinterpret_cast(own)); + p.tls2->Reset(reinterpret_cast(own + 1)); + } + } + } + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + p.mu->Unlock(); + }; + + // Initiate 2 instnaces: one keeps writing and one keeps reading. + // The read instance should not see data from the write instance. + // Each thread local copy of the value are also different from each + // other. 
+ for (int th = 0; th < p1.total; ++th) { + env_->StartThread(func, static_cast(&p1)); + } + for (int th = 0; th < p2.total; ++th) { + env_->StartThread(func, static_cast(&p2)); + } + + mu1.Lock(); + while (p1.completed != p1.total) { + cv1.Wait(); + } + mu1.Unlock(); + + mu2.Lock(); + while (p2.completed != p2.total) { + cv2.Wait(); + } + mu2.Unlock(); + + ASSERT_EQ(IDChecker::PeekId(), 3u); +} + +TEST(ThreadLocalTest, Unref) { + ASSERT_EQ(IDChecker::PeekId(), 0u); + + auto unref = [](void* ptr) { + auto& p = *static_cast(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + // Case 0: no unref triggered if ThreadLocalPtr is never accessed + auto func0 = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func0, static_cast(&p)); + } + env_->WaitForJoin(); + ASSERT_EQ(unref_count, 0); + } + + // Case 1: unref triggered by thread exit + auto func1 = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + ThreadLocalPtr tls2(unref); + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = &tls2; + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func1, static_cast(&p)); + } + + env_->WaitForJoin(); + + // N threads x 2 ThreadLocal instance cleanup on thread exit + ASSERT_EQ(unref_count, 2 * p.total); + } + + 
// Case 2: unref triggered by ThreadLocal instance destruction + auto func2 = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func2, static_cast(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + // Now destroy one ThreadLocal instance + delete p.tls2; + p.tls2 = nullptr; + // instance destroy for N threads + ASSERT_EQ(unref_count, p.total); + + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + // additional N threads exit unref for the left instance + ASSERT_EQ(unref_count, 2 * p.total); + } +} + +TEST(ThreadLocalTest, Swap) { + ThreadLocalPtr tls; + tls.Reset(reinterpret_cast(1)); + ASSERT_EQ(reinterpret_cast(tls.Swap(nullptr)), 1); + ASSERT_TRUE(tls.Swap(reinterpret_cast(2)) == nullptr); + ASSERT_EQ(reinterpret_cast(tls.Get()), 2); + ASSERT_EQ(reinterpret_cast(tls.Swap(reinterpret_cast(3))), 2); +} + +TEST(ThreadLocalTest, Scrape) { + auto unref = [](void* ptr) { + auto& p = *static_cast(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + 
p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThread(func, static_cast(&p)); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + ASSERT_EQ(unref_count, 0); + + // Scrape all thread local data. No unref at thread + // exit or ThreadLocalPtr destruction + autovector ptrs; + p.tls1.Scrape(&ptrs, nullptr); + p.tls2->Scrape(&ptrs, nullptr); + delete p.tls2; + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + + ASSERT_EQ(unref_count, 0); + } +} + +TEST(ThreadLocalTest, CompareAndSwap) { + ThreadLocalPtr tls; + ASSERT_TRUE(tls.Swap(reinterpret_cast(1)) == nullptr); + void* expected = reinterpret_cast(1); + // Swap in 2 + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast(2), expected)); + expected = reinterpret_cast(100); + // Fail Swap, still 2 + ASSERT_TRUE(!tls.CompareAndSwap(reinterpret_cast(2), expected)); + ASSERT_EQ(expected, reinterpret_cast(2)); + // Swap in 3 + expected = reinterpret_cast(2); + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast(3), expected)); + ASSERT_EQ(tls.Get(), reinterpret_cast(3)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/vectorrep.cc b/util/vectorrep.cc new file mode 100644 index 00000000..14e7c9f9 --- /dev/null +++ b/util/vectorrep.cc @@ -0,0 +1,280 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "rocksdb/memtablerep.h" + +#include +#include +#include +#include +#include + +#include "util/arena.h" +#include "db/memtable.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/stl_wrappers.h" + +namespace rocksdb { +namespace { + +using namespace stl_wrappers; + +class VectorRep : public MemTableRep { + public: + VectorRep(const KeyComparator& compare, Arena* arena, size_t count); + + // Insert key into the collection. (The caller will pack key and value into a + // single buffer and pass that in as the parameter to Insert) + // REQUIRES: nothing that compares equal to key is currently in the + // collection. + virtual void Insert(KeyHandle handle) override; + + // Returns true iff an entry that compares equal to key is in the collection. + virtual bool Contains(const char* key) const override; + + virtual void MarkReadOnly() override; + + virtual size_t ApproximateMemoryUsage() override; + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override; + + virtual ~VectorRep() override { } + + class Iterator : public MemTableRep::Iterator { + class VectorRep* vrep_; + std::shared_ptr> bucket_; + typename std::vector::const_iterator mutable cit_; + const KeyComparator& compare_; + std::string tmp_; // For passing to EncodeKey + bool mutable sorted_; + void DoSort() const; + public: + explicit Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare); + + // Initialize an iterator over the specified collection. + // The returned iterator is not valid. + // explicit Iterator(const MemTableRep* collection); + virtual ~Iterator() override { }; + + // Returns true iff the iterator is positioned at a valid node. 
+ virtual bool Valid() const override; + + // Returns the key at the current position. + // REQUIRES: Valid() + virtual const char* key() const override; + + // Advances to the next position. + // REQUIRES: Valid() + virtual void Next() override; + + // Advances to the previous position. + // REQUIRES: Valid() + virtual void Prev() override; + + // Advance to the first entry with a key >= target + virtual void Seek(const Slice& user_key, const char* memtable_key) override; + + // Position at the first entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToFirst() override; + + // Position at the last entry in collection. + // Final state of iterator is Valid() iff collection is not empty. + virtual void SeekToLast() override; + }; + + // Unhide default implementations of GetIterator() + using MemTableRep::GetIterator; + + // Return an iterator over the keys in this representation. + virtual MemTableRep::Iterator* GetIterator() override; + + private: + friend class Iterator; + typedef std::vector Bucket; + std::shared_ptr bucket_; + mutable port::RWMutex rwlock_; + bool immutable_; + bool sorted_; + const KeyComparator& compare_; +}; + +void VectorRep::Insert(KeyHandle handle) { + auto* key = static_cast(handle); + assert(!Contains(key)); + WriteLock l(&rwlock_); + assert(!immutable_); + bucket_->push_back(key); +} + +// Returns true iff an entry that compares equal to key is in the collection. 
+bool VectorRep::Contains(const char* key) const { + ReadLock l(&rwlock_); + return std::find(bucket_->begin(), bucket_->end(), key) != bucket_->end(); +} + +void VectorRep::MarkReadOnly() { + WriteLock l(&rwlock_); + immutable_ = true; +} + +size_t VectorRep::ApproximateMemoryUsage() { + return + sizeof(bucket_) + sizeof(*bucket_) + + bucket_->size() * + sizeof( + std::remove_reference::type::value_type + ); +} + +VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) + : MemTableRep(arena), + bucket_(new Bucket()), + immutable_(false), + sorted_(false), + compare_(compare) { bucket_.get()->reserve(count); } + +VectorRep::Iterator::Iterator(class VectorRep* vrep, + std::shared_ptr> bucket, + const KeyComparator& compare) +: vrep_(vrep), + bucket_(bucket), + cit_(bucket_->end()), + compare_(compare), + sorted_(false) { } + +void VectorRep::Iterator::DoSort() const { + // vrep is non-null means that we are working on an immutable memtable + if (!sorted_ && vrep_ != nullptr) { + WriteLock l(&vrep_->rwlock_); + if (!vrep_->sorted_) { + std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + cit_ = bucket_->begin(); + vrep_->sorted_ = true; + } + sorted_ = true; + } + if (!sorted_) { + std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + cit_ = bucket_->begin(); + sorted_ = true; + } + assert(sorted_); + assert(vrep_ == nullptr || vrep_->sorted_); +} + +// Returns true iff the iterator is positioned at a valid node. +bool VectorRep::Iterator::Valid() const { + DoSort(); + return cit_ != bucket_->end(); +} + +// Returns the key at the current position. +// REQUIRES: Valid() +const char* VectorRep::Iterator::key() const { + assert(Valid()); + return *cit_; +} + +// Advances to the next position. +// REQUIRES: Valid() +void VectorRep::Iterator::Next() { + assert(Valid()); + if (cit_ == bucket_->end()) { + return; + } + ++cit_; +} + +// Advances to the previous position. 
+// REQUIRES: Valid() +void VectorRep::Iterator::Prev() { + assert(Valid()); + if (cit_ == bucket_->begin()) { + // If you try to go back from the first element, the iterator should be + // invalidated. So we set it to past-the-end. This means that you can + // treat the container circularly. + cit_ = bucket_->end(); + } else { + --cit_; + } +} + +// Advance to the first entry with a key >= target +void VectorRep::Iterator::Seek(const Slice& user_key, + const char* memtable_key) { + DoSort(); + // Do binary search to find first value not less than the target + const char* encoded_key = + (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key); + cit_ = std::equal_range(bucket_->begin(), + bucket_->end(), + encoded_key, + [this] (const char* a, const char* b) { + return compare_(a, b) < 0; + }).first; +} + +// Position at the first entry in collection. +// Final state of iterator is Valid() iff collection is not empty. +void VectorRep::Iterator::SeekToFirst() { + DoSort(); + cit_ = bucket_->begin(); +} + +// Position at the last entry in collection. +// Final state of iterator is Valid() iff collection is not empty. +void VectorRep::Iterator::SeekToLast() { + DoSort(); + cit_ = bucket_->end(); + if (bucket_->size() != 0) { + --cit_; + } +} + +void VectorRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + rwlock_.ReadLock(); + VectorRep* vector_rep; + std::shared_ptr bucket; + if (immutable_) { + vector_rep = this; + } else { + vector_rep = nullptr; + bucket.reset(new Bucket(*bucket_)); // make a copy + } + VectorRep::Iterator iter(vector_rep, immutable_ ? bucket_ : bucket, compare_); + rwlock_.Unlock(); + + for (iter.Seek(k.user_key(), k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { + } +} + +MemTableRep::Iterator* VectorRep::GetIterator() { + ReadLock l(&rwlock_); + // Do not sort here. 
The sorting would be done the first time + // a Seek is performed on the iterator. + if (immutable_) { + return new Iterator(this, bucket_, compare_); + } else { + std::shared_ptr tmp; + tmp.reset(new Bucket(*bucket_)); // make a copy + return new Iterator(nullptr, tmp, compare_); + } +} +} // anon namespace + +MemTableRep* VectorRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform*) { + return new VectorRep(compare, arena, count_); +} +} // namespace rocksdb diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc new file mode 100644 index 00000000..32c3b848 --- /dev/null +++ b/utilities/backupable/backupable_db.cc @@ -0,0 +1,1175 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "utilities/backupable_db.h" +#include "db/filename.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "rocksdb/transaction_log.h" + +#define __STDC_FORMAT_MACROS + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +namespace { +class RateLimiter { + public: + RateLimiter(Env* env, uint64_t max_bytes_per_second, uint64_t bytes_per_check) + : env_(env), + max_bytes_per_second_(max_bytes_per_second), + bytes_per_check_(bytes_per_check), + micros_start_time_(env->NowMicros()), + bytes_since_start_(0) {} + + void ReportAndWait(uint64_t bytes_since_last_call) { + bytes_since_start_ += bytes_since_last_call; + if (bytes_since_start_ < bytes_per_check_) { + // not enough bytes to be rate-limited + return; + } + + uint64_t now = env_->NowMicros(); + uint64_t interval = now - micros_start_time_; + uint64_t should_take_micros = + (bytes_since_start_ * kMicrosInSecond) / max_bytes_per_second_; + + if (should_take_micros > interval) { + env_->SleepForMicroseconds(should_take_micros - interval); + now = env_->NowMicros(); + } + // reset interval + micros_start_time_ = now; + bytes_since_start_ = 0; + } + + private: + Env* env_; + uint64_t max_bytes_per_second_; + uint64_t bytes_per_check_; + uint64_t micros_start_time_; + uint64_t bytes_since_start_; + static const uint64_t kMicrosInSecond = 1000 * 1000LL; +}; +} // namespace + +void BackupableDBOptions::Dump(Logger* logger) const { + Log(logger, " Options.backup_dir: %s", backup_dir.c_str()); + Log(logger, " Options.backup_env: %p", backup_env); + Log(logger, " Options.share_table_files: %d", + static_cast(share_table_files)); + Log(logger, " Options.info_log: %p", info_log); + Log(logger, " Options.sync: %d", static_cast(sync)); + Log(logger, " Options.destroy_old_data: %d", + static_cast(destroy_old_data)); + Log(logger, " Options.backup_log_files: %d", + static_cast(backup_log_files)); + Log(logger, " Options.backup_rate_limit: %" PRIu64, 
backup_rate_limit); + Log(logger, "Options.restore_rate_limit: %" PRIu64, restore_rate_limit); +} + +// -------- BackupEngineImpl class --------- +class BackupEngineImpl : public BackupEngine { + public: + BackupEngineImpl(Env* db_env, const BackupableDBOptions& options); + ~BackupEngineImpl(); + Status CreateNewBackup(DB* db, bool flush_before_backup = false); + Status PurgeOldBackups(uint32_t num_backups_to_keep); + Status DeleteBackup(BackupID backup_id); + void StopBackup() { + stop_backup_.store(true, std::memory_order_release); + } + + void GetBackupInfo(std::vector* backup_info); + Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, + const std::string& wal_dir, + const RestoreOptions& restore_options = + RestoreOptions()); + Status RestoreDBFromLatestBackup(const std::string& db_dir, + const std::string& wal_dir, + const RestoreOptions& restore_options = + RestoreOptions()) { + return RestoreDBFromBackup(latest_backup_id_, db_dir, wal_dir, + restore_options); + } + + private: + void DeleteChildren(const std::string& dir, uint32_t file_type_filter = 0); + + struct FileInfo { + FileInfo(const std::string& fname, uint64_t sz, uint32_t checksum) + : refs(0), filename(fname), size(sz), checksum_value(checksum) {} + + int refs; + const std::string filename; + const uint64_t size; + uint32_t checksum_value; + }; + + class BackupMeta { + public: + BackupMeta(const std::string& meta_filename, + std::unordered_map* file_infos, Env* env) + : timestamp_(0), size_(0), meta_filename_(meta_filename), + file_infos_(file_infos), env_(env) {} + + ~BackupMeta() {} + + void RecordTimestamp() { + env_->GetCurrentTime(×tamp_); + } + int64_t GetTimestamp() const { + return timestamp_; + } + uint64_t GetSize() const { + return size_; + } + void SetSequenceNumber(uint64_t sequence_number) { + sequence_number_ = sequence_number; + } + uint64_t GetSequenceNumber() { + return sequence_number_; + } + + Status AddFile(const FileInfo& file_info); + + void Delete(); 
+ + bool Empty() { + return files_.empty(); + } + + const std::vector& GetFiles() { + return files_; + } + + Status LoadFromFile(const std::string& backup_dir); + Status StoreToFile(bool sync); + + private: + int64_t timestamp_; + // sequence number is only approximate, should not be used + // by clients + uint64_t sequence_number_; + uint64_t size_; + std::string const meta_filename_; + // files with relative paths (without "/" prefix!!) + std::vector files_; + std::unordered_map* file_infos_; + Env* env_; + + static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB + }; // BackupMeta + + inline std::string GetAbsolutePath( + const std::string &relative_path = "") const { + assert(relative_path.size() == 0 || relative_path[0] != '/'); + return options_.backup_dir + "/" + relative_path; + } + inline std::string GetPrivateDirRel() const { + return "private"; + } + inline std::string GetPrivateFileRel(BackupID backup_id, + bool tmp = false, + const std::string& file = "") const { + assert(file.size() == 0 || file[0] != '/'); + return GetPrivateDirRel() + "/" + std::to_string(backup_id) + + (tmp ? ".tmp" : "") + "/" + file; + } + inline std::string GetSharedFileRel(const std::string& file = "", + bool tmp = false) const { + assert(file.size() == 0 || file[0] != '/'); + return "shared/" + file + (tmp ? ".tmp" : ""); + } + inline std::string GetLatestBackupFile(bool tmp = false) const { + return GetAbsolutePath(std::string("LATEST_BACKUP") + (tmp ? 
".tmp" : "")); + } + inline std::string GetBackupMetaDir() const { + return GetAbsolutePath("meta"); + } + inline std::string GetBackupMetaFile(BackupID backup_id) const { + return GetBackupMetaDir() + "/" + std::to_string(backup_id); + } + + Status GetLatestBackupFileContents(uint32_t* latest_backup); + Status PutLatestBackupFileContents(uint32_t latest_backup); + // if size_limit == 0, there is no size limit, copy everything + Status CopyFile(const std::string& src, + const std::string& dst, + Env* src_env, + Env* dst_env, + bool sync, + RateLimiter* rate_limiter, + uint64_t* size = nullptr, + uint32_t* checksum_value = nullptr, + uint64_t size_limit = 0); + // if size_limit == 0, there is no size limit, copy everything + Status BackupFile(BackupID backup_id, + BackupMeta* backup, + bool shared, + const std::string& src_dir, + const std::string& src_fname, // starts with "/" + RateLimiter* rate_limiter, + uint64_t size_limit = 0); + + Status CalculateChecksum(const std::string& src, + Env* src_env, + uint64_t size_limit, + uint32_t* checksum_value); + + // Will delete all the files we don't need anymore + // If full_scan == true, it will do the full scan of files/ directory + // and delete all the files that are not referenced from backuped_file_infos__ + void GarbageCollection(bool full_scan); + + // backup state data + BackupID latest_backup_id_; + std::map backups_; + std::unordered_map backuped_file_infos_; + std::vector obsolete_backups_; + std::atomic stop_backup_; + + // options data + BackupableDBOptions options_; + Env* db_env_; + Env* backup_env_; + + // directories + unique_ptr backup_directory_; + unique_ptr shared_directory_; + unique_ptr meta_directory_; + unique_ptr private_directory_; + + static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB + size_t copy_file_buffer_size_; +}; + +BackupEngine* BackupEngine::NewBackupEngine( + Env* db_env, const BackupableDBOptions& options) { + return new BackupEngineImpl(db_env, options); +} 
+ +BackupEngineImpl::BackupEngineImpl(Env* db_env, + const BackupableDBOptions& options) + : stop_backup_(false), + options_(options), + db_env_(db_env), + backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_), + copy_file_buffer_size_(kDefaultCopyFileBufferSize) { + options_.Dump(options_.info_log); + + // create all the dirs we need + backup_env_->CreateDirIfMissing(GetAbsolutePath()); + backup_env_->NewDirectory(GetAbsolutePath(), &backup_directory_); + if (options_.share_table_files) { + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel())); + backup_env_->NewDirectory(GetAbsolutePath(GetSharedFileRel()), + &shared_directory_); + } + backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel())); + backup_env_->NewDirectory(GetAbsolutePath(GetPrivateDirRel()), + &private_directory_); + backup_env_->CreateDirIfMissing(GetBackupMetaDir()); + backup_env_->NewDirectory(GetBackupMetaDir(), &meta_directory_); + + std::vector backup_meta_files; + backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files); + // create backups_ structure + for (auto& file : backup_meta_files) { + BackupID backup_id = 0; + sscanf(file.c_str(), "%u", &backup_id); + if (backup_id == 0 || file != std::to_string(backup_id)) { + // invalid file name, delete that + backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file); + continue; + } + assert(backups_.find(backup_id) == backups_.end()); + backups_.insert(std::make_pair( + backup_id, BackupMeta(GetBackupMetaFile(backup_id), + &backuped_file_infos_, backup_env_))); + } + + if (options_.destroy_old_data) { // Destory old data + for (auto& backup : backups_) { + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + backups_.clear(); + // start from beginning + latest_backup_id_ = 0; + // GarbageCollection() will do the actual deletion + } else { // Load data from storage + // load the backups if any + for (auto& backup : backups_) { + Status s = 
backup.second.LoadFromFile(options_.backup_dir); + if (!s.ok()) { + Log(options_.info_log, "Backup %u corrupted - deleting -- %s", + backup.first, s.ToString().c_str()); + backup.second.Delete(); + obsolete_backups_.push_back(backup.first); + } + } + // delete obsolete backups from the structure + for (auto ob : obsolete_backups_) { + backups_.erase(ob); + } + + Status s = GetLatestBackupFileContents(&latest_backup_id_); + // If latest backup file is corrupted or non-existent + // set latest backup as the biggest backup we have + // or 0 if we have no backups + if (!s.ok() || + backups_.find(latest_backup_id_) == backups_.end()) { + auto itr = backups_.end(); + latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first; + } + } + + // delete any backups that claim to be later than latest + for (auto itr = backups_.upper_bound(latest_backup_id_); + itr != backups_.end();) { + itr->second.Delete(); + obsolete_backups_.push_back(itr->first); + itr = backups_.erase(itr); + } + + PutLatestBackupFileContents(latest_backup_id_); // Ignore errors + GarbageCollection(true); + Log(options_.info_log, + "Initialized BackupEngine, the latest backup is %u.", + latest_backup_id_); +} + +BackupEngineImpl::~BackupEngineImpl() { LogFlush(options_.info_log); } + +Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { + Status s; + std::vector live_files; + VectorLogPtr live_wal_files; + uint64_t manifest_file_size = 0; + uint64_t sequence_number = db->GetLatestSequenceNumber(); + + s = db->DisableFileDeletions(); + if (s.ok()) { + // this will return live_files prefixed with "/" + s = db->GetLiveFiles(live_files, &manifest_file_size, flush_before_backup); + } + // if we didn't flush before backup, we need to also get WAL files + if (s.ok() && !flush_before_backup && options_.backup_log_files) { + // returns file names prefixed with "/" + s = db->GetSortedWalFiles(live_wal_files); + } + if (!s.ok()) { + db->EnableFileDeletions(); + return s; + } + + 
BackupID new_backup_id = latest_backup_id_ + 1; + assert(backups_.find(new_backup_id) == backups_.end()); + auto ret = backups_.insert(std::make_pair( + new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id), + &backuped_file_infos_, backup_env_))); + assert(ret.second == true); + auto& new_backup = ret.first->second; + new_backup.RecordTimestamp(); + new_backup.SetSequenceNumber(sequence_number); + + Log(options_.info_log, "Started the backup process -- creating backup %u", + new_backup_id); + + // create temporary private dir + s = backup_env_->CreateDir( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true))); + + unique_ptr rate_limiter; + if (options_.backup_rate_limit > 0) { + copy_file_buffer_size_ = options_.backup_rate_limit / 10; + rate_limiter.reset(new RateLimiter(db_env_, options_.backup_rate_limit, + copy_file_buffer_size_)); + } + + // copy live_files + for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + uint64_t number; + FileType type; + bool ok = ParseFileName(live_files[i], &number, &type); + if (!ok) { + assert(false); + return Status::Corruption("Can't parse file name. This is very bad"); + } + // we should only get sst, manifest and current files here + assert(type == kTableFile || type == kDescriptorFile || + type == kCurrentFile); + + // rules: + // * if it's kTableFile, than it's shared + // * if it's kDescriptorFile, limit the size to manifest_file_size + s = BackupFile(new_backup_id, + &new_backup, + options_.share_table_files && type == kTableFile, + db->GetName(), /* src_dir */ + live_files[i], /* src_fname */ + rate_limiter.get(), + (type == kDescriptorFile) ? 
manifest_file_size : 0); + } + + // copy WAL files + for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) { + if (live_wal_files[i]->Type() == kAliveLogFile) { + // we only care about live log files + // copy the file into backup_dir/files// + s = BackupFile(new_backup_id, + &new_backup, + false, /* not shared */ + db->GetOptions().wal_dir, + live_wal_files[i]->PathName(), + rate_limiter.get()); + } + } + + // we copied all the files, enable file deletions + db->EnableFileDeletions(); + + if (s.ok()) { + // move tmp private backup to real backup folder + s = backup_env_->RenameFile( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)), // tmp + GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); + } + + if (s.ok()) { + // persist the backup metadata on the disk + s = new_backup.StoreToFile(options_.sync); + } + if (s.ok()) { + // install the newly created backup meta! (atomic) + s = PutLatestBackupFileContents(new_backup_id); + } + if (s.ok() && options_.sync) { + unique_ptr backup_private_directory; + backup_env_->NewDirectory( + GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)), + &backup_private_directory); + if (backup_private_directory != nullptr) { + backup_private_directory->Fsync(); + } + if (private_directory_ != nullptr) { + private_directory_->Fsync(); + } + if (meta_directory_ != nullptr) { + meta_directory_->Fsync(); + } + if (shared_directory_ != nullptr) { + shared_directory_->Fsync(); + } + if (backup_directory_ != nullptr) { + backup_directory_->Fsync(); + } + } + + if (!s.ok()) { + // clean all the files we might have created + Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); + backups_.erase(new_backup_id); + GarbageCollection(true); + return s; + } + + // here we know that we succeeded and installed the new backup + // in the LATEST_BACKUP file + latest_backup_id_ = new_backup_id; + Log(options_.info_log, "Backup DONE. 
All is good"); + return s; +} + +Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { + Log(options_.info_log, "Purging old backups, keeping %u", + num_backups_to_keep); + while (num_backups_to_keep < backups_.size()) { + Log(options_.info_log, "Deleting backup %u", backups_.begin()->first); + backups_.begin()->second.Delete(); + obsolete_backups_.push_back(backups_.begin()->first); + backups_.erase(backups_.begin()); + } + GarbageCollection(false); + return Status::OK(); +} + +Status BackupEngineImpl::DeleteBackup(BackupID backup_id) { + Log(options_.info_log, "Deleting backup %u", backup_id); + auto backup = backups_.find(backup_id); + if (backup == backups_.end()) { + return Status::NotFound("Backup not found"); + } + backup->second.Delete(); + obsolete_backups_.push_back(backup_id); + backups_.erase(backup); + GarbageCollection(false); + return Status::OK(); +} + +void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { + backup_info->reserve(backups_.size()); + for (auto& backup : backups_) { + if (!backup.second.Empty()) { + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + } + } +} + +Status BackupEngineImpl::RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options) { + auto backup_itr = backups_.find(backup_id); + if (backup_itr == backups_.end()) { + return Status::NotFound("Backup not found"); + } + auto& backup = backup_itr->second; + if (backup.Empty()) { + return Status::NotFound("Backup not found"); + } + + Log(options_.info_log, "Restoring backup id %u\n", backup_id); + Log(options_.info_log, "keep_log_files: %d\n", + static_cast(restore_options.keep_log_files)); + + // just in case. 
Ignore errors + db_env_->CreateDirIfMissing(db_dir); + db_env_->CreateDirIfMissing(wal_dir); + + if (restore_options.keep_log_files) { + // delete files in db_dir, but keep all the log files + DeleteChildren(db_dir, 1 << kLogFile); + // move all the files from archive dir to wal_dir + std::string archive_dir = ArchivalDirectory(wal_dir); + std::vector archive_files; + db_env_->GetChildren(archive_dir, &archive_files); // ignore errors + for (const auto& f : archive_files) { + uint64_t number; + FileType type; + bool ok = ParseFileName(f, &number, &type); + if (ok && type == kLogFile) { + Log(options_.info_log, "Moving log file from archive/ to wal_dir: %s", + f.c_str()); + Status s = + db_env_->RenameFile(archive_dir + "/" + f, wal_dir + "/" + f); + if (!s.ok()) { + // if we can't move log file from archive_dir to wal_dir, + // we should fail, since it might mean data loss + return s; + } + } + } + } else { + DeleteChildren(wal_dir); + DeleteChildren(ArchivalDirectory(wal_dir)); + DeleteChildren(db_dir); + } + + unique_ptr rate_limiter; + if (options_.restore_rate_limit > 0) { + copy_file_buffer_size_ = options_.restore_rate_limit / 10; + rate_limiter.reset(new RateLimiter(db_env_, options_.restore_rate_limit, + copy_file_buffer_size_)); + } + Status s; + for (auto& file : backup.GetFiles()) { + std::string dst; + // 1. extract the filename + size_t slash = file.find_last_of('/'); + // file will either be shared/ or private// + assert(slash != std::string::npos); + dst = file.substr(slash + 1); + + // 2. find the filetype + uint64_t number; + FileType type; + bool ok = ParseFileName(dst, &number, &type); + if (!ok) { + return Status::Corruption("Backup corrupted"); + } + // 3. Construct the final path + // kLogFile lives in wal_dir and all the rest live in db_dir + dst = ((type == kLogFile) ? 
wal_dir : db_dir) + + "/" + dst; + + Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str()); + uint32_t checksum_value; + s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false, + rate_limiter.get(), nullptr /* size */, &checksum_value); + if (!s.ok()) { + break; + } + + const auto iter = backuped_file_infos_.find(file); + assert(iter != backuped_file_infos_.end()); + if (iter->second.checksum_value != checksum_value) { + s = Status::Corruption("Checksum check failed"); + break; + } + } + + Log(options_.info_log, "Restoring done -- %s\n", s.ToString().c_str()); + return s; +} + +// latest backup id is an ASCII representation of latest backup id +Status BackupEngineImpl::GetLatestBackupFileContents(uint32_t* latest_backup) { + Status s; + unique_ptr file; + s = backup_env_->NewSequentialFile(GetLatestBackupFile(), + &file, + EnvOptions()); + if (!s.ok()) { + return s; + } + + char buf[11]; + Slice data; + s = file->Read(10, &data, buf); + if (!s.ok() || data.size() == 0) { + return s.ok() ? Status::Corruption("Latest backup file corrupted") : s; + } + buf[data.size()] = 0; + + *latest_backup = 0; + sscanf(data.data(), "%u", latest_backup); + if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) { + s = Status::Corruption("Latest backup file corrupted"); + } + return Status::OK(); +} + +// this operation HAS to be atomic +// writing 4 bytes to the file is atomic alright, but we should *never* +// do something like 1. delete file, 2. 
write new file +// We write to a tmp file and then atomically rename +Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) { + Status s; + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = backup_env_->NewWritableFile(GetLatestBackupFile(true), + &file, + env_options); + if (!s.ok()) { + backup_env_->DeleteFile(GetLatestBackupFile(true)); + return s; + } + + char file_contents[10]; + int len = sprintf(file_contents, "%u\n", latest_backup); + s = file->Append(Slice(file_contents, len)); + if (s.ok() && options_.sync) { + file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + if (s.ok()) { + // atomically replace real file with new tmp + s = backup_env_->RenameFile(GetLatestBackupFile(true), + GetLatestBackupFile(false)); + } + return s; +} + +Status BackupEngineImpl::CopyFile(const std::string& src, + const std::string& dst, Env* src_env, + Env* dst_env, bool sync, + RateLimiter* rate_limiter, uint64_t* size, + uint32_t* checksum_value, + uint64_t size_limit) { + Status s; + unique_ptr dst_file; + unique_ptr src_file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + env_options.use_os_buffer = false; + if (size != nullptr) { + *size = 0; + } + if (checksum_value != nullptr) { + *checksum_value = 0; + } + + // Check if size limit is set. if not, set it to very big number + if (size_limit == 0) { + size_limit = std::numeric_limits::max(); + } + + s = src_env->NewSequentialFile(src, &src_file, env_options); + if (s.ok()) { + s = dst_env->NewWritableFile(dst, &dst_file, env_options); + } + if (!s.ok()) { + return s; + } + + unique_ptr buf(new char[copy_file_buffer_size_]); + Slice data; + + do { + if (stop_backup_.load(std::memory_order_acquire)) { + return Status::Incomplete("Backup stopped"); + } + size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? 
+ copy_file_buffer_size_ : size_limit; + s = src_file->Read(buffer_to_read, &data, buf.get()); + size_limit -= data.size(); + + if (!s.ok()) { + return s; + } + + if (size != nullptr) { + *size += data.size(); + } + if (checksum_value != nullptr) { + *checksum_value = crc32c::Extend(*checksum_value, data.data(), + data.size()); + } + s = dst_file->Append(data); + if (rate_limiter != nullptr) { + rate_limiter->ReportAndWait(data.size()); + } + } while (s.ok() && data.size() > 0 && size_limit > 0); + + if (s.ok() && sync) { + s = dst_file->Sync(); + } + + return s; +} + +// src_fname will always start with "/" +Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup, + bool shared, const std::string& src_dir, + const std::string& src_fname, + RateLimiter* rate_limiter, + uint64_t size_limit) { + + assert(src_fname.size() > 0 && src_fname[0] == '/'); + std::string dst_relative = src_fname.substr(1); + std::string dst_relative_tmp; + if (shared) { + dst_relative_tmp = GetSharedFileRel(dst_relative, true); + dst_relative = GetSharedFileRel(dst_relative, false); + } else { + dst_relative_tmp = GetPrivateFileRel(backup_id, true, dst_relative); + dst_relative = GetPrivateFileRel(backup_id, false, dst_relative); + } + std::string dst_path = GetAbsolutePath(dst_relative); + std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp); + Status s; + uint64_t size; + + // if it's shared, we also need to check if it exists -- if it does, + // no need to copy it again + uint32_t checksum_value = 0; + if (shared && backup_env_->FileExists(dst_path)) { + backup_env_->GetFileSize(dst_path, &size); // Ignore error + Log(options_.info_log, "%s already present, calculate checksum", + src_fname.c_str()); + s = CalculateChecksum(src_dir + src_fname, + db_env_, + size_limit, + &checksum_value); + } else { + Log(options_.info_log, "Copying %s", src_fname.c_str()); + s = CopyFile(src_dir + src_fname, + dst_path_tmp, + db_env_, + backup_env_, + options_.sync, + 
rate_limiter, + &size, + &checksum_value, + size_limit); + if (s.ok() && shared) { + s = backup_env_->RenameFile(dst_path_tmp, dst_path); + } + } + if (s.ok()) { + s = backup->AddFile(FileInfo(dst_relative, size, checksum_value)); + } + return s; +} + +Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env, + uint64_t size_limit, + uint32_t* checksum_value) { + *checksum_value = 0; + if (size_limit == 0) { + size_limit = std::numeric_limits::max(); + } + + EnvOptions env_options; + env_options.use_mmap_writes = false; + env_options.use_os_buffer = false; + + std::unique_ptr src_file; + Status s = src_env->NewSequentialFile(src, &src_file, env_options); + if (!s.ok()) { + return s; + } + + std::unique_ptr buf(new char[copy_file_buffer_size_]); + Slice data; + + do { + if (stop_backup_.load(std::memory_order_acquire)) { + return Status::Incomplete("Backup stopped"); + } + size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? + copy_file_buffer_size_ : size_limit; + s = src_file->Read(buffer_to_read, &data, buf.get()); + + if (!s.ok()) { + return s; + } + + size_limit -= data.size(); + *checksum_value = crc32c::Extend(*checksum_value, data.data(), data.size()); + } while (data.size() > 0 && size_limit > 0); + + return s; +} + +void BackupEngineImpl::DeleteChildren(const std::string& dir, + uint32_t file_type_filter) { + std::vector children; + db_env_->GetChildren(dir, &children); // ignore errors + + for (const auto& f : children) { + uint64_t number; + FileType type; + bool ok = ParseFileName(f, &number, &type); + if (ok && (file_type_filter & (1 << type))) { + // don't delete this file + continue; + } + db_env_->DeleteFile(dir + "/" + f); // ignore errors + } +} + +void BackupEngineImpl::GarbageCollection(bool full_scan) { + Log(options_.info_log, "Starting garbage collection"); + std::vector to_delete; + for (auto& itr : backuped_file_infos_) { + if (itr.second.refs == 0) { + Status s = 
backup_env_->DeleteFile(GetAbsolutePath(itr.first)); + Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + s.ToString().c_str()); + to_delete.push_back(itr.first); + } + } + for (auto& td : to_delete) { + backuped_file_infos_.erase(td); + } + if (!full_scan) { + // take care of private dirs -- if full_scan == true, then full_scan will + // take care of them + for (auto backup_id : obsolete_backups_) { + std::string private_dir = GetPrivateFileRel(backup_id); + Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); + Log(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), s.ToString().c_str()); + } + } + obsolete_backups_.clear(); + + if (full_scan) { + Log(options_.info_log, "Starting full scan garbage collection"); + // delete obsolete shared files + std::vector shared_children; + backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), + &shared_children); + for (auto& child : shared_children) { + std::string rel_fname = GetSharedFileRel(child); + // if it's not refcounted, delete it + if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) { + // this might be a directory, but DeleteFile will just fail in that + // case, so we're good + Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", rel_fname.c_str()); + } + } + } + + // delete obsolete private files + std::vector private_children; + backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), + &private_children); + for (auto& child : private_children) { + BackupID backup_id = 0; + bool tmp_dir = child.find(".tmp") != std::string::npos; + sscanf(child.c_str(), "%u", &backup_id); + if (!tmp_dir && // if it's tmp_dir, delete it + (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { + // it's either not a number or it's still alive. 
continue + continue; + } + // here we have to delete the dir and all its children + std::string full_private_path = + GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); + std::vector subchildren; + backup_env_->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s = backup_env_->DeleteFile(full_private_path + subchild); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", + (full_private_path + subchild).c_str()); + } + } + // finally delete the private dir + Status s = backup_env_->DeleteDir(full_private_path); + Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); + } + } +} + +// ------- BackupMeta class -------- + +Status BackupEngineImpl::BackupMeta::AddFile(const FileInfo& file_info) { + size_ += file_info.size; + files_.push_back(file_info.filename); + + auto itr = file_infos_->find(file_info.filename); + if (itr == file_infos_->end()) { + auto ret = file_infos_->insert({file_info.filename, file_info}); + if (ret.second) { + ret.first->second.refs = 1; + } else { + // if this happens, something is seriously wrong + return Status::Corruption("In memory metadata insertion error"); + } + } else { + if (itr->second.checksum_value != file_info.checksum_value) { + return Status::Corruption("Checksum mismatch for existing backup file"); + } + ++itr->second.refs; // increase refcount if already present + } + + return Status::OK(); +} + +void BackupEngineImpl::BackupMeta::Delete() { + for (const auto& file : files_) { + auto itr = file_infos_->find(file); + assert(itr != file_infos_->end()); + --(itr->second.refs); // decrease refcount + } + files_.clear(); + // delete meta file + env_->DeleteFile(meta_filename_); + timestamp_ = 0; +} + +// each backup meta file is of the format: +// +// +// +// +// +// ... 
+Status BackupEngineImpl::BackupMeta::LoadFromFile( + const std::string& backup_dir) { + assert(Empty()); + Status s; + unique_ptr backup_meta_file; + s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions()); + if (!s.ok()) { + return s; + } + + unique_ptr buf(new char[max_backup_meta_file_size_ + 1]); + Slice data; + s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get()); + + if (!s.ok() || data.size() == max_backup_meta_file_size_) { + return s.ok() ? Status::Corruption("File size too big") : s; + } + buf[data.size()] = 0; + + uint32_t num_files = 0; + int bytes_read = 0; + sscanf(data.data(), "%" PRId64 "%n", ×tamp_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + sscanf(data.data(), "%u%n", &num_files, &bytes_read); + data.remove_prefix(bytes_read + 1); // +1 for '\n' + + std::vector files; + + for (uint32_t i = 0; s.ok() && i < num_files; ++i) { + auto line = GetSliceUntil(&data, '\n'); + std::string filename = GetSliceUntil(&line, ' ').ToString(); + + uint64_t size; + s = env_->GetFileSize(backup_dir + "/" + filename, &size); + if (!s.ok()) { + return s; + } + + if (line.empty()) { + return Status::Corruption("File checksum is missing"); + } + + uint32_t checksum_value = 0; + if (line.starts_with("crc32 ")) { + line.remove_prefix(6); + sscanf(line.data(), "%u", &checksum_value); + if (memcmp(line.data(), std::to_string(checksum_value).c_str(), + line.size() - 1) != 0) { + return Status::Corruption("Invalid checksum value"); + } + } else { + return Status::Corruption("Unknown checksum type"); + } + + files.emplace_back(filename, size, checksum_value); + } + + if (s.ok() && data.size() > 0) { + // file has to be read completely. 
if not, we count it as corruption + s = Status::Corruption("Tailing data in backup meta file"); + } + + if (s.ok()) { + for (const auto& file_info : files) { + s = AddFile(file_info); + if (!s.ok()) { + break; + } + } + } + + return s; +} + +Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { + Status s; + unique_ptr backup_meta_file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file, + env_options); + if (!s.ok()) { + return s; + } + + unique_ptr buf(new char[max_backup_meta_file_size_]); + int len = 0, buf_size = max_backup_meta_file_size_; + len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_); + len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n", + sequence_number_); + len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size()); + for (const auto& file : files_) { + const auto& iter = file_infos_->find(file); + + assert(iter != file_infos_->end()); + // use crc32 for now, switch to something else if needed + len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n", + file.c_str(), iter->second.checksum_value); + } + + s = backup_meta_file->Append(Slice(buf.get(), (size_t)len)); + if (s.ok() && sync) { + s = backup_meta_file->Sync(); + } + if (s.ok()) { + s = backup_meta_file->Close(); + } + if (s.ok()) { + s = env_->RenameFile(meta_filename_ + ".tmp", meta_filename_); + } + return s; +} + +// --- BackupableDB methods -------- + +BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options) + : StackableDB(db), + backup_engine_(new BackupEngineImpl(db->GetEnv(), options)) {} + +BackupableDB::~BackupableDB() { + delete backup_engine_; +} + +Status BackupableDB::CreateNewBackup(bool flush_before_backup) { + return backup_engine_->CreateNewBackup(this, flush_before_backup); +} + +void BackupableDB::GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); +} + +Status 
BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status BackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +void BackupableDB::StopBackup() { + backup_engine_->StopBackup(); +} + +// --- RestoreBackupableDB methods ------ + +RestoreBackupableDB::RestoreBackupableDB(Env* db_env, + const BackupableDBOptions& options) + : backup_engine_(new BackupEngineImpl(db_env, options)) {} + +RestoreBackupableDB::~RestoreBackupableDB() { + delete backup_engine_; +} + +void +RestoreBackupableDB::GetBackupInfo(std::vector* backup_info) { + backup_engine_->GetBackupInfo(backup_info); +} + +Status RestoreBackupableDB::RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options) { + return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir, + restore_options); +} + +Status RestoreBackupableDB::RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& restore_options) { + return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir, + restore_options); +} + +Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { + return backup_engine_->PurgeOldBackups(num_backups_to_keep); +} + +Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) { + return backup_engine_->DeleteBackup(backup_id); +} + +} // namespace rocksdb diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc new file mode 100644 index 00000000..6b10c941 --- /dev/null +++ b/utilities/backupable/backupable_db_test.cc @@ -0,0 +1,853 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "utilities/utility_db.h" +#include "utilities/backupable_db.h" +#include "util/testharness.h" +#include "util/random.h" +#include "util/testutil.h" +#include "util/auto_roll_logger.h" + +#include +#include + +namespace rocksdb { + +namespace { + +using std::unique_ptr; + +class DummyDB : public StackableDB { + public: + /* implicit */ + DummyDB(const Options& options, const std::string& dbname) + : StackableDB(nullptr), options_(options), dbname_(dbname), + deletions_enabled_(true), sequence_number_(0) {} + + virtual SequenceNumber GetLatestSequenceNumber() const { + return ++sequence_number_; + } + + virtual const std::string& GetName() const override { + return dbname_; + } + + virtual Env* GetEnv() const override { + return options_.env; + } + + virtual const Options& GetOptions() const override { + return options_; + } + + virtual Status EnableFileDeletions(bool force) override { + ASSERT_TRUE(!deletions_enabled_); + deletions_enabled_ = true; + return Status::OK(); + } + + virtual Status DisableFileDeletions() override { + ASSERT_TRUE(deletions_enabled_); + deletions_enabled_ = false; + return Status::OK(); + } + + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, + bool flush_memtable = true) override { + ASSERT_TRUE(!deletions_enabled_); + vec = live_files_; + *mfs = 100; + return Status::OK(); + } + + class DummyLogFile : public LogFile { + public: + /* implicit */ + DummyLogFile(const std::string& path, bool alive = true) + : path_(path), alive_(alive) {} + + virtual std::string PathName() const override { + return path_; + } + + virtual uint64_t 
LogNumber() const { + // what business do you have calling this method? + ASSERT_TRUE(false); + return 0; + } + + virtual WalFileType Type() const override { + return alive_ ? kAliveLogFile : kArchivedLogFile; + } + + virtual SequenceNumber StartSequence() const { + // backupabledb should not need this method + ASSERT_TRUE(false); + return 0; + } + + virtual uint64_t SizeFileBytes() const { + // backupabledb should not need this method + ASSERT_TRUE(false); + return 0; + } + + private: + std::string path_; + bool alive_; + }; // DummyLogFile + + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { + ASSERT_TRUE(!deletions_enabled_); + files.resize(wal_files_.size()); + for (size_t i = 0; i < files.size(); ++i) { + files[i].reset( + new DummyLogFile(wal_files_[i].first, wal_files_[i].second)); + } + return Status::OK(); + } + + std::vector live_files_; + // pair + std::vector> wal_files_; + private: + Options options_; + std::string dbname_; + bool deletions_enabled_; + mutable SequenceNumber sequence_number_; +}; // DummyDB + +class TestEnv : public EnvWrapper { + public: + explicit TestEnv(Env* t) : EnvWrapper(t) {} + + class DummySequentialFile : public SequentialFile { + public: + DummySequentialFile() : SequentialFile(), rnd_(5) {} + virtual Status Read(size_t n, Slice* result, char* scratch) { + size_t read_size = (n > size_left) ? size_left : n; + for (size_t i = 0; i < read_size; ++i) { + scratch[i] = rnd_.Next() & 255; + } + *result = Slice(scratch, read_size); + size_left -= read_size; + return Status::OK(); + } + + virtual Status Skip(uint64_t n) { + size_left = (n > size_left) ? 
size_left - n : 0; + return Status::OK(); + } + private: + size_t size_left = 200; + Random rnd_; + }; + + Status NewSequentialFile(const std::string& f, + unique_ptr* r, + const EnvOptions& options) { + if (dummy_sequential_file_) { + r->reset(new TestEnv::DummySequentialFile()); + return Status::OK(); + } else { + return EnvWrapper::NewSequentialFile(f, r, options); + } + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& options) { + written_files_.push_back(f); + if (limit_written_files_ <= 0) { + return Status::NotSupported("Sorry, can't do this"); + } + limit_written_files_--; + return EnvWrapper::NewWritableFile(f, r, options); + } + + void AssertWrittenFiles(std::vector& should_have_written) { + sort(should_have_written.begin(), should_have_written.end()); + sort(written_files_.begin(), written_files_.end()); + ASSERT_TRUE(written_files_ == should_have_written); + } + + void ClearWrittenFiles() { + written_files_.clear(); + } + + void SetLimitWrittenFiles(uint64_t limit) { + limit_written_files_ = limit; + } + + void SetDummySequentialFile(bool dummy_sequential_file) { + dummy_sequential_file_ = dummy_sequential_file; + } + + private: + bool dummy_sequential_file_ = false; + std::vector written_files_; + uint64_t limit_written_files_ = 1000000; +}; // TestEnv + +class FileManager : public EnvWrapper { + public: + explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {} + + Status DeleteRandomFileInDir(const std::string dir) { + std::vector children; + GetChildren(dir, &children); + if (children.size() <= 2) { // . and .. + return Status::NotFound(""); + } + while (true) { + int i = rnd_.Next() % children.size(); + if (children[i] != "." 
&& children[i] != "..") { + return DeleteFile(dir + "/" + children[i]); + } + } + // should never get here + assert(false); + return Status::NotFound(""); + } + + Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) { + uint64_t size; + Status s = GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + s = NewRandomRWFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + + for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) { + std::string tmp; + // write one random byte to a random position + s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp)); + } + return s; + } + + Status CorruptChecksum(const std::string& fname, bool appear_valid) { + std::string metadata; + Status s = ReadFileToString(this, fname, &metadata); + if (!s.ok()) { + return s; + } + s = DeleteFile(fname); + if (!s.ok()) { + return s; + } + + auto pos = metadata.find("private"); + if (pos == std::string::npos) { + return Status::Corruption("private file is expected"); + } + pos = metadata.find(" crc32 ", pos + 6); + if (pos == std::string::npos) { + return Status::Corruption("checksum not found"); + } + + if (metadata.size() < pos + 7) { + return Status::Corruption("bad CRC32 checksum value"); + } + + if (appear_valid) { + if (metadata[pos + 8] == '\n') { + // single digit value, safe to insert one more digit + metadata.insert(pos + 8, 1, '0'); + } else { + metadata.erase(pos + 8, 1); + } + } else { + metadata[pos + 7] = 'a'; + } + + return WriteToFile(fname, metadata); + } + + Status WriteToFile(const std::string& fname, const std::string& data) { + unique_ptr file; + EnvOptions env_options; + env_options.use_mmap_writes = false; + Status s = EnvWrapper::NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + return file->Append(Slice(data)); + } + + private: + Random rnd_; +}; // FileManager + +// utility functions +static size_t 
FillDB(DB* db, int from, int to) { + size_t bytes_written = 0; + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + bytes_written += key.size() + value.size(); + + ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); + } + return bytes_written; +} + +static void AssertExists(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value; + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_EQ(value, "testvalue" + std::to_string(i)); + } +} + +static void AssertEmpty(DB* db, int from, int to) { + for (int i = from; i < to; ++i) { + std::string key = "testkey" + std::to_string(i); + std::string value = "testvalue" + std::to_string(i); + + Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_TRUE(s.IsNotFound()); + } +} + +class BackupableDBTest { + public: + BackupableDBTest() { + // set up files + dbname_ = test::TmpDir() + "/backupable_db"; + backupdir_ = test::TmpDir() + "/backupable_db_backup"; + + // set up envs + env_ = Env::Default(); + test_db_env_.reset(new TestEnv(env_)); + test_backup_env_.reset(new TestEnv(env_)); + file_manager_.reset(new FileManager(env_)); + + // set up db options + options_.create_if_missing = true; + options_.paranoid_checks = true; + options_.write_buffer_size = 1 << 17; // 128KB + options_.env = test_db_env_.get(); + options_.wal_dir = dbname_; + // set up backup db options + CreateLoggerFromOptions(dbname_, backupdir_, env_, + Options(), &logger_); + backupable_options_.reset(new BackupableDBOptions( + backupdir_, test_backup_env_.get(), true, logger_.get(), true)); + + // delete old files in db + DestroyDB(dbname_, Options()); + } + + DB* OpenDB() { + DB* db; + ASSERT_OK(DB::Open(options_, dbname_, &db)); + return db; + } + + void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false, + bool share_table_files = true) { + // 
reset all the defaults + test_backup_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetLimitWrittenFiles(1000000); + test_db_env_->SetDummySequentialFile(dummy); + + DB* db; + if (dummy) { + dummy_db_ = new DummyDB(options_, dbname_); + db = dummy_db_; + } else { + ASSERT_OK(DB::Open(options_, dbname_, &db)); + } + backupable_options_->destroy_old_data = destroy_old_data; + backupable_options_->share_table_files = share_table_files; + db_.reset(new BackupableDB(db, *backupable_options_)); + } + + void CloseBackupableDB() { + db_.reset(nullptr); + } + + void OpenRestoreDB() { + backupable_options_->destroy_old_data = false; + restore_db_.reset( + new RestoreBackupableDB(test_db_env_.get(), *backupable_options_)); + } + + void CloseRestoreDB() { + restore_db_.reset(nullptr); + } + + // restores backup backup_id and asserts the existence of + // [start_exist, end_exist> and not-existence of + // [end_exist, end> + // + // if backup_id == 0, it means restore from latest + // if end == 0, don't check AssertEmpty + void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist, + uint32_t end_exist, uint32_t end = 0, + bool keep_log_files = false) { + RestoreOptions restore_options(keep_log_files); + bool opened_restore = false; + if (restore_db_.get() == nullptr) { + opened_restore = true; + OpenRestoreDB(); + } + if (backup_id > 0) { + ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_, + restore_options)); + } else { + ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_, + restore_options)); + } + DB* db = OpenDB(); + AssertExists(db, start_exist, end_exist); + if (end != 0) { + AssertEmpty(db, end_exist, end); + } + delete db; + if (opened_restore) { + CloseRestoreDB(); + } + } + + // files + std::string dbname_; + std::string backupdir_; + + // envs + Env* env_; + unique_ptr test_db_env_; + unique_ptr test_backup_env_; + unique_ptr file_manager_; + + // all the dbs! 
+ DummyDB* dummy_db_; // BackupableDB owns dummy_db_ + unique_ptr db_; + unique_ptr restore_db_; + + // options + Options options_; + unique_ptr backupable_options_; + std::shared_ptr logger_; +}; // BackupableDBTest + +void AppendPath(const std::string& path, std::vector& v) { + for (auto& f : v) { + f = path + f; + } +} + +// this will make sure that backup does not copy the same file twice +TEST(BackupableDBTest, NoDoubleCopy) { + OpenBackupableDB(true, true); + + // should write 5 DB files + LATEST_BACKUP + one meta file + test_backup_env_->SetLimitWrittenFiles(7); + test_backup_env_->ClearWrittenFiles(); + test_db_env_->SetLimitWrittenFiles(0); + dummy_db_->live_files_ = { "/00010.sst", "/00011.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + std::vector should_have_written = { + "/shared/00010.sst.tmp", + "/shared/00011.sst.tmp", + "/private/1.tmp/CURRENT", + "/private/1.tmp/MANIFEST-01", + "/private/1.tmp/00011.log", + "/meta/1.tmp", + "/LATEST_BACKUP.tmp" + }; + AppendPath(dbname_ + "_backup", should_have_written); + test_backup_env_->AssertWrittenFiles(should_have_written); + + // should write 4 new DB files + LATEST_BACKUP + one meta file + // should not write/copy 00010.sst, since it's already there! 
+ test_backup_env_->SetLimitWrittenFiles(6); + test_backup_env_->ClearWrittenFiles(); + dummy_db_->live_files_ = { "/00010.sst", "/00015.sst", + "/CURRENT", "/MANIFEST-01" }; + dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + ASSERT_OK(db_->CreateNewBackup(false)); + // should not open 00010.sst - it's already there + should_have_written = { + "/shared/00015.sst.tmp", + "/private/2.tmp/CURRENT", + "/private/2.tmp/MANIFEST-01", + "/private/2.tmp/00011.log", + "/meta/2.tmp", + "/LATEST_BACKUP.tmp" + }; + AppendPath(dbname_ + "_backup", should_have_written); + test_backup_env_->AssertWrittenFiles(should_have_written); + + ASSERT_OK(db_->DeleteBackup(1)); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst")); + // 00011.sst was only in backup 1, should be deleted + ASSERT_EQ(false, + test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst")); + ASSERT_EQ(true, + test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + + // MANIFEST file size should be only 100 + uint64_t size; + test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01", &size); + ASSERT_EQ(100UL, size); + test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size); + ASSERT_EQ(200UL, size); + + CloseBackupableDB(); +} + +// test various kind of corruptions that may happen: +// 1. Not able to write a file for backup - that backup should fail, +// everything else should work +// 2. Corrupted/deleted LATEST_BACKUP - everything should work fine +// 3. Corrupted backup meta file or missing backuped file - we should +// not be able to open that backup, but all other backups should be +// fine +// 4. Corrupted checksum value - if the checksum is not a valid uint32_t, +// db open should fail, otherwise, it aborts during the restore process. 
+TEST(BackupableDBTest, CorruptionsTest) { + const int keys_iteration = 5000; + Random rnd(6); + Status s; + + OpenBackupableDB(true); + // create five backups + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + + // ---------- case 1. - fail a write ----------- + // try creating backup 6, but fail a write + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + test_backup_env_->SetLimitWrittenFiles(2); + // should fail + s = db_->CreateNewBackup(!!(rnd.Next() % 2)); + ASSERT_TRUE(!s.ok()); + test_backup_env_->SetLimitWrittenFiles(1000000); + // latest backup should have all the keys + CloseBackupableDB(); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + + // ---------- case 2. - corrupt/delete latest backup ----------- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/LATEST_BACKUP", 2)); + AssertBackupConsistency(0, 0, keys_iteration * 5); + ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP")); + AssertBackupConsistency(0, 0, keys_iteration * 5); + // create backup 6, point LATEST_BACKUP to 5 + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6); + ASSERT_OK(db_->CreateNewBackup(false)); + CloseBackupableDB(); + ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5")); + AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6); + // assert that all 6 data is gone! + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false); + + // --------- case 3. 
corrupted backup meta or missing backuped file ---- + ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3)); + // since 5 meta is now corrupted, latest backup should be 4 + AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + CloseRestoreDB(); + ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4")); + // 4 is corrupted, 3 is the latest backup now + AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5); + OpenRestoreDB(); + s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_); + CloseRestoreDB(); + ASSERT_TRUE(!s.ok()); + + // --------- case 4. corrupted checksum value ---- + ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/3", false)); + // checksum of backup 3 is an invalid value, this can be detected at + // db open time, and it reverts to the previous backup automatically + AssertBackupConsistency(0, 0, keys_iteration * 2, keys_iteration * 5); + // checksum of the backup 2 appears to be valid, this can cause checksum + // mismatch and abort restore process + ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/2", true)); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); + OpenRestoreDB(); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); + s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + ASSERT_OK(restore_db_->DeleteBackup(2)); + CloseRestoreDB(); + AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5); + + // new backup should be 2! 
+ OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + CloseBackupableDB(); + AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5); +} + +// open DB, write, close DB, backup, restore, repeat +TEST(BackupableDBTest, OfflineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + // first iter -- flush before backup + // second iter -- don't flush before backup + for (int iter = 0; iter < 2; ++iter) { + // delete old data + DestroyDB(dbname_, Options()); + bool destroy_data = true; + + // every iteration -- + // 1. insert new data in the DB + // 2. backup the DB + // 3. destroy the db + // 4. restore the db, check everything is still there + for (int i = 0; i < 5; ++i) { + // in last iteration, put smaller amount of data, + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + // ---- insert new data and back up ---- + OpenBackupableDB(destroy_data); + destroy_data = false; + FillDB(db_.get(), keys_iteration * i, fill_up_to); + ASSERT_OK(db_->CreateNewBackup(iter == 0)); + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, fill_up_to); + delete db; + + // ---- restore the DB ---- + OpenRestoreDB(); + if (i >= 3) { // test purge old backups + // when i == 4, purge to only 1 backup + // when i == 3, purge to 2 backups + ASSERT_OK(restore_db_->PurgeOldBackups(5 - i)); + } + // ---- make sure the data is there --- + AssertBackupConsistency(0, 0, fill_up_to, max_key); + CloseRestoreDB(); + } + } +} + +// open DB, write, backup, write, backup, close, restore +TEST(BackupableDBTest, OnlineIntegrationTest) { + // has to be a big number, so that it triggers the memtable flush + const int keys_iteration = 5000; + const int max_key = keys_iteration * 4 + 10; + Random rnd(7); 
+ // delete old data + DestroyDB(dbname_, Options()); + + OpenBackupableDB(true); + // write some data, backup, repeat + for (int i = 0; i < 5; ++i) { + if (i == 4) { + // delete backup number 2, online delete! + OpenRestoreDB(); + ASSERT_OK(restore_db_->DeleteBackup(2)); + CloseRestoreDB(); + } + // in last iteration, put smaller amount of data, + // so that backups can share sst files + int fill_up_to = std::min(keys_iteration * (i + 1), max_key); + FillDB(db_.get(), keys_iteration * i, fill_up_to); + // we should get consistent results with flush_before_backup + // set to both true and false + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); + } + // close and destroy + CloseBackupableDB(); + DestroyDB(dbname_, Options()); + + // ---- make sure it's empty ---- + DB* db = OpenDB(); + AssertEmpty(db, 0, max_key); + delete db; + + // ---- restore every backup and verify all the data is there ---- + OpenRestoreDB(); + for (int i = 1; i <= 5; ++i) { + if (i == 2) { + // we deleted backup 2 + Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); + ASSERT_TRUE(!s.ok()); + } else { + int fill_up_to = std::min(keys_iteration * i, max_key); + AssertBackupConsistency(i, 0, fill_up_to, max_key); + } + } + + // delete some backups -- this should leave only backups 3 and 5 alive + ASSERT_OK(restore_db_->DeleteBackup(4)); + ASSERT_OK(restore_db_->PurgeOldBackups(2)); + + std::vector backup_info; + restore_db_->GetBackupInfo(&backup_info); + ASSERT_EQ(2UL, backup_info.size()); + + // check backup 3 + AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key); + // check backup 5 + AssertBackupConsistency(5, 0, max_key); + + CloseRestoreDB(); +} + +TEST(BackupableDBTest, FailOverwritingBackups) { + options_.write_buffer_size = 1024 * 1024 * 1024; // 1GB + // create backups 1, 2, 3, 4, 5 + OpenBackupableDB(true); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), 100 * i, 100 * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(true)); + CloseBackupableDB(); + 
OpenBackupableDB(false); + } + CloseBackupableDB(); + + // restore 3 + OpenRestoreDB(); + ASSERT_OK(restore_db_->RestoreDBFromBackup(3, dbname_, dbname_)); + CloseRestoreDB(); + + OpenBackupableDB(false); + FillDB(db_.get(), 0, 300); + Status s = db_->CreateNewBackup(true); + // the new backup fails because new table files + // clash with old table files from backups 4 and 5 + // (since write_buffer_size is huge, we can be sure that + // each backup will generate only one sst file and that + // a file generated by a new backup is the same as + // sst file generated by backup 4) + ASSERT_TRUE(s.IsCorruption()); + ASSERT_OK(db_->DeleteBackup(4)); + ASSERT_OK(db_->DeleteBackup(5)); + // now, the backup can succeed + ASSERT_OK(db_->CreateNewBackup(true)); + CloseBackupableDB(); +} + +TEST(BackupableDBTest, NoShareTableFiles) { + const int keys_iteration = 5000; + OpenBackupableDB(true, false, false); + for (int i = 0; i < 5; ++i) { + FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1)); + ASSERT_OK(db_->CreateNewBackup(!!(i % 2))); + } + CloseBackupableDB(); + + for (int i = 0; i < 5; ++i) { + AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1), + keys_iteration * 6); + } +} + +TEST(BackupableDBTest, DeleteTmpFiles) { + OpenBackupableDB(); + CloseBackupableDB(); + std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp"; + std::string private_tmp_dir = backupdir_ + "/private/10.tmp"; + std::string private_tmp_file = private_tmp_dir + "/00003.sst"; + file_manager_->WriteToFile(shared_tmp, "tmp"); + file_manager_->CreateDir(private_tmp_dir); + file_manager_->WriteToFile(private_tmp_file, "tmp"); + ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir)); + OpenBackupableDB(); + CloseBackupableDB(); + ASSERT_EQ(false, file_manager_->FileExists(shared_tmp)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file)); + ASSERT_EQ(false, file_manager_->FileExists(private_tmp_dir)); +} + +TEST(BackupableDBTest, KeepLogFiles) { + 
backupable_options_->backup_log_files = false; + // basically infinite + options_.WAL_ttl_seconds = 24 * 60 * 60; + OpenBackupableDB(true); + FillDB(db_.get(), 0, 100); + ASSERT_OK(db_->Flush(FlushOptions())); + FillDB(db_.get(), 100, 200); + ASSERT_OK(db_->CreateNewBackup(false)); + FillDB(db_.get(), 200, 300); + ASSERT_OK(db_->Flush(FlushOptions())); + FillDB(db_.get(), 300, 400); + ASSERT_OK(db_->Flush(FlushOptions())); + FillDB(db_.get(), 400, 500); + ASSERT_OK(db_->Flush(FlushOptions())); + CloseBackupableDB(); + + // all data should be there if we call with keep_log_files = true + AssertBackupConsistency(0, 0, 500, 600, true); +} + +TEST(BackupableDBTest, RateLimiting) { + uint64_t const KB = 1024 * 1024; + size_t const kMicrosPerSec = 1000 * 1000LL; + + std::vector> limits( + {{KB, 5 * KB}, {2 * KB, 3 * KB}}); + + for (const auto& limit : limits) { + // destroy old data + DestroyDB(dbname_, Options()); + + backupable_options_->backup_rate_limit = limit.first; + backupable_options_->restore_rate_limit = limit.second; + options_.compression = kNoCompression; + OpenBackupableDB(true); + size_t bytes_written = FillDB(db_.get(), 0, 100000); + + auto start_backup = env_->NowMicros(); + ASSERT_OK(db_->CreateNewBackup(false)); + auto backup_time = env_->NowMicros() - start_backup; + auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / + backupable_options_->backup_rate_limit; + ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); + ASSERT_LT(backup_time, 2.5 * rate_limited_backup_time); + + CloseBackupableDB(); + + OpenRestoreDB(); + auto start_restore = env_->NowMicros(); + ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_)); + auto restore_time = env_->NowMicros() - start_restore; + CloseRestoreDB(); + auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / + backupable_options_->restore_rate_limit; + ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); + ASSERT_LT(restore_time, 2.5 * rate_limited_restore_time); + + 
AssertBackupConsistency(0, 0, 100000, 100010); + } +} + +} // anon namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc new file mode 100644 index 00000000..095ecf8a --- /dev/null +++ b/utilities/geodb/geodb_impl.cc @@ -0,0 +1,427 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "utilities/geodb/geodb_impl.h" + +#define __STDC_FORMAT_MACROS + +#include +#include +#include +#include +#include "db/filename.h" +#include "util/coding.h" + +// +// There are two types of keys. The first type of key-values +// maps a geo location to the set of object ids and their values. +// Table 1 +// key : p + : + $quadkey + : + $id + +// : + $latitude + : + $longitude +// value : value of the object +// This table can be used to find all objects that reside near +// a specified geolocation. +// +// Table 2 +// key : 'k' + : + $id +// value: $quadkey + +namespace rocksdb { + +GeoDBImpl::GeoDBImpl(DB* db, const GeoDBOptions& options) : + GeoDB(db, options), db_(db), options_(options) { +} + +GeoDBImpl::~GeoDBImpl() { +} + +Status GeoDBImpl::Insert(const GeoObject& obj) { + WriteBatch batch; + + // It is possible that this id is already associated with + // with a different position. We first have to remove that + // association before we can insert the new one. 
+ + // remove existing object, if it exists + GeoObject old; + Status status = GetById(obj.id, &old); + if (status.ok()) { + assert(obj.id.compare(old.id) == 0); + std::string quadkey = PositionToQuad(old.position, Detail); + std::string key1 = MakeKey1(old.position, old.id, quadkey); + std::string key2 = MakeKey2(old.id); + batch.Delete(Slice(key1)); + batch.Delete(Slice(key2)); + } else if (status.IsNotFound()) { + // What if another thread is trying to insert the same ID concurrently? + } else { + return status; + } + + // insert new object + std::string quadkey = PositionToQuad(obj.position, Detail); + std::string key1 = MakeKey1(obj.position, obj.id, quadkey); + std::string key2 = MakeKey2(obj.id); + batch.Put(Slice(key1), Slice(obj.value)); + batch.Put(Slice(key2), Slice(quadkey)); + return db_->Write(woptions_, &batch); +} + +Status GeoDBImpl::GetByPosition(const GeoPosition& pos, + const Slice& id, + std::string* value) { + std::string quadkey = PositionToQuad(pos, Detail); + std::string key1 = MakeKey1(pos, id, quadkey); + return db_->Get(roptions_, Slice(key1), value); +} + +Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { + Status status; + Slice quadkey; + + // create an iterator so that we can get a consistent picture + // of the database. 
+ Iterator* iter = db_->NewIterator(roptions_); + + // create key for table2 + std::string kt = MakeKey2(id); + Slice key2(kt); + + iter->Seek(key2); + if (iter->Valid() && iter->status().ok()) { + if (iter->key().compare(key2) == 0) { + quadkey = iter->value(); + } + } + if (quadkey.size() == 0) { + delete iter; + return Status::NotFound(key2); + } + + // + // Seek to the quadkey + id prefix + // + std::string prefix = MakeKey1Prefix(quadkey.ToString(), id); + iter->Seek(Slice(prefix)); + assert(iter->Valid()); + if (!iter->Valid() || !iter->status().ok()) { + delete iter; + return Status::NotFound(); + } + + // split the key into p + quadkey + id + lat + lon + std::vector parts; + Slice key = iter->key(); + StringSplit(&parts, key.ToString(), ':'); + assert(parts.size() == 5); + assert(parts[0] == "p"); + assert(parts[1] == quadkey); + assert(parts[2] == id); + + // fill up output parameters + object->position.latitude = atof(parts[3].c_str()); + object->position.longitude = atof(parts[4].c_str()); + object->id = id.ToString(); // this is redundant + object->value = iter->value().ToString(); + delete iter; + return Status::OK(); +} + + +Status GeoDBImpl::Remove(const Slice& id) { + // Read the object from the database + GeoObject obj; + Status status = GetById(id, &obj); + if (!status.ok()) { + return status; + } + + // remove the object by atomically deleting it from both tables + std::string quadkey = PositionToQuad(obj.position, Detail); + std::string key1 = MakeKey1(obj.position, obj.id, quadkey); + std::string key2 = MakeKey2(obj.id); + WriteBatch batch; + batch.Delete(Slice(key1)); + batch.Delete(Slice(key2)); + return db_->Write(woptions_, &batch); +} + +Status GeoDBImpl::SearchRadial(const GeoPosition& pos, + double radius, + std::vector* values, + int number_of_values) { + // Gather all bounding quadkeys + std::vector qids; + Status s = searchQuadIds(pos, radius, &qids); + if (!s.ok()) { + return s; + } + + // create an iterator + Iterator* iter = 
db_->NewIterator(ReadOptions()); + + // Process each prospective quadkey + for (std::string qid : qids) { + // The user is interested in only these many objects. + if (number_of_values == 0) { + break; + } + + // convert quadkey to db key prefix + std::string dbkey = MakeQuadKeyPrefix(qid); + + for (iter->Seek(dbkey); + number_of_values > 0 && iter->Valid() && iter->status().ok(); + iter->Next()) { + // split the key into p + quadkey + id + lat + lon + std::vector parts; + Slice key = iter->key(); + StringSplit(&parts, key.ToString(), ':'); + assert(parts.size() == 5); + assert(parts[0] == "p"); + std::string* quadkey = &parts[1]; + + // If the key we are looking for is a prefix of the key + // we found from the database, then this is one of the keys + // we are looking for. + auto res = std::mismatch(qid.begin(), qid.end(), quadkey->begin()); + if (res.first == qid.end()) { + GeoPosition pos(atof(parts[3].c_str()), atof(parts[4].c_str())); + GeoObject obj(pos, parts[4], iter->value().ToString()); + values->push_back(obj); + number_of_values--; + } else { + break; + } + } + } + delete iter; + return Status::OK(); +} + +std::string GeoDBImpl::MakeKey1(const GeoPosition& pos, Slice id, + std::string quadkey) { + std::string lat = std::to_string(pos.latitude); + std::string lon = std::to_string(pos.longitude); + std::string key = "p:"; + key.reserve(5 + quadkey.size() + id.size() + lat.size() + lon.size()); + key.append(quadkey); + key.append(":"); + key.append(id.ToString()); + key.append(":"); + key.append(lat); + key.append(":"); + key.append(lon); + return key; +} + +std::string GeoDBImpl::MakeKey2(Slice id) { + std::string key = "k:"; + key.append(id.ToString()); + return key; +} + +std::string GeoDBImpl::MakeKey1Prefix(std::string quadkey, + Slice id) { + std::string key = "p:"; + key.reserve(3 + quadkey.size() + id.size()); + key.append(quadkey); + key.append(":"); + key.append(id.ToString()); + return key; +} + +std::string 
GeoDBImpl::MakeQuadKeyPrefix(std::string quadkey) { + std::string key = "p:"; + key.append(quadkey); + return key; +} + +void GeoDBImpl::StringSplit(std::vector* tokens, + const std::string &text, char sep) { + std::size_t start = 0, end = 0; + while ((end = text.find(sep, start)) != std::string::npos) { + tokens->push_back(text.substr(start, end - start)); + start = end + 1; + } + tokens->push_back(text.substr(start)); +} + +// convert degrees to radians +double GeoDBImpl::radians(double x) { + return (x * PI) / 180; +} + +// convert radians to degrees +double GeoDBImpl::degrees(double x) { + return (x * 180) / PI; +} + +// convert a gps location to quad coordinate +std::string GeoDBImpl::PositionToQuad(const GeoPosition& pos, + int levelOfDetail) { + Pixel p = PositionToPixel(pos, levelOfDetail); + Tile tile = PixelToTile(p); + return TileToQuadKey(tile, levelOfDetail); +} + +GeoPosition GeoDBImpl::displaceLatLon(double lat, double lon, + double deltay, double deltax) { + double dLat = deltay / EarthRadius; + double dLon = deltax / (EarthRadius * cos(radians(lat))); + return GeoPosition(lat + degrees(dLat), + lon + degrees(dLon)); +} + +// +// Return the distance between two positions on the earth +// +double GeoDBImpl::distance(double lat1, double lon1, + double lat2, double lon2) { + double lon = radians(lon2 - lon1); + double lat = radians(lat2 - lat1); + + double a = (sin(lat / 2) * sin(lat / 2)) + + cos(radians(lat1)) * cos(radians(lat2)) * + (sin(lon / 2) * sin(lon / 2)); + double angle = 2 * atan2(sqrt(a), sqrt(1 - a)); + return angle * EarthRadius; +} + +// +// Returns all the quadkeys inside the search range +// +Status GeoDBImpl::searchQuadIds(const GeoPosition& position, + double radius, + std::vector* quadKeys) { + // get the outline of the search square + GeoPosition topLeftPos = boundingTopLeft(position, radius); + GeoPosition bottomRightPos = boundingBottomRight(position, radius); + + Pixel topLeft = PositionToPixel(topLeftPos, Detail); + Pixel 
bottomRight = PositionToPixel(bottomRightPos, Detail); + + // how many level of details to look for + int numberOfTilesAtMaxDepth = floor((bottomRight.x - topLeft.x) / 256); + int zoomLevelsToRise = floor(log(numberOfTilesAtMaxDepth) / log(2)); + zoomLevelsToRise++; + int levels = std::max(0, Detail - zoomLevelsToRise); + + quadKeys->push_back(PositionToQuad(GeoPosition(topLeftPos.latitude, + topLeftPos.longitude), + levels)); + quadKeys->push_back(PositionToQuad(GeoPosition(topLeftPos.latitude, + bottomRightPos.longitude), + levels)); + quadKeys->push_back(PositionToQuad(GeoPosition(bottomRightPos.latitude, + topLeftPos.longitude), + levels)); + quadKeys->push_back(PositionToQuad(GeoPosition(bottomRightPos.latitude, + bottomRightPos.longitude), + levels)); + return Status::OK(); +} + +// Determines the ground resolution (in meters per pixel) at a specified +// latitude and level of detail. +// Latitude (in degrees) at which to measure the ground resolution. +// Level of detail, from 1 (lowest detail) to 23 (highest detail). +// Returns the ground resolution, in meters per pixel. +double GeoDBImpl::GroundResolution(double latitude, int levelOfDetail) { + latitude = clip(latitude, MinLatitude, MaxLatitude); + return cos(latitude * PI / 180) * 2 * PI * EarthRadius / + MapSize(levelOfDetail); +} + +// Converts a point from latitude/longitude WGS-84 coordinates (in degrees) +// into pixel XY coordinates at a specified level of detail. 
+GeoDBImpl::Pixel GeoDBImpl::PositionToPixel(const GeoPosition& pos, + int levelOfDetail) { + double latitude = clip(pos.latitude, MinLatitude, MaxLatitude); + double x = (pos.longitude + 180) / 360; + double sinLatitude = sin(latitude * PI / 180); + double y = 0.5 - log((1 + sinLatitude) / (1 - sinLatitude)) / (4 * PI); + double mapSize = MapSize(levelOfDetail); + double X = floor(clip(x * mapSize + 0.5, 0, mapSize - 1)); + double Y = floor(clip(y * mapSize + 0.5, 0, mapSize - 1)); + return Pixel((unsigned int)X, (unsigned int)Y); +} + +GeoPosition GeoDBImpl::PixelToPosition(const Pixel& pixel, int levelOfDetail) { + double mapSize = MapSize(levelOfDetail); + double x = (clip(pixel.x, 0, mapSize - 1) / mapSize) - 0.5; + double y = 0.5 - (clip(pixel.y, 0, mapSize - 1) / mapSize); + double latitude = 90 - 360 * atan(exp(-y * 2 * PI)) / PI; + double longitude = 360 * x; + return GeoPosition(latitude, longitude); +} + +// Converts a Pixel to a Tile +GeoDBImpl::Tile GeoDBImpl::PixelToTile(const Pixel& pixel) { + unsigned int tileX = floor(pixel.x / 256); + unsigned int tileY = floor(pixel.y / 256); + return Tile(tileX, tileY); +} + +GeoDBImpl::Pixel GeoDBImpl::TileToPixel(const Tile& tile) { + unsigned int pixelX = tile.x * 256; + unsigned int pixelY = tile.y * 256; + return Pixel(pixelX, pixelY); +} + +// Convert a Tile to a quadkey +std::string GeoDBImpl::TileToQuadKey(const Tile& tile, int levelOfDetail) { + std::stringstream quadKey; + for (int i = levelOfDetail; i > 0; i--) { + char digit = '0'; + int mask = 1 << (i - 1); + if ((tile.x & mask) != 0) { + digit++; + } + if ((tile.y & mask) != 0) { + digit++; + digit++; + } + quadKey << digit; + } + return quadKey.str(); +} + +// +// Convert a quadkey to a tile and its level of detail +// +void GeoDBImpl::QuadKeyToTile(std::string quadkey, Tile* tile, + int *levelOfDetail) { + tile->x = tile->y = 0; + *levelOfDetail = quadkey.size(); + const char* key = reinterpret_cast(quadkey.c_str()); + for (int i = 
*levelOfDetail; i > 0; i--) { + int mask = 1 << (i - 1); + switch (key[*levelOfDetail - i]) { + case '0': + break; + + case '1': + tile->x |= mask; + break; + + case '2': + tile->y |= mask; + break; + + case '3': + tile->x |= mask; + tile->y |= mask; + break; + + default: + std::stringstream msg; + msg << quadkey; + msg << " Invalid QuadKey."; + throw std::runtime_error(msg.str()); + } + } +} +} // namespace rocksdb diff --git a/utilities/geodb/geodb_impl.h b/utilities/geodb/geodb_impl.h new file mode 100644 index 00000000..376a211c --- /dev/null +++ b/utilities/geodb/geodb_impl.h @@ -0,0 +1,187 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "utilities/geo_db.h" +#include "utilities/stackable_db.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +// A specific implementation of GeoDB + +class GeoDBImpl : public GeoDB { + public: + GeoDBImpl(DB* db, const GeoDBOptions& options); + ~GeoDBImpl(); + + // Associate the GPS location with the identified by 'id'. The value + // is a blob that is associated with this object. + virtual Status Insert(const GeoObject& object); + + // Retrieve the value of the object located at the specified GPS + // location and is identified by the 'id'. + virtual Status GetByPosition(const GeoPosition& pos, + const Slice& id, + std::string* value); + + // Retrieve the value of the object identified by the 'id'. 
This method + // could be potentially slower than GetByPosition + virtual Status GetById(const Slice& id, GeoObject* object); + + // Delete the specified object + virtual Status Remove(const Slice& id); + + // Returns a list of all items within a circular radius from the + // specified gps location + virtual Status SearchRadial(const GeoPosition& pos, + double radius, + std::vector* values, + int number_of_values); + + private: + DB* db_; + const GeoDBOptions options_; + const WriteOptions woptions_; + const ReadOptions roptions_; + + // The value of PI + static constexpr double PI = 3.141592653589793; + + // convert degrees to radians + static double radians(double x); + + // convert radians to degrees + static double degrees(double x); + + // A pixel class that captures X and Y coordinates + class Pixel { + public: + unsigned int x; + unsigned int y; + Pixel(unsigned int a, unsigned int b) : + x(a), y(b) { + } + }; + + // A Tile in the geoid + class Tile { + public: + unsigned int x; + unsigned int y; + Tile(unsigned int a, unsigned int b) : + x(a), y(b) { + } + }; + + // convert a gps location to quad coordinate + static std::string PositionToQuad(const GeoPosition& pos, int levelOfDetail); + + // arbitrary constant use for WGS84 via + // http://en.wikipedia.org/wiki/World_Geodetic_System + // http://mathforum.org/library/drmath/view/51832.html + // http://msdn.microsoft.com/en-us/library/bb259689.aspx + // http://www.tuicool.com/articles/NBrE73 + // + const int Detail = 23; + static constexpr double EarthRadius = 6378137; + static constexpr double MinLatitude = -85.05112878; + static constexpr double MaxLatitude = 85.05112878; + static constexpr double MinLongitude = -180; + static constexpr double MaxLongitude = 180; + + // clips a number to the specified minimum and maximum values. 
+ static double clip(double n, double minValue, double maxValue) { + return fmin(fmax(n, minValue), maxValue); + } + + // Determines the map width and height (in pixels) at a specified level + // of detail, from 1 (lowest detail) to 23 (highest detail). + // Returns the map width and height in pixels. + static unsigned int MapSize(int levelOfDetail) { + return (unsigned int)(256 << levelOfDetail); + } + + // Determines the ground resolution (in meters per pixel) at a specified + // latitude and level of detail. + // Latitude (in degrees) at which to measure the ground resolution. + // Level of detail, from 1 (lowest detail) to 23 (highest detail). + // Returns the ground resolution, in meters per pixel. + static double GroundResolution(double latitude, int levelOfDetail); + + // Converts a point from latitude/longitude WGS-84 coordinates (in degrees) + // into pixel XY coordinates at a specified level of detail. + static Pixel PositionToPixel(const GeoPosition& pos, int levelOfDetail); + + static GeoPosition PixelToPosition(const Pixel& pixel, int levelOfDetail); + + // Converts a Pixel to a Tile + static Tile PixelToTile(const Pixel& pixel); + + static Pixel TileToPixel(const Tile& tile); + + // Convert a Tile to a quadkey + static std::string TileToQuadKey(const Tile& tile, int levelOfDetail); + + // Convert a quadkey to a tile and its level of detail + static void QuadKeyToTile(std::string quadkey, Tile* tile, + int *levelOfDetail); + + // Return the distance between two positions on the earth + static double distance(double lat1, double lon1, + double lat2, double lon2); + static GeoPosition displaceLatLon(double lat, double lon, + double deltay, double deltax); + + // + // Returns the top left position after applying the delta to + // the specified position + // + static GeoPosition boundingTopLeft(const GeoPosition& in, double radius) { + return displaceLatLon(in.latitude, in.longitude, -radius, -radius); + } + + // + // Returns the bottom right position 
after applying the delta to + // the specified position + static GeoPosition boundingBottomRight(const GeoPosition& in, + double radius) { + return displaceLatLon(in.latitude, in.longitude, radius, radius); + } + + // + // Get all quadkeys within a radius of a specified position + // + Status searchQuadIds(const GeoPosition& position, + double radius, + std::vector* quadKeys); + + // splits a string into its components + static void StringSplit(std::vector* tokens, + const std::string &text, + char sep); + + // + // Create keys for accessing rocksdb table(s) + // + static std::string MakeKey1(const GeoPosition& pos, + Slice id, + std::string quadkey); + static std::string MakeKey2(Slice id); + static std::string MakeKey1Prefix(std::string quadkey, + Slice id); + static std::string MakeQuadKeyPrefix(std::string quadkey); +}; + +} // namespace rocksdb diff --git a/utilities/geodb/geodb_test.cc b/utilities/geodb/geodb_test.cc new file mode 100644 index 00000000..49e72d9d --- /dev/null +++ b/utilities/geodb/geodb_test.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// +#include "utilities/geodb/geodb_impl.h" + +#include +#include "util/testharness.h" + +namespace rocksdb { + +class GeoDBTest { + public: + static const std::string kDefaultDbName; + static Options options; + DB* db; + GeoDB* geodb; + + GeoDBTest() { + GeoDBOptions geodb_options; + ASSERT_OK(DestroyDB(kDefaultDbName, options)); + options.create_if_missing = true; + Status status = DB::Open(options, kDefaultDbName, &db); + geodb = new GeoDBImpl(db, geodb_options); + } + + ~GeoDBTest() { + delete geodb; + } + + GeoDB* getdb() { + return geodb; + } +}; + +const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault/"; +Options GeoDBTest::options = Options(); + +// Insert, Get and Remove +TEST(GeoDBTest, SimpleTest) { + GeoPosition pos1(100, 101); + std::string id1("id1"); + std::string value1("value1"); + + // insert first object into database + GeoObject obj1(pos1, id1, value1); + Status status = getdb()->Insert(obj1); + ASSERT_TRUE(status.ok()); + + // insert second object into database + GeoPosition pos2(200, 201); + std::string id2("id2"); + std::string value2 = "value2"; + GeoObject obj2(pos2, id2, value2); + status = getdb()->Insert(obj2); + ASSERT_TRUE(status.ok()); + + // retrieve first object using position + std::string value; + status = getdb()->GetByPosition(pos1, Slice(id1), &value); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(value, value1); + + // retrieve first object using id + GeoObject obj; + status = getdb()->GetById(Slice(id1), &obj); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(obj.position.latitude, 100); + ASSERT_EQ(obj.position.longitude, 101); + ASSERT_EQ(obj.id.compare(id1), 0); + ASSERT_EQ(obj.value, value1); + + // delete first object + status = getdb()->Remove(Slice(id1)); + ASSERT_TRUE(status.ok()); + status = getdb()->GetByPosition(pos1, Slice(id1), &value); + ASSERT_TRUE(status.IsNotFound()); + status = getdb()->GetById(id1, &obj); + ASSERT_TRUE(status.IsNotFound()); + + // check that we can still find second object + status = 
getdb()->GetByPosition(pos2, id2, &value); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(value, value2); + status = getdb()->GetById(id2, &obj); + ASSERT_TRUE(status.ok()); +} + +// Search. +// Verify distances via http://www.stevemorse.org/nearest/distance.php +TEST(GeoDBTest, Search) { + GeoPosition pos1(45, 45); + std::string id1("mid1"); + std::string value1 = "midvalue1"; + + // insert object at 45 degree latitude + GeoObject obj1(pos1, id1, value1); + Status status = getdb()->Insert(obj1); + ASSERT_TRUE(status.ok()); + + // search all objects centered at 46 degree latitude with + // a radius of 200 kilometers. We should find the one object that + // we inserted earlier. + std::vector values; + status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(values.size(), 1U); + + // search all objects centered at 46 degree latitude with + // a radius of 2 kilometers. There should be none. + values.clear(); + status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values); + ASSERT_TRUE(status.ok()); + ASSERT_EQ(values.size(), 0U); +} + +} // namespace rocksdb + +int main(int argc, char* argv[]) { + return rocksdb::test::RunAllTests(); +} diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h new file mode 100644 index 00000000..fdf06645 --- /dev/null +++ b/utilities/merge_operators.h @@ -0,0 +1,45 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#ifndef MERGE_OPERATORS_H +#define MERGE_OPERATORS_H + +#include +#include + +#include "rocksdb/merge_operator.h" + +namespace rocksdb { + +class MergeOperators { + public: + static std::shared_ptr CreatePutOperator(); + static std::shared_ptr CreateUInt64AddOperator(); + static std::shared_ptr CreateStringAppendOperator(); + static std::shared_ptr CreateStringAppendTESTOperator(); + + // Will return a different merge operator depending on the string. + // TODO: Hook the "name" up to the actual Name() of the MergeOperators? + static std::shared_ptr CreateFromStringId( + const std::string& name) { + if (name == "put") { + return CreatePutOperator(); + } else if ( name == "uint64add") { + return CreateUInt64AddOperator(); + } else if (name == "stringappend") { + return CreateStringAppendOperator(); + } else if (name == "stringappendtest") { + return CreateStringAppendTESTOperator(); + } else { + // Empty or unknown, just return nullptr + return nullptr; + } + } + +}; + +} // namespace rocksdb + +#endif diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc new file mode 100644 index 00000000..33308431 --- /dev/null +++ b/utilities/merge_operators/put.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include "rocksdb/slice.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +using namespace rocksdb; + +namespace { // anonymous namespace + +// A merge operator that mimics Put semantics +// Since this merge-operator will not be used in production, +// it is implemented as a non-associative merge operator to illustrate the +// new interface and for testing purposes. 
(That is, we inherit from +// the MergeOperator class rather than the AssociativeMergeOperator +// which would be simpler in this case). +// +// From the client-perspective, semantics are the same. +class PutOperator : public MergeOperator { + public: + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_sequence, + std::string* new_value, + Logger* logger) const override { + // Put basically only looks at the current/latest value + assert(!operand_sequence.empty()); + assert(new_value != nullptr); + new_value->assign(operand_sequence.back()); + return true; + } + + virtual bool PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const override { + new_value->assign(right_operand.data(), right_operand.size()); + return true; + } + + using MergeOperator::PartialMergeMulti; + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const + override { + new_value->assign(operand_list.back().data(), operand_list.back().size()); + return true; + } + + virtual const char* Name() const override { + return "PutOperator"; + } +}; + +} // end of anonymous namespace + +namespace rocksdb { + +std::shared_ptr MergeOperators::CreatePutOperator() { + return std::make_shared(); +} + +} diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc new file mode 100644 index 00000000..38cd22eb --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend.cc @@ -0,0 +1,60 @@ +/** + * A MergeOperator for rocksdb that implements string append. 
+ * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend.h" + +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +// Constructor: also specify the delimiter character. +StringAppendOperator::StringAppendOperator(char delim_char) + : delim_(delim_char) { +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendOperator::Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const { + + // Clear the *new_value for writing. + assert(new_value); + new_value->clear(); + + if (!existing_value) { + // No existing_value. Set *new_value = value + new_value->assign(value.data(),value.size()); + } else { + // Generic append (existing_value != null). + // Reserve *new_value to correct size, and apply concatenation. + new_value->reserve(existing_value->size() + 1 + value.size()); + new_value->assign(existing_value->data(),existing_value->size()); + new_value->append(1,delim_); + new_value->append(value.data(), value.size()); + } + + return true; +} + +const char* StringAppendOperator::Name() const { + return "StringAppendOperator"; +} + +std::shared_ptr MergeOperators::CreateStringAppendOperator() { + return std::make_shared(','); +} + +} // namespace rocksdb + + + diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h new file mode 100644 index 00000000..ca5b97ec --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend.h @@ -0,0 +1,31 @@ +/** + * A MergeOperator for rocksdb that implements string append. 
+ * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class StringAppendOperator : public AssociativeMergeOperator { + public: + StringAppendOperator(char delim_char); /// Constructor: specify delimiter + + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override; + + virtual const char* Name() const override; + + private: + char delim_; // The delimiter is inserted between elements + +}; + +} // namespace rocksdb + diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc new file mode 100644 index 00000000..b2e03588 --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend2.cc @@ -0,0 +1,113 @@ +/** + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "stringappend2.h" + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +// Constructor: also specify the delimiter character. +StringAppendTESTOperator::StringAppendTESTOperator(char delim_char) + : delim_(delim_char) { +} + +// Implementation for the merge operation (concatenates two strings) +bool StringAppendTESTOperator::FullMerge( + const Slice& key, + const Slice* existing_value, + const std::deque& operands, + std::string* new_value, + Logger* logger) const { + + // Clear the *new_value for writing. + assert(new_value); + new_value->clear(); + + // Compute the space needed for the final result. 
+ int numBytes = 0; + for(auto it = operands.begin(); it != operands.end(); ++it) { + numBytes += it->size() + 1; // Plus 1 for the delimiter + } + + // Only print the delimiter after the first entry has been printed + bool printDelim = false; + + // Prepend the *existing_value if one exists. + if (existing_value) { + new_value->reserve(numBytes + existing_value->size()); + new_value->append(existing_value->data(), existing_value->size()); + printDelim = true; + } else if (numBytes) { + new_value->reserve(numBytes-1); // Minus 1 since we have one less delimiter + } + + // Concatenate the sequence of strings (and add a delimiter between each) + for(auto it = operands.begin(); it != operands.end(); ++it) { + if (printDelim) { + new_value->append(1,delim_); + } + new_value->append(*it); + printDelim = true; + } + + return true; +} + +bool StringAppendTESTOperator::PartialMergeMulti( + const Slice& key, const std::deque& operand_list, + std::string* new_value, Logger* logger) const { + return false; +} + +// A version of PartialMerge that actually performs "partial merging". +// Use this to simulate the exact behaviour of the StringAppendOperator. +bool StringAppendTESTOperator::_AssocPartialMergeMulti( + const Slice& key, const std::deque& operand_list, + std::string* new_value, Logger* logger) const { + // Clear the *new_value for writing + assert(new_value); + new_value->clear(); + assert(operand_list.size() >= 2); + + // Generic append + // Determine and reserve correct size for *new_value. 
+ size_t size = 0; + for (const auto& operand : operand_list) { + size += operand.size(); + } + size += operand_list.size() - 1; // Delimiters + new_value->reserve(size); + + // Apply concatenation + new_value->assign(operand_list.front().data(), operand_list.front().size()); + + for (std::deque::const_iterator it = operand_list.begin() + 1; + it != operand_list.end(); ++it) { + new_value->append(1, delim_); + new_value->append(it->data(), it->size()); + } + + return true; +} + +const char* StringAppendTESTOperator::Name() const { + return "StringAppendTESTOperator"; +} + + +std::shared_ptr +MergeOperators::CreateStringAppendTESTOperator() { + return std::make_shared(','); +} + +} // namespace rocksdb + diff --git a/utilities/merge_operators/string_append/stringappend2.h b/utilities/merge_operators/string_append/stringappend2.h new file mode 100644 index 00000000..5e506ef8 --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend2.h @@ -0,0 +1,51 @@ +/** + * A TEST MergeOperator for rocksdb that implements string append. + * It is built using the MergeOperator interface rather than the simpler + * AssociativeMergeOperator interface. This is useful for testing/benchmarking. + * While the two operators are semantically the same, all production code + * should use the StringAppendOperator defined in stringappend.{h,cc}. The + * operator defined in the present file is primarily for testing. 
+ * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include +#include + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class StringAppendTESTOperator : public MergeOperator { + public: + // Constructor with delimiter + explicit StringAppendTESTOperator(char delim_char); + + virtual bool FullMerge(const Slice& key, + const Slice* existing_value, + const std::deque& operand_sequence, + std::string* new_value, + Logger* logger) const override; + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const + override; + + virtual const char* Name() const override; + + private: + // A version of PartialMerge that actually performs "partial merging". + // Use this to simulate the exact behaviour of the StringAppendOperator. + bool _AssocPartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const; + + char delim_; // The delimiter is inserted between elements + +}; + +} // namespace rocksdb diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc new file mode 100644 index 00000000..81af6462 --- /dev/null +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -0,0 +1,593 @@ +/** + * An persistent map : key -> (list of strings), using rocksdb merge. + * This file is a test-harness / use-case for the StringAppendOperator. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook, Inc. 
+*/ + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" +#include "utilities/ttl/db_ttl.h" +#include "util/testharness.h" +#include "util/random.h" + +using namespace rocksdb; + +namespace rocksdb { + +// Path to the database on file system +const std::string kDbName = "/tmp/mergetestdb"; + +// OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator +std::shared_ptr OpenNormalDb(char delim_char) { + DB* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new StringAppendOperator(delim_char)); + ASSERT_OK(DB::Open(options, kDbName, &db)); + return std::shared_ptr(db); +} + +// Open a TtlDB with a non-associative StringAppendTESTOperator +std::shared_ptr OpenTtlDb(char delim_char) { + StackableDB* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new StringAppendTESTOperator(delim_char)); + ASSERT_OK(UtilityDB::OpenTtlDB(options, kDbName, &db, 123456)); + return std::shared_ptr(db); +} + +/// StringLists represents a set of string-lists, each with a key-index. 
+/// Supports Append(list, string) and Get(list) +class StringLists { + public: + + //Constructor: specifies the rocksdb db + /* implicit */ + StringLists(std::shared_ptr db) + : db_(db), + merge_option_(), + get_option_() { + assert(db); + } + + // Append string val onto the list defined by key; return true on success + bool Append(const std::string& key, const std::string& val){ + Slice valSlice(val.data(), val.size()); + auto s = db_->Merge(merge_option_, key, valSlice); + + if (s.ok()) { + return true; + } else { + std::cerr << "ERROR " << s.ToString() << std::endl; + return false; + } + } + + // Returns the list of strings associated with key (or "" if does not exist) + bool Get(const std::string& key, std::string* const result){ + assert(result != nullptr); // we should have a place to store the result + auto s = db_->Get(get_option_, key, result); + + if (s.ok()) { + return true; + } + + // Either key does not exist, or there is some error. + *result = ""; // Always return empty string (just for convention) + + //NotFound is okay; just return empty (similar to std::map) + //But network or db errors, etc, should fail the test (or at least yell) + if (!s.IsNotFound()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + } + + // Always return false if s.ok() was not true + return false; + } + + + private: + std::shared_ptr db_; + WriteOptions merge_option_; + ReadOptions get_option_; + +}; + + +// The class for unit-testing +class StringAppendOperatorTest { + public: + StringAppendOperatorTest() { + DestroyDB(kDbName, Options()); // Start each test with a fresh DB + } + + typedef std::shared_ptr (* OpenFuncPtr)(char); + + // Allows user to open databases with different configurations. + // e.g.: Can open a DB or a TtlDB, etc. 
+ static void SetOpenDbFunction(OpenFuncPtr func) { + OpenDb = func; + } + + protected: + static OpenFuncPtr OpenDb; +}; +StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = nullptr; + +// THE TEST CASES BEGIN HERE + +TEST(StringAppendOperatorTest, IteratorTest) { + auto db_ = OpenDb(','); + StringLists slists(db_); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + slists.Append("k2", "a1"); + slists.Append("k2", "a2"); + slists.Append("k2", "a3"); + + std::string res; + std::unique_ptr it(db_->NewIterator(ReadOptions())); + std::string k1("k1"); + std::string k2("k2"); + bool first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + slists.Append("k2", "a4"); + slists.Append("k1", "v4"); + + // Snapshot should still be the same. Should ignore a4 and v4. + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3"); + } + } + + + // Should release the snapshot and be aware of the new stuff now + it.reset(db_->NewIterator(ReadOptions())); + first = true; + for (it->Seek(k1); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + // start from k2 this time. 
+ for (it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "v1,v2,v3,v4"); + first = false; + } else { + ASSERT_EQ(res, "a1,a2,a3,a4"); + } + } + + slists.Append("k3", "g1"); + + it.reset(db_->NewIterator(ReadOptions())); + first = true; + std::string k3("k3"); + for(it->Seek(k2); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } + for(it->Seek(k3); it->Valid(); it->Next()) { + res = it->value().ToString(); + if (first) { + // should not be hit + ASSERT_EQ(res, "a1,a2,a3,a4"); + first = false; + } else { + ASSERT_EQ(res, "g1"); + } + } + +} + +TEST(StringAppendOperatorTest, SimpleTest) { + auto db = OpenDb(','); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + bool status = slists.Get("k1", &res); + + ASSERT_TRUE(status); + ASSERT_EQ(res, "v1,v2,v3"); +} + +TEST(StringAppendOperatorTest, SimpleDelimiterTest) { + auto db = OpenDb('|'); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + slists.Get("k1", &res); + ASSERT_EQ(res, "v1|v2|v3"); +} + +TEST(StringAppendOperatorTest, OneValueNoDelimiterTest) { + auto db = OpenDb('!'); + StringLists slists(db); + + slists.Append("random_key", "single_val"); + + std::string res; + slists.Get("random_key", &res); + ASSERT_EQ(res, "single_val"); +} + +TEST(StringAppendOperatorTest, VariousKeys) { + auto db = OpenDb('\n'); + StringLists slists(db); + + slists.Append("c", "asdasd"); + slists.Append("a", "x"); + slists.Append("b", "y"); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + slists.Append("c", "asdasd"); + + std::string a, b, c; + bool sa, sb, sc; + sa = slists.Get("a", &a); + sb = slists.Get("b", &b); + sc = slists.Get("c", &c); + + ASSERT_TRUE(sa 
&& sb && sc); // All three keys should have been found + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2"); + ASSERT_EQ(c, "asdasd\nasdasd"); +} + +// Generate semi random keys/words from a small distribution. +TEST(StringAppendOperatorTest, RandomMixGetAppend) { + auto db = OpenDb(' '); + StringLists slists(db); + + // Generate a list of random keys and values + const int kWordCount = 15; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839", + "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89", + "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + const int kKeyCount = 6; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki", + "shzassdianmd"}; + + // Will store a local copy of all data in order to verify correctness + std::map parallel_copy; + + // Generate a bunch of random queries (Append and Get)! + enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(1337); //deterministic seed; always get same results! + + const int kNumQueries = 30; + for (int q=0; q 0) { + parallel_copy[key] += " " + word; + } else { + parallel_copy[key] = word; + } + + } else if (query == GET_OP) { + // Assumes that a non-existent key just returns + std::string res; + slists.Get(key, &res); + ASSERT_EQ(res, parallel_copy[key]); + } + + } + +} + +TEST(StringAppendOperatorTest, BIGRandomMixGetAppend) { + auto db = OpenDb(' '); + StringLists slists(db); + + // Generate a list of random keys and values + const int kWordCount = 15; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839", + "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89", + "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + const int kKeyCount = 6; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki", + "shzassdianmd"}; + + // Will store a local copy of all data in order to verify correctness + std::map parallel_copy; + + // Generate a bunch of random queries (Append and Get)! 
+ enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(9138204); // deterministic seed + + const int kNumQueries = 1000; + for (int q=0; q 0) { + parallel_copy[key] += " " + word; + } else { + parallel_copy[key] = word; + } + + } else if (query == GET_OP) { + // Assumes that a non-existent key just returns + std::string res; + slists.Get(key, &res); + ASSERT_EQ(res, parallel_copy[key]); + } + + } + +} + + +TEST(StringAppendOperatorTest, PersistentVariousKeys) { + // Perform the following operations in limited scope + { + auto db = OpenDb('\n'); + StringLists slists(db); + + slists.Append("c", "asdasd"); + slists.Append("a", "x"); + slists.Append("b", "y"); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + slists.Append("c", "asdasd"); + + std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2"); + ASSERT_EQ(c, "asdasd\nasdasd"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + + slists.Append("c", "bbnagnagsx"); + slists.Append("a", "sa"); + slists.Append("b", "df"); + slists.Append("a", "gh"); + slists.Append("a", "jk"); + slists.Append("b", "l;"); + slists.Append("c", "rogosh"); + + // The previous changes should be on disk (L0) + // The most recent changes should be in memory (MemTable) + // Hence, this will test both Get() paths. + std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + + // All changes should be on disk. 
This will test VersionSet Get() + std::string a, b, c; + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + } +} + +TEST(StringAppendOperatorTest, PersistentFlushAndCompaction) { + // Perform the following operations in limited scope + { + auto db = OpenDb('\n'); + StringLists slists(db); + std::string a, b, c; + bool success; + + // Append, Flush, Get + slists.Append("c", "asdasd"); + db->Flush(rocksdb::FlushOptions()); + success = slists.Get("c", &c); + ASSERT_TRUE(success); + ASSERT_EQ(c, "asdasd"); + + // Append, Flush, Append, Get + slists.Append("a", "x"); + slists.Append("b", "y"); + db->Flush(rocksdb::FlushOptions()); + slists.Append("a", "t"); + slists.Append("a", "r"); + slists.Append("b", "2"); + + success = slists.Get("a", &a); + assert(success == true); + ASSERT_EQ(a, "x\nt\nr"); + + success = slists.Get("b", &b); + assert(success == true); + ASSERT_EQ(b, "y\n2"); + + // Append, Get + success = slists.Append("c", "asdasd"); + assert(success); + success = slists.Append("b", "monkey"); + assert(success); + + // I omit the "assert(success)" checks here. 
+ slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + + ASSERT_EQ(a, "x\nt\nr"); + ASSERT_EQ(b, "y\n2\nmonkey"); + ASSERT_EQ(c, "asdasd\nasdasd"); + } + + // Reopen the database (the previous changes should persist / be remembered) + { + auto db = OpenDb('\n'); + StringLists slists(db); + std::string a, b, c; + + // Get (Quick check for persistence of previous database) + slists.Get("a", &a); + ASSERT_EQ(a, "x\nt\nr"); + + //Append, Compact, Get + slists.Append("c", "bbnagnagsx"); + slists.Append("a", "sa"); + slists.Append("b", "df"); + db->CompactRange(nullptr, nullptr); + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + ASSERT_EQ(a, "x\nt\nr\nsa"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx"); + + // Append, Get + slists.Append("a", "gh"); + slists.Append("a", "jk"); + slists.Append("b", "l;"); + slists.Append("c", "rogosh"); + slists.Get("a", &a); + slists.Get("b", &b); + slists.Get("c", &c); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Compact, Get + db->CompactRange(nullptr, nullptr); + ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk"); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;"); + ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh"); + + // Append, Flush, Compact, Get + slists.Append("b", "afcg"); + db->Flush(rocksdb::FlushOptions()); + db->CompactRange(nullptr, nullptr); + slists.Get("b", &b); + ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg"); + } +} + +TEST(StringAppendOperatorTest, SimpleTestNullDelimiter) { + auto db = OpenDb('\0'); + StringLists slists(db); + + slists.Append("k1", "v1"); + slists.Append("k1", "v2"); + slists.Append("k1", "v3"); + + std::string res; + bool status = slists.Get("k1", &res); + ASSERT_TRUE(status); + + // Construct the desired string. Default constructor doesn't like '\0' chars. + std::string checker("v1,v2,v3"); // Verify that the string is right size. 
+ checker[2] = '\0'; // Use null delimiter instead of comma. + checker[5] = '\0'; + assert(checker.size() == 8); // Verify it is still the correct size + + // Check that the rocksdb result string matches the desired string + assert(res.size() == checker.size()); + ASSERT_EQ(res, checker); +} + +} // namespace rocksdb + +int main(int arc, char** argv) { + // Run with regular database + { + fprintf(stderr, "Running tests with regular db and operator.\n"); + StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb); + rocksdb::test::RunAllTests(); + } + + // Run with TTL + { + fprintf(stderr, "Running tests with ttl db and generic operator.\n"); + StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb); + rocksdb::test::RunAllTests(); + } + + return 0; +} diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc new file mode 100644 index 00000000..9d78651e --- /dev/null +++ b/utilities/merge_operators/uint64add.cc @@ -0,0 +1,65 @@ +#include +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "utilities/merge_operators.h" + +using namespace rocksdb; + +namespace { // anonymous namespace + +// A 'model' merge operator with uint64 addition semantics +// Implemented as an AssociativeMergeOperator for simplicity and example. 
+class UInt64AddOperator : public AssociativeMergeOperator { + public: + virtual bool Merge(const Slice& key, + const Slice* existing_value, + const Slice& value, + std::string* new_value, + Logger* logger) const override { + uint64_t orig_value = 0; + if (existing_value){ + orig_value = DecodeInteger(*existing_value, logger); + } + uint64_t operand = DecodeInteger(value, logger); + + assert(new_value); + new_value->clear(); + PutFixed64(new_value, orig_value + operand); + + return true; // Return true always since corruption will be treated as 0 + } + + virtual const char* Name() const override { + return "UInt64AddOperator"; + } + + private: + // Takes the string and decodes it into a uint64_t + // On error, prints a message and returns 0 + uint64_t DecodeInteger(const Slice& value, Logger* logger) const { + uint64_t result = 0; + + if (value.size() == sizeof(uint64_t)) { + result = DecodeFixed64(value.data()); + } else if (logger != nullptr) { + // If value is corrupted, treat it as 0 + Log(logger, "uint64 value corruption, size: %zu > %zu", + value.size(), sizeof(uint64_t)); + } + + return result; + } + +}; + +} + +namespace rocksdb { + +std::shared_ptr MergeOperators::CreateUInt64AddOperator() { + return std::make_shared(); +} + +} diff --git a/utilities/redis/README b/utilities/redis/README new file mode 100644 index 00000000..8b17bc05 --- /dev/null +++ b/utilities/redis/README @@ -0,0 +1,14 @@ +This folder defines a REDIS-style interface for Rocksdb. +Right now it is written as a simple tag-on in the rocksdb::RedisLists class. +It implements Redis Lists, and supports only the "non-blocking operations". + +Internally, the set of lists are stored in a rocksdb database, mapping keys to +values. Each "value" is the list itself, storing a sequence of "elements". +Each element is stored as a 32-bit-integer, followed by a sequence of bytes. +The 32-bit-integer represents the length of the element (that is, the number +of bytes that follow). 
And then that many bytes follow. + + +NOTE: This README file may be old. See the actual redis_lists.cc file for +definitive details on the implementation. There should be a header at the top +of that file, explaining a bit of the implementation details. diff --git a/utilities/redis/redis_list_exception.h b/utilities/redis/redis_list_exception.h new file mode 100644 index 00000000..d409095a --- /dev/null +++ b/utilities/redis/redis_list_exception.h @@ -0,0 +1,20 @@ +/** + * A simple structure for exceptions in RedisLists. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once +#include + +namespace rocksdb { + +class RedisListException: public std::exception { + public: + const char* what() const throw() { + return "Invalid operation or corrupt data in Redis List."; + } +}; + +} // namespace rocksdb diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h new file mode 100644 index 00000000..d57f8ac9 --- /dev/null +++ b/utilities/redis/redis_list_iterator.h @@ -0,0 +1,308 @@ +/** + * RedisListIterator: + * An abstraction over the "list" concept (e.g.: for redis lists). + * Provides functionality to read, traverse, edit, and write these lists. + * + * Upon construction, the RedisListIterator is given a block of list data. + * Internally, it stores a pointer to the data and a pointer to current item. + * It also stores a "result" list that will be mutated over time. + * + * Traversal and mutation are done by "forward iteration". + * The Push() and Skip() methods will advance the iterator to the next item. + * However, Push() will also "write the current item to the result". + * Skip() will simply move to next item, causing current item to be dropped. + * + * Upon completion, the result (accessible by WriteResult()) will be saved. + * All "skipped" items will be gone; all "pushed" items will remain. 
+ * + * @throws Any of the operations may throw a RedisListException if an invalid + * operation is performed or if the data is found to be corrupt. + * + * @notes By default, if WriteResult() is called part-way through iteration, + * it will automatically advance the iterator to the end, and Keep() + * all items that haven't been traversed yet. This may be subject + * to review. + * + * @notes Can access the "current" item via GetCurrent(), and other + * list-specific information such as Length(). + * + * @notes The internal representation is due to change at any time. Presently, + * the list is represented as follows: + * - 32-bit integer header: the number of items in the list + * - For each item: + * - 32-bit int (n): the number of bytes representing this item + * - n bytes of data: the actual data. + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once + +#include + +#include "redis_list_exception.h" +#include "rocksdb/slice.h" +#include "util/coding.h" + +namespace rocksdb { + +/// An abstraction over the "list" concept. +/// All operations may throw a RedisListException +class RedisListIterator { + public: + /// Construct a redis-list-iterator based on data. + /// If the data is non-empty, it must formatted according to @notes above. + /// + /// If the data is valid, we can assume the following invariant(s): + /// a) length_, num_bytes_ are set correctly. + /// b) cur_byte_ always refers to the start of the current element, + /// just before the bytes that specify element length. + /// c) cur_elem_ is always the index of the current element. + /// d) cur_elem_length_ is always the number of bytes in current element, + /// excluding the 4-byte header itself. + /// e) result_ will always contain data_[0..cur_byte_) and a header + /// f) Whenever corrupt data is encountered or an invalid operation is + /// attempted, a RedisListException will immediately be thrown. 
+ RedisListIterator(const std::string& list_data) + : data_(list_data.data()), + num_bytes_(list_data.size()), + cur_byte_(0), + cur_elem_(0), + cur_elem_length_(0), + length_(0), + result_() { + + // Initialize the result_ (reserve enough space for header) + InitializeResult(); + + // Parse the data only if it is not empty. + if (num_bytes_ == 0) { + return; + } + + // If non-empty, but less than 4 bytes, data must be corrupt + if (num_bytes_ < sizeof(length_)) { + ThrowError("Corrupt header."); // Will break control flow + } + + // Good. The first bytes specify the number of elements + length_ = DecodeFixed32(data_); + cur_byte_ = sizeof(length_); + + // If we have at least one element, point to that element. + // Also, read the first integer of the element (specifying the size), + // if possible. + if (length_ > 0) { + if (cur_byte_ + sizeof(cur_elem_length_) <= num_bytes_) { + cur_elem_length_ = DecodeFixed32(data_+cur_byte_); + } else { + ThrowError("Corrupt data for first element."); + } + } + + // At this point, we are fully set-up. + // The invariants described in the header should now be true. + } + + /// Reserve some space for the result_. + /// Equivalent to result_.reserve(bytes). + void Reserve(int bytes) { + result_.reserve(bytes); + } + + /// Go to next element in data file. + /// Also writes the current element to result_. + RedisListIterator& Push() { + WriteCurrentElement(); + MoveNext(); + return *this; + } + + /// Go to next element in data file. + /// Drops/skips the current element. It will not be written to result_. + RedisListIterator& Skip() { + MoveNext(); + --length_; // One less item + --cur_elem_; // We moved one forward, but index did not change + return *this; + } + + /// Insert elem into the result_ (just BEFORE the current element / byte) + /// Note: if Done() (i.e.: iterator points to end), this will append elem. 
+ void InsertElement(const Slice& elem) { + // Ensure we are in a valid state + CheckErrors(); + + const int kOrigSize = result_.size(); + result_.resize(kOrigSize + SizeOf(elem)); + EncodeFixed32(result_.data() + kOrigSize, elem.size()); + memcpy(result_.data() + kOrigSize + sizeof(uint32_t), + elem.data(), + elem.size()); + ++length_; + ++cur_elem_; + } + + /// Access the current element, and save the result into *curElem + void GetCurrent(Slice* curElem) { + // Ensure we are in a valid state + CheckErrors(); + + // Ensure that we are not past the last element. + if (Done()) { + ThrowError("Invalid dereferencing."); + } + + // Dereference the element + *curElem = Slice(data_+cur_byte_+sizeof(cur_elem_length_), + cur_elem_length_); + } + + // Number of elements + int Length() const { + return length_; + } + + // Number of bytes in the final representation (i.e: WriteResult().size()) + int Size() const { + // result_ holds the currently written data + // data_[cur_byte..num_bytes-1] is the remainder of the data + return result_.size() + (num_bytes_ - cur_byte_); + } + + // Reached the end? + bool Done() const { + return cur_byte_ >= num_bytes_ || cur_elem_ >= length_; + } + + /// Returns a string representing the final, edited, data. + /// Assumes that all bytes of data_ in the range [0,cur_byte_) have been read + /// and that result_ contains this data. + /// The rest of the data must still be written. + /// So, this method ADVANCES THE ITERATOR TO THE END before writing. + Slice WriteResult() { + CheckErrors(); + + // The header should currently be filled with dummy data (0's) + // Correctly update the header. + // Note, this is safe since result_ is a vector (guaranteed contiguous) + EncodeFixed32(&result_[0],length_); + + // Append the remainder of the data to the result. 
+ result_.insert(result_.end(),data_+cur_byte_, data_ +num_bytes_); + + // Seek to end of file + cur_byte_ = num_bytes_; + cur_elem_ = length_; + cur_elem_length_ = 0; + + // Return the result + return Slice(result_.data(),result_.size()); + } + + public: // Static public functions + + /// An upper-bound on the amount of bytes needed to store this element. + /// This is used to hide representation information from the client. + /// E.G. This can be used to compute the bytes we want to Reserve(). + static uint32_t SizeOf(const Slice& elem) { + // [Integer Length . Data] + return sizeof(uint32_t) + elem.size(); + } + + private: // Private functions + + /// Initializes the result_ string. + /// It will fill the first few bytes with 0's so that there is + /// enough space for header information when we need to write later. + /// Currently, "header information" means: the length (number of elements) + /// Assumes that result_ is empty to begin with + void InitializeResult() { + assert(result_.empty()); // Should always be true. + result_.resize(sizeof(uint32_t),0); // Put a block of 0's as the header + } + + /// Go to the next element (used in Push() and Skip()) + void MoveNext() { + CheckErrors(); + + // Check to make sure we are not already in a finished state + if (Done()) { + ThrowError("Attempting to iterate past end of list."); + } + + // Move forward one element. + cur_byte_ += sizeof(cur_elem_length_) + cur_elem_length_; + ++cur_elem_; + + // If we are at the end, finish + if (Done()) { + cur_elem_length_ = 0; + return; + } + + // Otherwise, we should be able to read the new element's length + if (cur_byte_ + sizeof(cur_elem_length_) > num_bytes_) { + ThrowError("Corrupt element data."); + } + + // Set the new element's length + cur_elem_length_ = DecodeFixed32(data_+cur_byte_); + + return; + } + + /// Append the current element (pointed to by cur_byte_) to result_ + /// Assumes result_ has already been reserved appropriately. 
+ void WriteCurrentElement() { + // First verify that the iterator is still valid. + CheckErrors(); + if (Done()) { + ThrowError("Attempting to write invalid element."); + } + + // Append the cur element. + result_.insert(result_.end(), + data_+cur_byte_, + data_+cur_byte_+ sizeof(uint32_t) + cur_elem_length_); + } + + /// Will ThrowError() if necessary. + /// Checks for common/ubiquitous errors that can arise after most operations. + /// This method should be called before any reading operation. + /// If this function succeeds, then we are guaranteed to be in a valid state. + /// Other member functions should check for errors and ThrowError() also + /// if an error occurs that is specific to it even while in a valid state. + void CheckErrors() { + // Check if any crazy thing has happened recently + if ((cur_elem_ > length_) || // Bad index + (cur_byte_ > num_bytes_) || // No more bytes + (cur_byte_ + cur_elem_length_ > num_bytes_) || // Item too large + (cur_byte_ == num_bytes_ && cur_elem_ != length_) || // Too many items + (cur_elem_ == length_ && cur_byte_ != num_bytes_)) { // Too many bytes + ThrowError("Corrupt data."); + } + } + + /// Will throw an exception based on the passed-in message. + /// This function is guaranteed to STOP THE CONTROL-FLOW. + /// (i.e.: you do not have to call "return" after calling ThrowError) + void ThrowError(const char* const msg = NULL) { + // TODO: For now we ignore the msg parameter. This can be expanded later. 
+ throw RedisListException(); + } + + private: + const char* const data_; // A pointer to the data (the first byte) + const uint32_t num_bytes_; // The number of bytes in this list + + uint32_t cur_byte_; // The current byte being read + uint32_t cur_elem_; // The current element being read + uint32_t cur_elem_length_; // The number of bytes in current element + + uint32_t length_; // The number of elements in this list + std::vector result_; // The output data +}; + +} // namespace rocksdb diff --git a/utilities/redis/redis_lists.cc b/utilities/redis/redis_lists.cc new file mode 100644 index 00000000..50c544a3 --- /dev/null +++ b/utilities/redis/redis_lists.cc @@ -0,0 +1,551 @@ +/** + * A (persistent) Redis API built using the rocksdb backend. + * Implements Redis Lists as described on: http://redis.io/commands#list + * + * @throws All functions may throw a RedisListException on error/corruption. + * + * @notes Internally, the set of lists is stored in a rocksdb database, + * mapping keys to values. Each "value" is the list itself, storing + * some kind of internal representation of the data. All the + * representation details are handled by the RedisListIterator class. + * The present file should be oblivious to the representation details, + * handling only the client (Redis) API, and the calls to rocksdb. + * + * @TODO Presently, all operations take at least O(NV) time where + * N is the number of elements in the list, and V is the average + * number of bytes per value in the list. So maybe, with merge operator + * we can improve this to an optimal O(V) amortized time, since we + * wouldn't have to read and re-write the entire list. 
+ * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#include "redis_lists.h" + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "util/coding.h" + +namespace rocksdb +{ + +/// Constructors + +RedisLists::RedisLists(const std::string& db_path, + Options options, bool destructive) + : put_option_(), + get_option_() { + + // Store the name of the database + db_name_ = db_path; + + // If destructive, destroy the DB before re-opening it. + if (destructive) { + DestroyDB(db_name_, Options()); + } + + // Now open and deal with the db + DB* db; + Status s = DB::Open(options, db_name_, &db); + if (!s.ok()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + assert(false); + } + + db_ = std::unique_ptr(db); +} + + +/// Accessors + +// Number of elements in the list associated with key +// : throws RedisListException +int RedisLists::Length(const std::string& key) { + // Extract the string data representing the list. + std::string data; + db_->Get(get_option_, key, &data); + + // Return the length + RedisListIterator it(data); + return it.Length(); +} + +// Get the element at the specified index in the (list: key) +// Returns ("") on out-of-bounds +// : throws RedisListException +bool RedisLists::Index(const std::string& key, int32_t index, + std::string* result) { + // Extract the string data representing the list. + std::string data; + db_->Get(get_option_, key, &data); + + // Handle REDIS negative indices (from the end); fast iff Length() takes O(1) + if (index < 0) { + index = Length(key) - (-index); //replace (-i) with (N-i). + } + + // Iterate through the list until the desired index is found. 
+ int curIndex = 0; + RedisListIterator it(data); + while(curIndex < index && !it.Done()) { + ++curIndex; + it.Skip(); + } + + // If we actually found the index + if (curIndex == index && !it.Done()) { + Slice elem; + it.GetCurrent(&elem); + if (result != NULL) { + *result = elem.ToString(); + } + + return true; + } else { + return false; + } +} + +// Return a truncated version of the list. +// First, negative values for first/last are interpreted as "end of list". +// So, if first == -1, then it is re-set to index: (Length(key) - 1) +// Then, return exactly those indices i such that first <= i <= last. +// : throws RedisListException +std::vector RedisLists::Range(const std::string& key, + int32_t first, int32_t last) { + // Extract the string data representing the list. + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative bounds (-1 means last element, etc.) + int listLen = Length(key); + if (first < 0) { + first = listLen - (-first); // Replace (-x) with (N-x) + } + if (last < 0) { + last = listLen - (-last); + } + + // Verify bounds (and truncate the range so that it is valid) + first = std::max(first, 0); + last = std::min(last, listLen-1); + int len = std::max(last-first+1, 0); + + // Initialize the resulting list + std::vector result(len); + + // Traverse the list and update the vector + int curIdx = 0; + Slice elem; + for (RedisListIterator it(data); !it.Done() && curIdx<=last; it.Skip()) { + if (first <= curIdx && curIdx <= last) { + it.GetCurrent(&elem); + result[curIdx-first].assign(elem.data(),elem.size()); + } + + ++curIdx; + } + + // Return the result. Might be empty + return result; +} + +// Print the (list: key) out to stdout. For debugging mostly. Public for now. +void RedisLists::Print(const std::string& key) { + // Extract the string data representing the list. 
+ std::string data; + db_->Get(get_option_, key, &data); + + // Iterate through the list and print the items + Slice elem; + for (RedisListIterator it(data); !it.Done(); it.Skip()) { + it.GetCurrent(&elem); + std::cout << "ITEM " << elem.ToString() << std::endl; + } + + //Now print the byte data + RedisListIterator it(data); + std::cout << "==Printing data==" << std::endl; + std::cout << data.size() << std::endl; + std::cout << it.Size() << " " << it.Length() << std::endl; + Slice result = it.WriteResult(); + std::cout << result.data() << std::endl; + if (true) { + std::cout << "size: " << result.size() << std::endl; + const char* val = result.data(); + for(int i=0; i<(int)result.size(); ++i) { + std::cout << (int)val[i] << " " << (val[i]>=32?val[i]:' ') << std::endl; + } + std::cout << std::endl; + } +} + +/// Insert/Update Functions +/// Note: The "real" insert function is private. See below. + +// InsertBefore and InsertAfter are simply wrappers around the Insert function. +int RedisLists::InsertBefore(const std::string& key, const std::string& pivot, + const std::string& value) { + return Insert(key, pivot, value, false); +} + +int RedisLists::InsertAfter(const std::string& key, const std::string& pivot, + const std::string& value) { + return Insert(key, pivot, value, true); +} + +// Prepend value onto beginning of (list: key) +// : throws RedisListException +int RedisLists::PushLeft(const std::string& key, const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct the result + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + it.InsertElement(value); + + // Push the data back to the db and return the length + db_->Put(put_option_, key, it.WriteResult()); + return it.Length(); +} + +// Append value onto end of (list: key) +// TODO: Make this O(1) time. Might require MergeOperator. 
+// : throws RedisListException +int RedisLists::PushRight(const std::string& key, const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Create an iterator to the data and seek to the end. + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + while (!it.Done()) { + it.Push(); // Write each element as we go + } + + // Insert the new element at the current position (the end) + it.InsertElement(value); + + // Push it back to the db, and return length + db_->Put(put_option_, key, it.WriteResult()); + return it.Length(); +} + +// Set (list: key)[idx] = val. Return true on success, false on fail. +// : throws RedisListException +bool RedisLists::Set(const std::string& key, int32_t index, + const std::string& value) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative index for REDIS (meaning -index from end of list) + if (index < 0) { + index = Length(key) - (-index); + } + + // Iterate through the list until we find the element we want + int curIndex = 0; + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); // Over-estimate is fine + while(curIndex < index && !it.Done()) { + it.Push(); + ++curIndex; + } + + // If not found, return false (this occurs when index was invalid) + if (it.Done() || curIndex != index) { + return false; + } + + // Write the new element value, and drop the previous element value + it.InsertElement(value); + it.Skip(); + + // Write the data to the database + // Check status, since it needs to return true/false guarantee + Status s = db_->Put(put_option_, key, it.WriteResult()); + + // Success + return s.ok(); +} + +/// Delete / Remove / Pop functions + +// Trim (list: key) so that it will only contain the indices from start..stop +// Invalid indices will not generate an error, just empty, +// or the portion of the list that fits in this interval +// : throws RedisListException 
+bool RedisLists::Trim(const std::string& key, int32_t start, int32_t stop) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Handle negative indices in REDIS + int listLen = Length(key); + if (start < 0) { + start = listLen - (-start); + } + if (stop < 0) { + stop = listLen - (-stop); + } + + // Truncate bounds to only fit in the list + start = std::max(start, 0); + stop = std::min(stop, listLen-1); + + // Construct an iterator for the list. Drop all undesired elements. + int curIndex = 0; + RedisListIterator it(data); + it.Reserve(it.Size()); // Over-estimate + while(!it.Done()) { + // If not within the range, just skip the item (drop it). + // Otherwise, continue as usual. + if (start <= curIndex && curIndex <= stop) { + it.Push(); + } else { + it.Skip(); + } + + // Increment the current index + ++curIndex; + } + + // Write the (possibly empty) result to the database + Status s = db_->Put(put_option_, key, it.WriteResult()); + + // Return true as long as the write succeeded + return s.ok(); +} + +// Return and remove the first element in the list (or "" if empty) +// : throws RedisListException +bool RedisLists::PopLeft(const std::string& key, std::string* result) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Point to first element in the list (if it exists), and get its value/size + RedisListIterator it(data); + if (it.Length() > 0) { // Proceed only if list is non-empty + Slice elem; + it.GetCurrent(&elem); // Store the value of the first element + it.Reserve(it.Size() - it.SizeOf(elem)); + it.Skip(); // DROP the first item and move to next + + // Update the db + db_->Put(put_option_, key, it.WriteResult()); + + // Return the value + if (result != NULL) { + *result = elem.ToString(); + } + return true; + } else { + return false; + } +} + +// Remove and return the last element in the list (or "" if empty) +// TODO: Make this O(1). Might require MergeOperator. 
+// : throws RedisListException +bool RedisLists::PopRight(const std::string& key, std::string* result) { + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct an iterator to the data and move to last element + RedisListIterator it(data); + it.Reserve(it.Size()); + int len = it.Length(); + int curIndex = 0; + while(curIndex < (len-1) && !it.Done()) { + it.Push(); + ++curIndex; + } + + // Extract and drop/skip the last element + if (curIndex == len-1) { + assert(!it.Done()); // Sanity check. Should not have ended here. + + // Extract and pop the element + Slice elem; + it.GetCurrent(&elem); // Save value of element. + it.Skip(); // Skip the element + + // Write the result to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the value + if (result != NULL) { + *result = elem.ToString(); + } + return true; + } else { + // Must have been an empty list + assert(it.Done() && len==0 && curIndex == 0); + return false; + } +} + +// Remove the (first or last) "num" occurrences of value in (list: key) +// : throws RedisListException +int RedisLists::Remove(const std::string& key, int32_t num, + const std::string& value) { + // Negative num ==> RemoveLast; Positive num ==> Remove First + if (num < 0) { + return RemoveLast(key, -num, value); + } else if (num > 0) { + return RemoveFirst(key, num, value); + } else { + return RemoveFirst(key, Length(key), value); + } +} + +// Remove the first "num" occurrences of value in (list: key). 
+// : throws RedisListException +int RedisLists::RemoveFirst(const std::string& key, int32_t num, + const std::string& value) { + // Ensure that the number is positive + assert(num >= 0); + + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Traverse the list, appending all but the desired occurrences of value + int numSkipped = 0; // Keep track of the number of times value is seen + Slice elem; + RedisListIterator it(data); + it.Reserve(it.Size()); + while (!it.Done()) { + it.GetCurrent(&elem); + + if (elem == value && numSkipped < num) { + // Drop this item if desired + it.Skip(); + ++numSkipped; + } else { + // Otherwise keep the item and proceed as normal + it.Push(); + } + } + + // Put the result back to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the number of elements removed + return numSkipped; +} + + +// Remove the last "num" occurrences of value in (list: key). +// TODO: I traverse the list 2x. Make faster. Might require MergeOperator. +// : throws RedisListException +int RedisLists::RemoveLast(const std::string& key, int32_t num, + const std::string& value) { + // Ensure that the number is positive + assert(num >= 0); + + // Extract the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Temporary variable to hold the "current element" in the blocks below + Slice elem; + + // Count the total number of occurrences of value + int totalOccs = 0; + for (RedisListIterator it(data); !it.Done(); it.Skip()) { + it.GetCurrent(&elem); + if (elem == value) { + ++totalOccs; + } + } + + // Construct an iterator to the data. Reserve enough space for the result. + RedisListIterator it(data); + int bytesRemoved = std::min(num,totalOccs)*it.SizeOf(value); + it.Reserve(it.Size() - bytesRemoved); + + // Traverse the list, appending all but the desired occurrences of value. 
+ // Note: "Drop the last k occurrences" is equivalent to + // "keep only the first n-k occurrences", where n is total occurrences. + int numKept = 0; // Keep track of the number of times value is kept + while(!it.Done()) { + it.GetCurrent(&elem); + + // If we are within the deletion range and equal to value, drop it. + // Otherwise, append/keep/push it. + if (elem == value) { + if (numKept < totalOccs - num) { + it.Push(); + ++numKept; + } else { + it.Skip(); + } + } else { + // Always append the others + it.Push(); + } + } + + // Put the result back to the database + db_->Put(put_option_, key, it.WriteResult()); + + // Return the number of elements removed + return totalOccs - numKept; +} + +/// Private functions + +// Insert element value into (list: key), right before/after +// the first occurrence of pivot +// : throws RedisListException +int RedisLists::Insert(const std::string& key, const std::string& pivot, + const std::string& value, bool insert_after) { + // Get the original list data + std::string data; + db_->Get(get_option_, key, &data); + + // Construct an iterator to the data and reserve enough space for result. + RedisListIterator it(data); + it.Reserve(it.Size() + it.SizeOf(value)); + + // Iterate through the list until we find the element we want + Slice elem; + bool found = false; + while(!it.Done() && !found) { + it.GetCurrent(&elem); + + // When we find the element, insert the element and mark found + if (elem == pivot) { // Found it! 
+ found = true; + if (insert_after == true) { // Skip one more, if inserting after it + it.Push(); + } + it.InsertElement(value); + } else { + it.Push(); + } + + } + + // Put the data (string) into the database + if (found) { + db_->Put(put_option_, key, it.WriteResult()); + } + + // Returns the new (possibly unchanged) length of the list + return it.Length(); +} + + +} diff --git a/utilities/redis/redis_lists.h b/utilities/redis/redis_lists.h new file mode 100644 index 00000000..8c149bc4 --- /dev/null +++ b/utilities/redis/redis_lists.h @@ -0,0 +1,106 @@ +/** + * A (persistent) Redis API built using the rocksdb backend. + * Implements Redis Lists as described on: http://redis.io/commands#list + * + * @throws All functions may throw a RedisListException + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + +#pragma once + +#include +#include "rocksdb/db.h" +#include "redis_list_iterator.h" +#include "redis_list_exception.h" + +namespace rocksdb { + +/// The Redis functionality (see http://redis.io/commands#list) +/// All functions may THROW a RedisListException +class RedisLists { + public: // Constructors / Destructors + /// Construct a new RedisLists database, with name/path of db. + /// Will clear the database on open iff destructive is true (default false). + /// Otherwise, it will restore saved changes. + /// May throw RedisListException + RedisLists(const std::string& db_path, + Options options, bool destructive = false); + + public: // Accessors + /// The number of items in (list: key) + int Length(const std::string& key); + + /// Search the list for the (index)'th item (0-based) in (list:key) + /// A negative index indicates: "from end-of-list" + /// If index is within range: return true, and return the value in *result. 
+ /// If (index < -length OR index>=length), then index is out of range: + /// return false (and *result is left unchanged) + /// May throw RedisListException + bool Index(const std::string& key, int32_t index, + std::string* result); + + /// Return (list: key)[first..last] (inclusive) + /// May throw RedisListException + std::vector Range(const std::string& key, + int32_t first, int32_t last); + + /// Prints the entire (list: key), for debugging. + void Print(const std::string& key); + + public: // Insert/Update + /// Insert value before/after pivot in (list: key). Return the length. + /// May throw RedisListException + int InsertBefore(const std::string& key, const std::string& pivot, + const std::string& value); + int InsertAfter(const std::string& key, const std::string& pivot, + const std::string& value); + + /// Push / Insert value at beginning/end of the list. Return the length. + /// May throw RedisListException + int PushLeft(const std::string& key, const std::string& value); + int PushRight(const std::string& key, const std::string& value); + + /// Set (list: key)[idx] = val. Return true on success, false on fail + /// May throw RedisListException + bool Set(const std::string& key, int32_t index, const std::string& value); + + public: // Delete / Remove / Pop / Trim + /// Trim (list: key) so that it will only contain the indices from start..stop + /// Returns true on success + /// May throw RedisListException + bool Trim(const std::string& key, int32_t start, int32_t stop); + + /// If list is empty, return false and leave *result unchanged. + /// Else, remove the first/last elem, store it in *result, and return true + bool PopLeft(const std::string& key, std::string* result); // First + bool PopRight(const std::string& key, std::string* result); // Last + + /// Remove the first (or last) num occurrences of value from the list (key) + /// Return the number of elements removed. 
+ /// May throw RedisListException + int Remove(const std::string& key, int32_t num, + const std::string& value); + int RemoveFirst(const std::string& key, int32_t num, + const std::string& value); + int RemoveLast(const std::string& key, int32_t num, + const std::string& value); + + private: // Private Functions + /// Calls InsertBefore or InsertAfter + int Insert(const std::string& key, const std::string& pivot, + const std::string& value, bool insert_after); + private: + std::string db_name_; // The actual database name/path + WriteOptions put_option_; + ReadOptions get_option_; + + /// The backend rocksdb database. + /// Map : key --> list + /// where a list is a sequence of elements + /// and an element is a 4-byte integer (n), followed by n bytes of data + std::unique_ptr db_; +}; + +} // namespace rocksdb diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc new file mode 100644 index 00000000..0600e0e5 --- /dev/null +++ b/utilities/redis/redis_lists_test.cc @@ -0,0 +1,875 @@ +/** + * A test harness for the Redis API built on rocksdb. + * + * USAGE: Build with: "make redis_test" (in rocksdb directory). 
+ * Run unit tests with: "./redis_test" + * Manual/Interactive user testing: "./redis_test -m" + * Manual user testing + restart database: "./redis_test -m -d" + * + * TODO: Add LARGE random test cases to verify efficiency and scalability + * + * @author Deon Nicholas (dnicholas@fb.com) + * Copyright 2013 Facebook + */ + + +#include +#include + +#include "redis_lists.h" +#include "util/testharness.h" +#include "util/random.h" + +using namespace rocksdb; +using namespace std; + +namespace rocksdb { + +class RedisListsTest { + public: + static const string kDefaultDbName; + static Options options; + + RedisListsTest() { + options.create_if_missing = true; + } +}; + +const string RedisListsTest::kDefaultDbName = "/tmp/redisdefaultdb/"; +Options RedisListsTest::options = Options(); + +// operator== and operator<< are defined below for vectors (lists) +// Needed for ASSERT_EQ + +void AssertListEq(const std::vector& result, + const std::vector& expected_result) { + ASSERT_EQ(result.size(), expected_result.size()); + for (size_t i = 0; i < result.size(); ++i) { + ASSERT_EQ(result[i], expected_result[i]); + } +} + +// PushRight, Length, Index, Range +TEST(RedisListsTest, SimpleTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Simple PushRight (should return the new length each time) + ASSERT_EQ(redis.PushRight("k1", "v1"), 1); + ASSERT_EQ(redis.PushRight("k1", "v2"), 2); + ASSERT_EQ(redis.PushRight("k1", "v3"), 3); + + // Check Length and Index() functions + ASSERT_EQ(redis.Length("k1"), 3); // Check length + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "v1"); // Check valid indices + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Check range function and vectors + std::vector result = redis.Range("k1", 0, 2); // Get the list + std::vector 
expected_result(3); + expected_result[0] = "v1"; + expected_result[1] = "v2"; + expected_result[2] = "v3"; + AssertListEq(result, expected_result); +} + +// PushLeft, Length, Index, Range +TEST(RedisListsTest, SimpleTest2) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Simple PushRight + ASSERT_EQ(redis.PushLeft("k1", "v3"), 1); + ASSERT_EQ(redis.PushLeft("k1", "v2"), 2); + ASSERT_EQ(redis.PushLeft("k1", "v1"), 3); + + // Check Length and Index() functions + ASSERT_EQ(redis.Length("k1"), 3); // Check length + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "v1"); // Check valid indices + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Check range function and vectors + std::vector result = redis.Range("k1", 0, 2); // Get the list + std::vector expected_result(3); + expected_result[0] = "v1"; + expected_result[1] = "v2"; + expected_result[2] = "v3"; + AssertListEq(result, expected_result); +} + +// Exhaustive test of the Index() function +TEST(RedisListsTest, IndexTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Empty Index check (return empty and should not crash or edit tempv) + tempv = "yo"; + ASSERT_TRUE(!redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "yo"); + ASSERT_TRUE(!redis.Index("fda", 3, &tempv)); + ASSERT_EQ(tempv, "yo"); + ASSERT_TRUE(!redis.Index("random", -12391, &tempv)); + ASSERT_EQ(tempv, "yo"); + + // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3] + redis.PushRight("k1", "v1"); + redis.PushRight("k1", "v2"); + redis.PushRight("k1", "v3"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v6"); + + // Simple, non-negative indices + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, 
"v6"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "v1"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Negative indices + ASSERT_TRUE(redis.Index("k1", -6, &tempv)); + ASSERT_EQ(tempv, "v6"); + ASSERT_TRUE(redis.Index("k1", -5, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", -4, &tempv)); + ASSERT_EQ(tempv, "v4"); + ASSERT_TRUE(redis.Index("k1", -3, &tempv)); + ASSERT_EQ(tempv, "v1"); + ASSERT_TRUE(redis.Index("k1", -2, &tempv)); + ASSERT_EQ(tempv, "v2"); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "v3"); + + // Out of bounds (return empty, no crash) + ASSERT_TRUE(!redis.Index("k1", 6, &tempv)); + ASSERT_TRUE(!redis.Index("k1", 123219, &tempv)); + ASSERT_TRUE(!redis.Index("k1", -7, &tempv)); + ASSERT_TRUE(!redis.Index("k1", -129, &tempv)); +} + + +// Exhaustive test of the Range() function +TEST(RedisListsTest, RangeTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Simple Pushes (will yield: [v6, v4, v4, v1, v2, v3]) + redis.PushRight("k1", "v1"); + redis.PushRight("k1", "v2"); + redis.PushRight("k1", "v3"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v4"); + redis.PushLeft("k1", "v6"); + + // Sanity check (check the length; make sure it's 6) + ASSERT_EQ(redis.Length("k1"), 6); + + // Simple range + std::vector res = redis.Range("k1", 1, 4); + ASSERT_EQ((int)res.size(), 4); + ASSERT_EQ(res[0], "v4"); + ASSERT_EQ(res[1], "v4"); + ASSERT_EQ(res[2], "v1"); + ASSERT_EQ(res[3], "v2"); + + // Negative indices (i.e.: measured from the end) + res = redis.Range("k1", 2, -1); + ASSERT_EQ((int)res.size(), 4); + ASSERT_EQ(res[0], "v4"); + ASSERT_EQ(res[1], "v1"); + 
ASSERT_EQ(res[2], "v2"); + ASSERT_EQ(res[3], "v3"); + + res = redis.Range("k1", -6, -4); + ASSERT_EQ((int)res.size(), 3); + ASSERT_EQ(res[0], "v6"); + ASSERT_EQ(res[1], "v4"); + ASSERT_EQ(res[2], "v4"); + + res = redis.Range("k1", -1, 5); + ASSERT_EQ((int)res.size(), 1); + ASSERT_EQ(res[0], "v3"); + + // Partial / Broken indices + res = redis.Range("k1", -3, 1000000); + ASSERT_EQ((int)res.size(), 3); + ASSERT_EQ(res[0], "v1"); + ASSERT_EQ(res[1], "v2"); + ASSERT_EQ(res[2], "v3"); + + res = redis.Range("k1", -1000000, 1); + ASSERT_EQ((int)res.size(), 2); + ASSERT_EQ(res[0], "v6"); + ASSERT_EQ(res[1], "v4"); + + // Invalid indices + res = redis.Range("k1", 7, 9); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", -8, -7); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", 3, 2); + ASSERT_EQ((int)res.size(), 0); + + res = redis.Range("k1", 5, -2); + ASSERT_EQ((int)res.size(), 0); + + // Range matches Index + res = redis.Range("k1", -6, -4); + ASSERT_TRUE(redis.Index("k1", -6, &tempv)); + ASSERT_EQ(tempv, res[0]); + ASSERT_TRUE(redis.Index("k1", -5, &tempv)); + ASSERT_EQ(tempv, res[1]); + ASSERT_TRUE(redis.Index("k1", -4, &tempv)); + ASSERT_EQ(tempv, res[2]); + + // Last check + res = redis.Range("k1", 0, -6); + ASSERT_EQ((int)res.size(), 1); + ASSERT_EQ(res[0], "v6"); +} + +// Exhaustive test for InsertBefore(), and InsertAfter() +TEST(RedisListsTest, InsertTest) { + RedisLists redis(kDefaultDbName, options, true); + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Insert on empty list (return 0, and do not crash) + ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "a"), 0); + ASSERT_EQ(redis.InsertAfter("k1", "other-non-exist", "c"), 0); + ASSERT_EQ(redis.Length("k1"), 0); + + // Push some preliminary stuff [g, f, e, d, c, b, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "b"); + redis.PushLeft("k1", "c"); + redis.PushLeft("k1", "d"); + redis.PushLeft("k1", "e"); + redis.PushLeft("k1", "f"); + redis.PushLeft("k1", 
"g"); + ASSERT_EQ(redis.Length("k1"), 7); + + // Test InsertBefore + int newLength = redis.InsertBefore("k1", "e", "hello"); + ASSERT_EQ(newLength, 8); + ASSERT_EQ(redis.Length("k1"), newLength); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "hello"); + + // Test InsertAfter + newLength = redis.InsertAfter("k1", "c", "bye"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "bye"); + + // Test bad value on InsertBefore + newLength = redis.InsertBefore("k1", "yo", "x"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test bad value on InsertAfter + newLength = redis.InsertAfter("k1", "xxxx", "y"); + ASSERT_EQ(newLength, 9); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test InsertBefore beginning + newLength = redis.InsertBefore("k1", "g", "begggggggggggggggg"); + ASSERT_EQ(newLength, 10); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Test InsertAfter end + newLength = redis.InsertAfter("k1", "a", "enddd"); + ASSERT_EQ(newLength, 11); + ASSERT_EQ(redis.Length("k1"), newLength); + + // Make sure nothing weird happened. 
+ ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "begggggggggggggggg"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "g"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "hello"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 7, &tempv)); + ASSERT_EQ(tempv, "bye"); + ASSERT_TRUE(redis.Index("k1", 8, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 9, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "enddd"); +} + +// Exhaustive test of Set function +TEST(RedisListsTest, SetTest) { + RedisLists redis(kDefaultDbName, options, true); + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Set on empty list (return false, and do not crash) + ASSERT_EQ(redis.Set("k1", 7, "a"), false); + ASSERT_EQ(redis.Set("k1", 0, "a"), false); + ASSERT_EQ(redis.Set("k1", -49, "cx"), false); + ASSERT_EQ(redis.Length("k1"), 0); + + // Push some preliminary stuff [g, f, e, d, c, b, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "b"); + redis.PushLeft("k1", "c"); + redis.PushLeft("k1", "d"); + redis.PushLeft("k1", "e"); + redis.PushLeft("k1", "f"); + redis.PushLeft("k1", "g"); + ASSERT_EQ(redis.Length("k1"), 7); + + // Test Regular Set + ASSERT_TRUE(redis.Set("k1", 0, "0")); + ASSERT_TRUE(redis.Set("k1", 3, "3")); + ASSERT_TRUE(redis.Set("k1", 6, "6")); + ASSERT_TRUE(redis.Set("k1", 2, "2")); + ASSERT_TRUE(redis.Set("k1", 5, "5")); + ASSERT_TRUE(redis.Set("k1", 1, "1")); + ASSERT_TRUE(redis.Set("k1", 4, "4")); + + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "0"); + ASSERT_TRUE(redis.Index("k1", 1, 
&tempv)); + ASSERT_EQ(tempv, "1"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "2"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "3"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "4"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "5"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "6"); + + // Set with negative indices + ASSERT_TRUE(redis.Set("k1", -7, "a")); + ASSERT_TRUE(redis.Set("k1", -4, "d")); + ASSERT_TRUE(redis.Set("k1", -1, "g")); + ASSERT_TRUE(redis.Set("k1", -5, "c")); + ASSERT_TRUE(redis.Set("k1", -2, "f")); + ASSERT_TRUE(redis.Set("k1", -6, "b")); + ASSERT_TRUE(redis.Set("k1", -3, "e")); + + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "g"); + + // Bad indices (just out-of-bounds / off-by-one check) + ASSERT_EQ(redis.Set("k1", -8, "off-by-one in negative index"), false); + ASSERT_EQ(redis.Set("k1", 7, "off-by-one-error in positive index"), false); + ASSERT_EQ(redis.Set("k1", 43892, "big random index should fail"), false); + ASSERT_EQ(redis.Set("k1", -21391, "large negative index should fail"), false); + + // One last check (to make sure nothing weird happened) + ASSERT_EQ(redis.Length("k1"), 7); // Size should not change + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "b"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "c"); + ASSERT_TRUE(redis.Index("k1", 3, 
&tempv)); + ASSERT_EQ(tempv, "d"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "e"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "f"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "g"); +} + +// Testing Insert, Push, and Set, in a mixed environment +TEST(RedisListsTest, InsertPushSetTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend] + // Also, check the return value sometimes (should return length) + int lengthCheck; + lengthCheck = redis.PushLeft("k1", "a"); + ASSERT_EQ(lengthCheck, 1); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + lengthCheck = redis.InsertAfter("k1", "a", "aftera"); + ASSERT_EQ(lengthCheck , 4); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore beginning of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + + // Check + std::vector res = redis.Range("k1", 0, -1); // Get the list + ASSERT_EQ((int)res.size(), 6); + ASSERT_EQ(res[0], "newbegin"); + ASSERT_EQ(res[5], "newend"); + ASSERT_EQ(res[3], "aftera"); + + // Testing duplicate values/pivots (multiple occurrences of 'a') + ASSERT_TRUE(redis.Set("k1", 0, "a")); // [a, z, a, aftera, x, newend] + redis.InsertAfter("k1", "a", "happy"); // [a, happy, z, a, aftera, ...] + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "happy"); + redis.InsertBefore("k1", "a", "sad"); // [sad, a, happy, z, a, aftera, ...] + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "sad"); + ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "happy"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "aftera"); + redis.InsertAfter("k1", "a", "zz"); // [sad, a, zz, happy, z, a, aftera, ...] 
+ ASSERT_TRUE(redis.Index("k1", 2, &tempv)); + ASSERT_EQ(tempv, "zz"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Set("k1", 1, "nota")); // [sad, nota, zz, happy, z, a, ...] + redis.InsertBefore("k1", "a", "ba"); // [sad, nota, zz, happy, z, ba, a, ...] + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "ba"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "a"); + + // We currently have: [sad, nota, zz, happy, z, ba, a, aftera, x, newend] + // redis.Print("k1"); // manually check + + // Test Inserting before/after non-existent values + lengthCheck = redis.Length("k1"); // Ensure that the length doesn't change + ASSERT_EQ(lengthCheck, 10); + ASSERT_EQ(redis.InsertBefore("k1", "non-exist", "randval"), lengthCheck); + ASSERT_EQ(redis.InsertAfter("k1", "nothing", "a"), lengthCheck); + ASSERT_EQ(redis.InsertAfter("randKey", "randVal", "ranValue"), 0); // Empty + ASSERT_EQ(redis.Length("k1"), lengthCheck); // The length should not change + + // Simply Test the Set() function + redis.Set("k1", 5, "ba2"); + redis.InsertBefore("k1", "ba2", "beforeba2"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "beforeba2"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "ba2"); + ASSERT_TRUE(redis.Index("k1", 7, &tempv)); + ASSERT_EQ(tempv, "a"); + + // We have: [sad, nota, zz, happy, z, beforeba2, ba2, a, aftera, x, newend] + + // Set() with negative indices + redis.Set("k1", -1, "endprank"); + ASSERT_TRUE(!redis.Index("k1", 11, &tempv)); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "endprank"); // Ensure Set worked correctly + redis.Set("k1", -11, "t"); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "t"); + + // Test out of bounds Set + ASSERT_EQ(redis.Set("k1", -12, "ssd"), false); + 
ASSERT_EQ(redis.Set("k1", 11, "sasd"), false); + ASSERT_EQ(redis.Set("k1", 1200, "big"), false); +} + +// Testing Trim, Pop +TEST(RedisListsTest, TrimPopTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + + // Simple PopLeft/Right test + ASSERT_TRUE(redis.PopLeft("k1", &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_EQ(redis.Length("k1"), 5); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.PopRight("k1", &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "x"); + + // Now have: [z, a, aftera, x] + + // Test Trim + ASSERT_TRUE(redis.Trim("k1", 0, -1)); // [z, a, aftera, x] (do nothing) + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Trim("k1", 0, 2)); // [z, a, aftera] + ASSERT_EQ(redis.Length("k1"), 3); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Trim("k1", 1, 1)); // [a] + ASSERT_EQ(redis.Length("k1"), 1); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "a"); + + // Test out of bounds (empty) trim + ASSERT_TRUE(redis.Trim("k1", 1, 0)); + ASSERT_EQ(redis.Length("k1"), 0); + + // Popping with empty list (return empty without error) + ASSERT_TRUE(!redis.PopLeft("k1", &tempv)); + ASSERT_TRUE(!redis.PopRight("k1", &tempv)); + ASSERT_TRUE(redis.Trim("k1", 0, 5)); + + // Exhaustive Trim test (negative and invalid indices) + // Will start in [newbegin, z, a, aftera, x, newend] + redis.PushLeft("k1", "a"); + 
redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + ASSERT_TRUE(redis.Trim("k1", -6, -1)); // Should do nothing + ASSERT_EQ(redis.Length("k1"), 6); + ASSERT_TRUE(redis.Trim("k1", 1, -2)); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "x"); + ASSERT_EQ(redis.Length("k1"), 4); + ASSERT_TRUE(redis.Trim("k1", -3, -2)); + ASSERT_EQ(redis.Length("k1"), 2); +} + +// Testing Remove, RemoveFirst, RemoveLast +TEST(RedisListsTest, RemoveTest) { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend, a, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + redis.PushRight("k1", "a"); + redis.PushRight("k1", "a"); + + // Verify + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "a"); + + // Check RemoveFirst (Remove the first two 'a') + // Results in [newbegin, z, aftera, x, newend, a] + int numRemoved = redis.Remove("k1", 2, "a"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", 1, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_TRUE(redis.Index("k1", 5, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_EQ(redis.Length("k1"), 6); + + // Repopulate some stuff + // 
Results in: [x, x, x, x, x, newbegin, z, x, aftera, x, newend, a, x] + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushLeft("k1", "x"); + redis.PushRight("k1", "x"); + redis.InsertAfter("k1", "z", "x"); + + // Test removal from end + numRemoved = redis.Remove("k1", -2, "x"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 8, &tempv)); + ASSERT_EQ(tempv, "aftera"); + ASSERT_TRUE(redis.Index("k1", 9, &tempv)); + ASSERT_EQ(tempv, "newend"); + ASSERT_TRUE(redis.Index("k1", 10, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(!redis.Index("k1", 11, &tempv)); + numRemoved = redis.Remove("k1", -2, "x"); + ASSERT_EQ(numRemoved, 2); + ASSERT_TRUE(redis.Index("k1", 4, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + ASSERT_TRUE(redis.Index("k1", 6, &tempv)); + ASSERT_EQ(tempv, "aftera"); + + // We now have: [x, x, x, x, newbegin, z, aftera, newend, a] + ASSERT_EQ(redis.Length("k1"), 9); + ASSERT_TRUE(redis.Index("k1", -1, &tempv)); + ASSERT_EQ(tempv, "a"); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "x"); + + // Test over-shooting (removing more than there exists) + numRemoved = redis.Remove("k1", -9000, "x"); + ASSERT_EQ(numRemoved , 4); // Only really removed 4 + ASSERT_EQ(redis.Length("k1"), 5); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "newbegin"); + numRemoved = redis.Remove("k1", 1, "x"); + ASSERT_EQ(numRemoved, 0); + + // Try removing ALL! + numRemoved = redis.Remove("k1", 0, "newbegin"); // REMOVE 0 will remove all! 
+ ASSERT_EQ(numRemoved, 1); + + // Removal from an empty-list + ASSERT_TRUE(redis.Trim("k1", 1, 0)); + numRemoved = redis.Remove("k1", 1, "z"); + ASSERT_EQ(numRemoved, 0); +} + + +// Test Multiple keys and Persistence +TEST(RedisListsTest, PersistenceMultiKeyTest) { + + string tempv; // Used below for all Index(), PopRight(), PopLeft() + + // Block one: populate a single key in the database + { + RedisLists redis(kDefaultDbName, options, true); // Destructive + + // A series of pushes and insertions + // Will result in [newbegin, z, a, aftera, x, newend, a, a] + redis.PushLeft("k1", "a"); + redis.PushLeft("k1", "z"); + redis.PushRight("k1", "x"); + redis.InsertBefore("k1", "z", "newbegin"); // InsertBefore start of list + redis.InsertAfter("k1", "x", "newend"); // InsertAfter end of list + redis.InsertAfter("k1", "a", "aftera"); + redis.PushRight("k1", "a"); + redis.PushRight("k1", "a"); + + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "aftera"); + } + + // Block two: make sure changes were saved and add some other key + { + RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive + + // Check + ASSERT_EQ(redis.Length("k1"), 8); + ASSERT_TRUE(redis.Index("k1", 3, &tempv)); + ASSERT_EQ(tempv, "aftera"); + + redis.PushRight("k2", "randomkey"); + redis.PushLeft("k2", "sas"); + + redis.PopLeft("k1", &tempv); + } + + // Block three: Verify the changes from block 2 + { + RedisLists redis(kDefaultDbName, options, false); // Persistent, non-destructive + + // Check + ASSERT_EQ(redis.Length("k1"), 7); + ASSERT_EQ(redis.Length("k2"), 2); + ASSERT_TRUE(redis.Index("k1", 0, &tempv)); + ASSERT_EQ(tempv, "z"); + ASSERT_TRUE(redis.Index("k2", -2, &tempv)); + ASSERT_EQ(tempv, "sas"); + } +} + +/// THE manual REDIS TEST begins here +/// THIS WILL ONLY OCCUR IF YOU RUN: ./redis_test -m + +void MakeUpper(std::string* const s) { + int len = s->length(); + for(int i=0; i + } +} + +/// Allows the user to enter in REDIS commands into the 
command-line. +/// This is useful for manual / interacticve testing / debugging. +/// Use destructive=true to clean the database before use. +/// Use destructive=false to remember the previous state (i.e.: persistent) +/// Should be called from main function. +int manual_redis_test(bool destructive){ + RedisLists redis(RedisListsTest::kDefaultDbName, + RedisListsTest::options, + destructive); + + // TODO: Right now, please use spaces to separate each word. + // In actual redis, you can use quotes to specify compound values + // Example: RPUSH mylist "this is a compound value" + + std::string command; + while(true) { + cin >> command; + MakeUpper(&command); + + if (command == "LINSERT") { + std::string k, t, p, v; + cin >> k >> t >> p >> v; + MakeUpper(&t); + if (t=="BEFORE") { + std::cout << redis.InsertBefore(k, p, v) << std::endl; + } else if (t=="AFTER") { + std::cout << redis.InsertAfter(k, p, v) << std::endl; + } + } else if (command == "LPUSH") { + std::string k, v; + std::cin >> k >> v; + redis.PushLeft(k, v); + } else if (command == "RPUSH") { + std::string k, v; + std::cin >> k >> v; + redis.PushRight(k, v); + } else if (command == "LPOP") { + std::string k; + std::cin >> k; + string res; + redis.PopLeft(k, &res); + std::cout << res << std::endl; + } else if (command == "RPOP") { + std::string k; + std::cin >> k; + string res; + redis.PopRight(k, &res); + std::cout << res << std::endl; + } else if (command == "LREM") { + std::string k; + int amt; + std::string v; + + std::cin >> k >> amt >> v; + std::cout << redis.Remove(k, amt, v) << std::endl; + } else if (command == "LLEN") { + std::string k; + std::cin >> k; + std::cout << redis.Length(k) << std::endl; + } else if (command == "LRANGE") { + std::string k; + int i, j; + std::cin >> k >> i >> j; + std::vector res = redis.Range(k, i, j); + for (auto it = res.begin(); it != res.end(); ++it) { + std::cout << " " << (*it); + } + std::cout << std::endl; + } else if (command == "LTRIM") { + std::string k; + int 
i, j; + std::cin >> k >> i >> j; + redis.Trim(k, i, j); + } else if (command == "LSET") { + std::string k; + int idx; + std::string v; + cin >> k >> idx >> v; + redis.Set(k, idx, v); + } else if (command == "LINDEX") { + std::string k; + int idx; + std::cin >> k >> idx; + string res; + redis.Index(k, idx, &res); + std::cout << res << std::endl; + } else if (command == "PRINT") { // Added by Deon + std::string k; + cin >> k; + redis.Print(k); + } else if (command == "QUIT") { + return 0; + } else { + std::cout << "unknown command: " << command << std::endl; + } + } +} + +} // namespace rocksdb + + +// USAGE: "./redis_test" for default (unit tests) +// "./redis_test -m" for manual testing (redis command api) +// "./redis_test -m -d" for destructive manual test (erase db before use) + + +// Check for "want" argument in the argument list +bool found_arg(int argc, char* argv[], const char* want){ + for(int i=1; icompaction_filter) { + options->compaction_filter = + new TtlCompactionFilter(ttl, options->compaction_filter); + } else { + options->compaction_filter_factory = + std::shared_ptr(new TtlCompactionFilterFactory( + ttl, options->compaction_filter_factory)); + } + + if (options->merge_operator) { + options->merge_operator.reset( + new TtlMergeOperator(options->merge_operator)); + } +} + +// Open the db inside DBWithTTL because options needs pointer to its ttl +DBWithTTL::DBWithTTL(DB* db) : StackableDB(db) {} + +DBWithTTL::~DBWithTTL() { + delete GetOptions().compaction_filter; +} + +Status UtilityDB::OpenTtlDB( + const Options& options, + const std::string& dbname, + StackableDB** dbptr, + int32_t ttl, + bool read_only) { + Status st; + Options options_to_open = options; + DBWithTTL::SanitizeOptions(ttl, &options_to_open); + DB* db; + + if (read_only) { + st = DB::OpenForReadOnly(options_to_open, dbname, &db); + } else { + st = DB::Open(options_to_open, dbname, &db); + } + if (st.ok()) { + *dbptr = new DBWithTTL(db); + } else { + delete db; + } + return st; +} + 
+// Gives back the current time +Status DBWithTTL::GetCurrentTime(int64_t& curtime) { + return Env::Default()->GetCurrentTime(&curtime); +} + +// Appends the current timestamp to the string. +// Returns false if could not get the current_time, true if append succeeds +Status DBWithTTL::AppendTS(const Slice& val, std::string& val_with_ts) { + val_with_ts.reserve(kTSLength + val.size()); + char ts_string[kTSLength]; + int64_t curtime; + Status st = GetCurrentTime(curtime); + if (!st.ok()) { + return st; + } + EncodeFixed32(ts_string, (int32_t)curtime); + val_with_ts.append(val.data(), val.size()); + val_with_ts.append(ts_string, kTSLength); + return st; +} + +// Returns corruption if the length of the string is lesser than timestamp, or +// timestamp refers to a time lesser than ttl-feature release time +Status DBWithTTL::SanityCheckTimestamp(const Slice& str) { + if (str.size() < kTSLength) { + return Status::Corruption("Error: value's length less than timestamp's\n"); + } + // Checks that TS is not lesser than kMinTimestamp + // Gaurds against corruption & normal database opened incorrectly in ttl mode + int32_t timestamp_value = + DecodeFixed32(str.data() + str.size() - kTSLength); + if (timestamp_value < kMinTimestamp){ + return Status::Corruption("Error: Timestamp < ttl feature release time!\n"); + } + return Status::OK(); +} + +// Checks if the string is stale or not according to TTl provided +bool DBWithTTL::IsStale(const Slice& value, int32_t ttl) { + if (ttl <= 0) { // Data is fresh if TTL is non-positive + return false; + } + int64_t curtime; + if (!GetCurrentTime(curtime).ok()) { + return false; // Treat the data as fresh if could not get current time + } + int32_t timestamp_value = + DecodeFixed32(value.data() + value.size() - kTSLength); + return (timestamp_value + ttl) < curtime; +} + +// Strips the TS from the end of the string +Status DBWithTTL::StripTS(std::string* str) { + Status st; + if (str->length() < kTSLength) { + return 
Status::Corruption("Bad timestamp in key-value"); + } + // Erasing characters which hold the TS + str->erase(str->length() - kTSLength, kTSLength); + return st; +} + +Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key, + const Slice& val) { + WriteBatch batch; + batch.Put(key, val); + return Write(opt, &batch); +} + +Status DBWithTTL::Get(const ReadOptions& options, + const Slice& key, + std::string* value) { + Status st = db_->Get(options, key, value); + if (!st.ok()) { + return st; + } + st = SanityCheckTimestamp(*value); + if (!st.ok()) { + return st; + } + return StripTS(value); +} + +std::vector DBWithTTL::MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) { + return std::vector(keys.size(), + Status::NotSupported("MultiGet not\ + supported with TTL")); +} + +bool DBWithTTL::KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found) { + bool ret = db_->KeyMayExist(options, key, value, value_found); + if (ret && value != nullptr && value_found != nullptr && *value_found) { + if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) { + return false; + } + } + return ret; +} + +Status DBWithTTL::Merge(const WriteOptions& opt, + const Slice& key, + const Slice& value) { + WriteBatch batch; + batch.Merge(key, value); + return Write(opt, &batch); +} + +Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) { + class Handler : public WriteBatch::Handler { + public: + WriteBatch updates_ttl; + Status batch_rewrite_status; + virtual void Put(const Slice& key, const Slice& value) { + std::string value_with_ts; + Status st = AppendTS(value, value_with_ts); + if (!st.ok()) { + batch_rewrite_status = st; + } else { + updates_ttl.Put(key, value_with_ts); + } + } + virtual void Merge(const Slice& key, const Slice& value) { + std::string value_with_ts; + Status st = AppendTS(value, value_with_ts); + if (!st.ok()) { + batch_rewrite_status = st; + } else { + 
updates_ttl.Merge(key, value_with_ts); + } + } + virtual void Delete(const Slice& key) { + updates_ttl.Delete(key); + } + virtual void LogData(const Slice& blob) { + updates_ttl.PutLogData(blob); + } + }; + Handler handler; + updates->Iterate(&handler); + if (!handler.batch_rewrite_status.ok()) { + return handler.batch_rewrite_status; + } else { + return db_->Write(opts, &(handler.updates_ttl)); + } +} + +Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) { + return new TtlIterator(db_->NewIterator(opts)); +} + +void DBWithTTL::TEST_Destroy_DBWithTtl() { + ((DBImpl*) db_)->TEST_Destroy_DBImpl(); +} + +} // namespace rocksdb diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h new file mode 100644 index 00000000..519ae32c --- /dev/null +++ b/utilities/ttl/db_ttl.h @@ -0,0 +1,321 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/merge_operator.h" +#include "utilities/utility_db.h" +#include "db/db_impl.h" + +namespace rocksdb { + +class DBWithTTL : public StackableDB { + public: + static void SanitizeOptions(int32_t ttl, Options* options); + + explicit DBWithTTL(DB* db); + + virtual ~DBWithTTL(); + + virtual Status Put(const WriteOptions& o, const Slice& key, + const Slice& val) override; + + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) override; + + virtual std::vector MultiGet( + const ReadOptions& options, const std::vector& keys, + std::vector* values) override; + + virtual bool KeyMayExist(const ReadOptions& options, + const Slice& key, + std::string* value, + bool* value_found = nullptr) override; + + virtual Status Merge(const WriteOptions& options, const Slice& key, + const Slice& value) override; + + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + virtual Iterator* NewIterator(const ReadOptions& opts) override; + + // Simulate a db crash, no elegant closing of database. 
+ void TEST_Destroy_DBWithTtl(); + + virtual DB* GetBaseDB() { + return db_; + } + + static bool IsStale(const Slice& value, int32_t ttl); + + static Status AppendTS(const Slice& val, std::string& val_with_ts); + + static Status SanityCheckTimestamp(const Slice& str); + + static Status StripTS(std::string* str); + + static Status GetCurrentTime(int64_t& curtime); + + static const uint32_t kTSLength = sizeof(int32_t); // size of timestamp + + static const int32_t kMinTimestamp = 1368146402; // 05/09/2013:5:40PM GMT-8 + + static const int32_t kMaxTimestamp = 2147483647; // 01/18/2038:7:14PM GMT-8 +}; + +class TtlIterator : public Iterator { + + public: + explicit TtlIterator(Iterator* iter) + : iter_(iter) { + assert(iter_); + } + + ~TtlIterator() { + delete iter_; + } + + bool Valid() const { + return iter_->Valid(); + } + + void SeekToFirst() { + iter_->SeekToFirst(); + } + + void SeekToLast() { + iter_->SeekToLast(); + } + + void Seek(const Slice& target) { + iter_->Seek(target); + } + + void Next() { + iter_->Next(); + } + + void Prev() { + iter_->Prev(); + } + + Slice key() const { + return iter_->key(); + } + + int32_t timestamp() const { + return DecodeFixed32( + iter_->value().data() + iter_->value().size() - DBWithTTL::kTSLength); + } + + Slice value() const { + //TODO: handle timestamp corruption like in general iterator semantics + assert(DBWithTTL::SanityCheckTimestamp(iter_->value()).ok()); + Slice trimmed_value = iter_->value(); + trimmed_value.size_ -= DBWithTTL::kTSLength; + return trimmed_value; + } + + Status status() const { + return iter_->status(); + } + + private: + Iterator* iter_; +}; + +class TtlCompactionFilter : public CompactionFilter { + + public: + TtlCompactionFilter( + int32_t ttl, + const CompactionFilter* user_comp_filter, + std::unique_ptr + user_comp_filter_from_factory = nullptr) + : ttl_(ttl), + user_comp_filter_(user_comp_filter), + user_comp_filter_from_factory_(std::move(user_comp_filter_from_factory)) { + // Unlike the merge 
operator, compaction filter is necessary for TTL, hence + // this would be called even if user doesn't specify any compaction-filter + if (!user_comp_filter_) { + user_comp_filter_ = user_comp_filter_from_factory_.get(); + } + } + + virtual bool Filter(int level, + const Slice& key, + const Slice& old_val, + std::string* new_val, + bool* value_changed) const override { + if (DBWithTTL::IsStale(old_val, ttl_)) { + return true; + } + if (user_comp_filter_ == nullptr) { + return false; + } + assert(old_val.size() >= DBWithTTL::kTSLength); + Slice old_val_without_ts(old_val.data(), + old_val.size() - DBWithTTL::kTSLength); + if (user_comp_filter_->Filter(level, key, old_val_without_ts, new_val, + value_changed)) { + return true; + } + if (*value_changed) { + new_val->append(old_val.data() + old_val.size() - DBWithTTL::kTSLength, + DBWithTTL::kTSLength); + } + return false; + } + + virtual const char* Name() const override { + return "Delete By TTL"; + } + + private: + int32_t ttl_; + const CompactionFilter* user_comp_filter_; + std::unique_ptr user_comp_filter_from_factory_; +}; + +class TtlCompactionFilterFactory : public CompactionFilterFactory { + public: + TtlCompactionFilterFactory( + int32_t ttl, + std::shared_ptr comp_filter_factory) + : ttl_(ttl), + user_comp_filter_factory_(comp_filter_factory) { } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) { + return std::unique_ptr( + new TtlCompactionFilter( + ttl_, + nullptr, + std::move(user_comp_filter_factory_->CreateCompactionFilter(context)) + ) + ); + } + + virtual const char* Name() const override { + return "TtlCompactionFilterFactory"; + } + + private: + int32_t ttl_; + std::shared_ptr user_comp_filter_factory_; +}; + +class TtlMergeOperator : public MergeOperator { + + public: + explicit TtlMergeOperator(const std::shared_ptr merge_op) + : user_merge_op_(merge_op) { + assert(merge_op); + } + + virtual bool FullMerge(const Slice& key, + const Slice* 
existing_value, + const std::deque& operands, + std::string* new_value, + Logger* logger) const override { + const uint32_t ts_len = DBWithTTL::kTSLength; + if (existing_value && existing_value->size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from existing value."); + return false; + } + + // Extract time-stamp from each operand to be passed to user_merge_op_ + std::deque operands_without_ts; + for (const auto &operand : operands) { + if (operand.size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from operand value."); + return false; + } + operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len)); + } + + // Apply the user merge operator (store result in *new_value) + bool good = true; + if (existing_value) { + Slice existing_value_without_ts(existing_value->data(), + existing_value->size() - ts_len); + good = user_merge_op_->FullMerge(key, &existing_value_without_ts, + operands_without_ts, new_value, logger); + } else { + good = user_merge_op_->FullMerge(key, nullptr, operands_without_ts, + new_value, logger); + } + + // Return false if the user merge operator returned false + if (!good) { + return false; + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!DBWithTTL::GetCurrentTime(curtime).ok()) { + Log(logger, "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + new_value->append(ts_string, ts_len); + return true; + } + } + + virtual bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, Logger* logger) const + override { + const uint32_t ts_len = DBWithTTL::kTSLength; + std::deque operands_without_ts; + + for (const auto& operand : operand_list) { + if (operand.size() < ts_len) { + Log(logger, "Error: Could not remove timestamp from value."); + return false; + } + + operands_without_ts.push_back( + 
Slice(operand.data(), operand.size() - ts_len)); + } + + // Apply the user partial-merge operator (store result in *new_value) + assert(new_value); + if (!user_merge_op_->PartialMergeMulti(key, operands_without_ts, new_value, + logger)) { + return false; + } + + // Augment the *new_value with the ttl time-stamp + int64_t curtime; + if (!DBWithTTL::GetCurrentTime(curtime).ok()) { + Log(logger, "Error: Could not get current time to be attached internally " + "to the new value."); + return false; + } else { + char ts_string[ts_len]; + EncodeFixed32(ts_string, (int32_t)curtime); + new_value->append(ts_string, ts_len); + return true; + } + + } + + virtual const char* Name() const override { + return "Merge By TTL"; + } + + private: + std::shared_ptr user_merge_op_; +}; + +} diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc new file mode 100644 index 00000000..78912872 --- /dev/null +++ b/utilities/ttl/ttl_test.cc @@ -0,0 +1,504 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "rocksdb/compaction_filter.h" +#include "utilities/utility_db.h" +#include "util/testharness.h" +#include "util/logging.h" +#include +#include + +namespace rocksdb { + +namespace { + +typedef std::map KVMap; + +enum BatchOperation { + PUT = 0, + DELETE = 1 +}; + +} + +class TtlTest { + public: + TtlTest() { + dbname_ = test::TmpDir() + "/db_ttl"; + options_.create_if_missing = true; + // ensure that compaction is kicked in to always strip timestamp from kvs + options_.max_grandparent_overlap_factor = 0; + // compaction should take place always from level0 for determinism + options_.max_mem_compaction_level = 0; + db_ttl_ = nullptr; + DestroyDB(dbname_, Options()); + } + + ~TtlTest() { + CloseTtl(); + DestroyDB(dbname_, Options()); + } + + // Open database with TTL support when TTL not provided with db_ttl_ pointer + void OpenTtl() { + assert(db_ttl_ == nullptr); // db should be closed before opening again + ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_)); + } + + // Open database with TTL support when TTL provided with db_ttl_ pointer + void OpenTtl(int32_t ttl) { + assert(db_ttl_ == nullptr); + ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_, ttl)); + } + + // Open with TestFilter compaction filter + void OpenTtlWithTestCompaction(int32_t ttl) { + options_.compaction_filter_factory = + std::shared_ptr( + new TestFilterFactory(kSampleSize_, kNewValue_)); + OpenTtl(ttl); + } + + // Open database with TTL support in read_only mode + void OpenReadOnlyTtl(int32_t ttl) { + assert(db_ttl_ == nullptr); + ASSERT_OK(UtilityDB::OpenTtlDB(options_, dbname_, &db_ttl_, ttl, true)); + } + + void CloseTtl() { + delete db_ttl_; + db_ttl_ = nullptr; + } + + // Populates and returns a kv-map + void MakeKVMap(int64_t num_entries) { + kvmap_.clear(); + int digits = 1; + for (int dummy = num_entries; dummy /= 10 ; ++digits); + int digits_in_i = 1; + for (int64_t i = 0; i < num_entries; i++) { + std::string key = "key"; + std::string value = 
"value"; + if (i % 10 == 0) { + digits_in_i++; + } + for(int j = digits_in_i; j < digits; j++) { + key.append("0"); + value.append("0"); + } + AppendNumberTo(&key, i); + AppendNumberTo(&value, i); + kvmap_[key] = value; + } + ASSERT_EQ((int)kvmap_.size(), num_entries);//check all insertions done + } + + // Makes a write-batch with key-vals from kvmap_ and 'Write''s it + void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) { + assert(num_ops <= (int)kvmap_.size()); + static WriteOptions wopts; + static FlushOptions flush_opts; + WriteBatch batch; + kv_it_ = kvmap_.begin(); + for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) { + switch (batch_ops[i]) { + case PUT: + batch.Put(kv_it_->first, kv_it_->second); + break; + case DELETE: + batch.Delete(kv_it_->first); + break; + default: + assert(false); + } + } + db_ttl_->Write(wopts, &batch); + db_ttl_->Flush(flush_opts); + } + + // Puts num_entries starting from start_pos_map from kvmap_ into the database + void PutValues(int start_pos_map, int num_entries, bool flush = true) { + assert(db_ttl_); + ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size()); + static WriteOptions wopts; + static FlushOptions flush_opts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, start_pos_map); + for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) { + ASSERT_OK(db_ttl_->Put(wopts, kv_it_->first, kv_it_->second)); + } + // Put a mock kv at the end because CompactionFilter doesn't delete last key + ASSERT_OK(db_ttl_->Put(wopts, "keymock", "valuemock")); + if (flush) { + db_ttl_->Flush(flush_opts); + } + } + + // Runs a manual compaction + void ManualCompact() { + db_ttl_->CompactRange(nullptr, nullptr); + } + + // checks the whole kvmap_ to return correct values using KeyMayExist + void SimpleKeyMayExistCheck() { + static ReadOptions ropts; + bool value_found; + std::string val; + for(auto &kv : kvmap_) { + bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found); + if 
(ret == false || value_found == false) { + fprintf(stderr, "KeyMayExist could not find key=%s in the database but" + " should have\n", kv.first.c_str()); + assert(false); + } else if (val.compare(kv.second) != 0) { + fprintf(stderr, " value for key=%s present in database is %s but" + " should be %s\n", kv.first.c_str(), val.c_str(), + kv.second.c_str()); + assert(false); + } + } + } + + // Sleeps for slp_tim then runs a manual compaction + // Checks span starting from st_pos from kvmap_ in the db and + // Gets should return true if check is true and false otherwise + // Also checks that value that we got is the same as inserted; and =kNewValue + // if test_compaction_change is true + void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true, + bool test_compaction_change = false) { + assert(db_ttl_); + sleep(slp_tim); + ManualCompact(); + static ReadOptions ropts; + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + std::string v; + for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) { + Status s = db_ttl_->Get(ropts, kv_it_->first, &v); + if (s.ok() != check) { + fprintf(stderr, "key=%s ", kv_it_->first.c_str()); + if (!s.ok()) { + fprintf(stderr, "is absent from db but was expected to be present\n"); + } else { + fprintf(stderr, "is present in db but was expected to be absent\n"); + } + assert(false); + } else if (s.ok()) { + if (test_compaction_change && v.compare(kNewValue_) != 0) { + fprintf(stderr, " value for key=%s present in database is %s but " + " should be %s\n", kv_it_->first.c_str(), v.c_str(), + kNewValue_.c_str()); + assert(false); + } else if (!test_compaction_change && v.compare(kv_it_->second) !=0) { + fprintf(stderr, " value for key=%s present in database is %s but " + " should be %s\n", kv_it_->first.c_str(), v.c_str(), + kv_it_->second.c_str()); + assert(false); + } + } + } + } + + // Similar as SleepCompactCheck but uses TtlIterator to read from db + void SleepCompactCheckIter(int slp, int st_pos, int span, 
bool check=true) { + assert(db_ttl_); + sleep(slp); + ManualCompact(); + static ReadOptions ropts; + Iterator *dbiter = db_ttl_->NewIterator(ropts); + kv_it_ = kvmap_.begin(); + advance(kv_it_, st_pos); + + dbiter->Seek(kv_it_->first); + if (!check) { + if (dbiter->Valid()) { + ASSERT_NE(dbiter->value().compare(kv_it_->second), 0); + } + } else { // dbiter should have found out kvmap_[st_pos] + for (int i = st_pos; + kv_it_ != kvmap_.end() && i < st_pos + span; + i++, kv_it_++) { + ASSERT_TRUE(dbiter->Valid()); + ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); + dbiter->Next(); + } + } + delete dbiter; + } + + class TestFilter : public CompactionFilter { + public: + TestFilter(const int64_t kSampleSize, const std::string kNewValue) + : kSampleSize_(kSampleSize), + kNewValue_(kNewValue) { + } + + // Works on keys of the form "key" + // Drops key if number at the end of key is in [0, kSampleSize_/3), + // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3), + // Change value if it is in [2*kSampleSize_/3, kSampleSize_) + // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5... 
+ virtual bool Filter(int level, const Slice& key, + const Slice& value, std::string* new_value, + bool* value_changed) const override { + assert(new_value != nullptr); + + std::string search_str = "0123456789"; + std::string key_string = key.ToString(); + size_t pos = key_string.find_first_of(search_str); + int num_key_end; + if (pos != std::string::npos) { + num_key_end = stoi(key_string.substr(pos, key.size() - pos)); + } else { + return false; // Keep keys not matching the format "key" + } + + int partition = kSampleSize_ / 3; + if (num_key_end < partition) { + return true; + } else if (num_key_end < partition * 2) { + return false; + } else { + *new_value = kNewValue_; + *value_changed = true; + return false; + } + } + + virtual const char* Name() const override { + return "TestFilter"; + } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + class TestFilterFactory : public CompactionFilterFactory { + public: + TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue) + : kSampleSize_(kSampleSize), + kNewValue_(kNewValue) { + } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr( + new TestFilter(kSampleSize_, kNewValue_)); + } + + virtual const char* Name() const override { + return "TestFilterFactory"; + } + + private: + const int64_t kSampleSize_; + const std::string kNewValue_; + }; + + + // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer + const int64_t kSampleSize_ = 100; + + private: + std::string dbname_; + StackableDB* db_ttl_; + Options options_; + KVMap kvmap_; + KVMap::iterator kv_it_; + const std::string kNewValue_ = "new_value"; + unique_ptr test_comp_filter_; +}; // class TtlTest + +// If TTL is non positive or not provided, the behaviour is TTL = infinity +// This test opens the db 3 times with such default behavior and inserts a +// bunch of kvs each time. 
All kvs should accumulate in the db till the end +// Partitions the sample-size provided into 3 sets over boundary1 and boundary2 +TEST(TtlTest, NoEffect) { + MakeKVMap(kSampleSize_); + int boundary1 = kSampleSize_ / 3; + int boundary2 = 2 * boundary1; + + OpenTtl(); + PutValues(0, boundary1); //T=0: Set1 never deleted + SleepCompactCheck(1, 0, boundary1); //T=1: Set1 still there + CloseTtl(); + + OpenTtl(0); + PutValues(boundary1, boundary2 - boundary1); //T=1: Set2 never deleted + SleepCompactCheck(1, 0, boundary2); //T=2: Sets1 & 2 still there + CloseTtl(); + + OpenTtl(-1); + PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted + SleepCompactCheck(1, 0, kSampleSize_, true); //T=4: Sets 1,2,3 still there + CloseTtl(); +} + +// Puts a set of values and checks its presence using Get during ttl +TEST(TtlTest, PresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_, true); // T=1:Set1 should still be there + CloseTtl(); +} + +// Puts a set of values and checks its absence using Get after ttl +TEST(TtlTest, AbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there + CloseTtl(); +} + +// Resets the timestamp of a set of kvs by updating them and checks that they +// are not deleted according to the old timestamp +TEST(TtlTest, ResetTimestamp) { + MakeKVMap(kSampleSize_); + + OpenTtl(3); + PutValues(0, kSampleSize_); // T=0: Insert Set1. Delete at t=3 + sleep(2); // T=2 + PutValues(0, kSampleSize_); // T=2: Insert Set1. 
Delete at t=5 + SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there + CloseTtl(); +} + +// Similar to PresentDuringTTL but uses Iterator +TEST(TtlTest, IterPresentDuringTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + SleepCompactCheckIter(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Similar to AbsentAfterTTL but uses Iterator +TEST(TtlTest, IterAbsentAfterTTL) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST(TtlTest, MultiOpenSamePresent) { + MakeKVMap(kSampleSize_); + + OpenTtl(2); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + CloseTtl(); + + OpenTtl(2); // T=0. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_); // T=1: Set should be there + CloseTtl(); +} + +// Checks absence while opening the same db more than once with the same ttl +// Note: The second open will open the same db +TEST(TtlTest, MultiOpenSameAbsent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + CloseTtl(); + + OpenTtl(1); // T=0.Delete at t=1 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there + CloseTtl(); +} + +// Checks presence while opening the same db more than once with bigger ttl +TEST(TtlTest, MultiOpenDifferent) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); + PutValues(0, kSampleSize_); // T=0: Insert. 
Delete at t=1 + CloseTtl(); + + OpenTtl(3); // T=0: Set deleted at t=3 + SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there + CloseTtl(); +} + +// Checks presence during ttl in read_only mode +TEST(TtlTest, ReadOnlyPresentForever) { + MakeKVMap(kSampleSize_); + + OpenTtl(1); // T=0:Open the db normally + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + CloseTtl(); + + OpenReadOnlyTtl(1); + SleepCompactCheck(2, 0, kSampleSize_); // T=2:Set1 should still be there + CloseTtl(); +} + +// Checks whether WriteBatch works well with TTL +// Puts all kvs in kvmap_ in a batch and writes first, then deletes first half +TEST(TtlTest, WriteBatchTest) { + MakeKVMap(kSampleSize_); + BatchOperation batch_ops[kSampleSize_]; + for (int i = 0; i < kSampleSize_; i++) { + batch_ops[i] = PUT; + } + + OpenTtl(2); + MakePutWriteBatch(batch_ops, kSampleSize_); + for (int i = 0; i < kSampleSize_ / 2; i++) { + batch_ops[i] = DELETE; + } + MakePutWriteBatch(batch_ops, kSampleSize_ / 2); + SleepCompactCheck(0, 0, kSampleSize_ / 2, false); + SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2); + CloseTtl(); +} + +// Checks user's compaction filter for correctness with TTL logic +TEST(TtlTest, CompactionFilter) { + MakeKVMap(kSampleSize_); + + OpenTtlWithTestCompaction(1); + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there + SleepCompactCheck(2, 0, kSampleSize_, false); + CloseTtl(); + + OpenTtlWithTestCompaction(3); + PutValues(0, kSampleSize_); // T=0:Insert Set1. 
+ int partition = kSampleSize_ / 3; + SleepCompactCheck(1, 0, partition, false); // Part dropped + SleepCompactCheck(0, partition, partition); // Part kept + SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed + CloseTtl(); +} + +// Insert some key-values which KeyMayExist should be able to get and check that +// values returned are fine +TEST(TtlTest, KeyMayExist) { + MakeKVMap(kSampleSize_); + + OpenTtl(); + PutValues(0, kSampleSize_, false); + + SimpleKeyMayExistCheck(); + + CloseTtl(); +} + +} // namespace rocksdb + +// A black-box test for the ttl wrapper around rocksdb +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +}