Introduce library-independent default compression level

author Andrew Kryczka <andrewkr@fb.com>

Thu, 24 May 2018 01:33:00 +0000 (18:33 -0700)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Thu, 24 May 2018 01:42:08 +0000 (18:42 -0700)
author Andrew Kryczka <andrewkr@fb.com>
Thu, 24 May 2018 01:33:00 +0000 (18:33 -0700)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Thu, 24 May 2018 01:42:08 +0000 (18:42 -0700)
diff --git a/HISTORY.md b/HISTORY.md

index c340d3fee4a326f007540a0a6972f998dd0e2265..5c9228cd1abd51e68d3cf29df818b4e6f99d4bbd 100644 (file)
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -13,6 +13,7 @@
  * Now, `DBOptions::use_direct_io_for_flush_and_compaction` only applies to background writes, and `DBOptions::use_direct_reads` applies to both user reads and background reads. This conforms with Linux's `open(2)` manpage, which advises against simultaneously reading a file in buffered and direct modes, due to possibly undefined behavior and degraded performance.
  * Iterator::Valid() always returns false if !status().ok(). So, now when doing a Seek() followed by some Next()s, there's no need to check status() after every operation.
  * Iterator::Seek()/SeekForPrev()/SeekToFirst()/SeekToLast() always resets status().
+* Introduced `CompressionOptions::kDefaultCompressionLevel`, which is a generic way to tell RocksDB to use the compression library's default level. It is now the default value for `CompressionOptions::level`. Previously the level defaulted to -1, which gave poor compression ratios in ZSTD.
  
  ### New Features
  * Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data.
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h

index 24b65d93e7b211bcf69f658b6da2258be652ad77..9848ff5238459a0b44c80b0ba377d16bc0ee7388 100644 (file)
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -87,6 +87,14 @@ struct CompactionOptionsFIFO {
  
  // Compression options for different compression algorithms like Zlib
  struct CompressionOptions {
+  // RocksDB's generic default compression level. Internally it'll be translated
+  // to the default compression level specific to the library being used (see
+  // comment above `ColumnFamilyOptions::compression`).
+  //
+  // The sentinel value is the max signed 16-bit int (32767) because it is
+  // written out to the OPTIONS file, which should be portable.
+  const static int kDefaultCompressionLevel = 32767;
+
    int window_bits;
    int level;
    int strategy;
@@ -120,7 +128,7 @@ struct CompressionOptions {
  
    CompressionOptions()
        : window_bits(-14),
-        level(-1),
+        level(kDefaultCompressionLevel),
          strategy(0),
          max_dict_bytes(0),
          zstd_max_train_bytes(0) {}
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h

index dcf703e64b2fe83a413766ad06639d4b6388cf31..74d7fef0a4caa1201f80aad63af0b8fa3a65acc7 100644 (file)
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -198,11 +198,21 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
    // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
    //    ~200-500MB/s compression
    //    ~400-800MB/s decompression
+  //
    // Note that these speeds are significantly faster than most
    // persistent storage speeds, and therefore it is typically never
    // worth switching to kNoCompression.  Even if the input data is
    // incompressible, the kSnappyCompression implementation will
    // efficiently detect that and will switch to uncompressed mode.
+  //
+  // If you do not set `compression_opts.level`, or set it to
+  // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
+  // default corresponding to `compression` as follows:
+  //
+  // - kZSTD: 3
+  // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
+  // - kLZ4HCCompression: 0
+  // - For all others, we do not specify a compression level
    CompressionType compression;
  
    // Compression algorithm that will be used for the bottommost level that
diff --git a/util/compression.h b/util/compression.h

index 21ed1a8ca2acac22616aaeaf7fa505226850aae4..69df6695f4182743029f48bff2a8cee252325a80 100644 (file)
--- a/util/compression.h
+++ b/util/compression.h
@@ -254,9 +254,15 @@ inline bool Zlib_Compress(const CompressionOptions& opts,
    // memLevel=9 uses maximum memory for optimal speed.
    // The default value is 8. See zconf.h for more details.
    static const int memLevel = 8;
+  int level;
+  if (opts.level == CompressionOptions::kDefaultCompressionLevel) {
+    level = Z_DEFAULT_COMPRESSION;
+  } else {
+    level = opts.level;
+  }
    z_stream _stream;
    memset(&_stream, 0, sizeof(z_stream));
-  int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits,
+  int st = deflateInit2(&_stream, level, Z_DEFLATED, opts.window_bits,
                          memLevel, opts.strategy);
    if (st != Z_OK) {
      return false;
@@ -719,9 +725,15 @@ inline bool LZ4HC_Compress(const CompressionOptions& opts,
    output->resize(static_cast<size_t>(output_header_len + compress_bound));
  
    int outlen;
+  int level;
+  if (opts.level == CompressionOptions::kDefaultCompressionLevel) {
+    level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
+  } else {
+    level = opts.level;
+  }
  #if LZ4_VERSION_NUMBER >= 10400  // r124+
    LZ4_streamHC_t* stream = LZ4_createStreamHC();
-  LZ4_resetStreamHC(stream, opts.level);
+  LZ4_resetStreamHC(stream, level);
    const char* compression_dict_data =
        compression_dict.size() > 0 ? compression_dict.data() : nullptr;
    size_t compression_dict_size = compression_dict.size();
@@ -742,7 +754,7 @@ inline bool LZ4HC_Compress(const CompressionOptions& opts,
  #elif LZ4_VERSION_MAJOR  // r113-r123
    outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len],
                                           static_cast<int>(length),
-                                         compress_bound, opts.level);
+                                         compress_bound, level);
  #else                    // up to r112
    outlen =
        LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len],
@@ -807,15 +819,23 @@ inline bool ZSTD_Compress(const CompressionOptions& opts, const char* input,
    size_t compressBound = ZSTD_compressBound(length);
    output->resize(static_cast<size_t>(output_header_len + compressBound));
    size_t outlen;
+  int level;
+  if (opts.level == CompressionOptions::kDefaultCompressionLevel) {
+    // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
+    // https://github.com/facebook/zstd/issues/1148
+    level = 3;
+  } else {
+    level = opts.level;
+  }
  #if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
    ZSTD_CCtx* context = ZSTD_createCCtx();
    outlen = ZSTD_compress_usingDict(
        context, &(*output)[output_header_len], compressBound, input, length,
-      compression_dict.data(), compression_dict.size(), opts.level);
+      compression_dict.data(), compression_dict.size(), level);
    ZSTD_freeCCtx(context);
  #else   // up to v0.4.x
    outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input,
-                         length, opts.level);
+                         length, level);
  #endif  // ZSTD_VERSION_NUMBER >= 500
    if (outlen == 0) {
      return false;
author	Andrew Kryczka <andrewkr@fb.com>
	Thu, 24 May 2018 01:33:00 +0000 (18:33 -0700)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Thu, 24 May 2018 01:42:08 +0000 (18:42 -0700)
HISTORY.md		patch | blob | history
include/rocksdb/advanced_options.h		patch | blob | history
include/rocksdb/options.h		patch | blob | history
util/compression.h		patch | blob | history