]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Add universal compaction to db_stress nightly build 2.2.fb v2.2
author Xing Jin <xjin@fb.com>
Wed, 21 Aug 2013 00:37:49 +0000 (17:37 -0700)
committer Xing Jin <xjin@fb.com>
Wed, 21 Aug 2013 00:37:49 +0000 (17:37 -0700)
Summary:
Most of the code changes in this diff are cleanup/rewrite. The logic changes are:

(1) add universal compaction to db_crashtest2.py
(2) randomly set --test_batches_snapshots to 0 or 1 in db_crashtest2.py. The old code always used 1.
(3) use a different tmp directory as the db directory in each run. I saw some intermittent errors in my local tests; using a different tmp directory for each run seems to solve the issue.

Test Plan: Ran "make crashtest" multiple times. Also ran "make all check".

Reviewers: emayanke, dhruba, haobo

Reviewed By: emayanke

Differential Revision: https://reviews.facebook.net/D12369

tools/db_crashtest.py
tools/db_crashtest2.py

index 1e4eb666409afa7de11e522ca1d5eea5d3d6c4f8..1250198da23ea1983b27c299f4bdc90622ed4564 100644 (file)
@@ -1,5 +1,6 @@
 #! /usr/bin/env python
 import os
+import re
 import sys
 import time
 import random
@@ -8,18 +9,16 @@ import logging
 import tempfile
 import subprocess
 
-# This python script runs and kills db_stress multiple times with
-# test-batches-snapshot ON,
-# total operations much less than the total keys, and
-# a high read percentage.
-# This checks consistency in case of unsafe crashes in  Rocksdb
+# This script runs and kills db_stress multiple times. It checks consistency
+# in case of unsafe crashes in Rocksdb.
 
 def main(argv):
     try:
         opts, args = getopt.getopt(argv, "hd:t:i:o:b:")
     except getopt.GetoptError:
         print("db_crashtest.py -d <duration_test> -t <#threads> "
-              "-i <interval for one run> -o <ops_per_thread>\n")
+              "-i <interval for one run> -o <ops_per_thread> "
+              "-b <write_buffer_size>\n")
         sys.exit(2)
 
     # default values, will be overridden by cmdline args
@@ -36,15 +35,15 @@ def main(argv):
                   " -t <#threads> -i <interval for one run>"
                   " -o <ops_per_thread> -b <write_buffer_size>\n")
             sys.exit()
-        elif opt == ("-d"):
+        elif opt == "-d":
             duration = int(arg)
-        elif opt == ("-t"):
+        elif opt == "-t":
             threads = int(arg)
-        elif opt == ("-i"):
+        elif opt == "-i":
             interval = int(arg)
-        elif opt == ("-o"):
+        elif opt == "-o":
             ops_per_thread = int(arg)
-        elif opt == ("-b"):
+        elif opt == "-b":
             write_buf_size = int(arg)
         else:
             print("db_crashtest.py -d <duration_test>"
@@ -54,8 +53,6 @@ def main(argv):
 
     exit_time = time.time() + duration
 
-    dirpath = tempfile.mkdtemp()
-
     print("Running blackbox-crash-test with \ninterval_between_crash="
           + str(interval) + "\ntotal-duration=" + str(duration)
           + "\nthreads=" + str(threads) + "\nops_per_thread="
@@ -64,62 +61,75 @@ def main(argv):
 
     while time.time() < exit_time:
         run_had_errors = False
-        additional_opts = ' --disable_seek_compaction=' + \
-                          str(random.randint(0, 1)) + \
-                          ' --mmap_read=' + str(random.randint(0, 1)) + \
-                          ' --block_size=16384 ' + \
-                          ' --cache_size=1048576 ' + \
-                          ' --open_files=500000 ' + \
-                          ' --verify_checksum=1 ' + \
-                          ' --sync=' + str(random.randint(0, 1)) + \
-                          ' --disable_wal=0 ' + \
-                          ' --disable_data_sync=' + \
-                          str(random.randint(0, 1)) + \
-                          ' --target_file_size_base=2097152 ' + \
-                          ' --target_file_size_multiplier=2 ' + \
-                          ' --max_write_buffer_number=3 ' + \
-                          ' --max_background_compactions=20 ' + \
-                          ' --max_bytes_for_level_base=10485760 ' + \
-                          ' --filter_deletes=' + str(random.randint(0, 1))
         killtime = time.time() + interval
-        child = subprocess.Popen(['./db_stress \
-                        --test_batches_snapshots=1 \
-                        --ops_per_thread=0' + str(ops_per_thread) + ' \
-                        --threads=0' + str(threads) + ' \
-                        --write_buffer_size=' + str(write_buf_size) + '\
-                        --destroy_db_initially=0 \
-                        --reopen=0 \
-                        --readpercent=50 \
-                        --prefixpercent=5 \
-                        --writepercent=40 \
-                        --delpercent=5 \
-                        --db=' + dirpath + '\
-                        --max_key=100000000 ' + additional_opts],
+
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=1
+            --ops_per_thread=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=0
+            --readpercent=50
+            --prefixpercent=5
+            --writepercent=40
+            --delpercent=5
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=%s
+            --disable_wal=0
+            --disable_data_sync=%s
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            """ % (ops_per_thread,
+                   threads,
+                   write_buf_size,
+                   tempfile.mkdtemp(),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1)))
+
+        child = subprocess.Popen([cmd],
                                  stderr=subprocess.PIPE, shell=True)
-        print("Running db_stress with pid=%d and additional options=\n"
-              % child.pid + additional_opts + "\n")
-        time.sleep(interval)
-        while True:
-            if time.time() > killtime:
-                if child.poll() is not None:
-                    print("WARNING: db_stress ended before kill\n")
-                else:
-                    child.kill()
-                    print("KILLED %d\n" % child.pid)
-                    time.sleep(1)  # time to stabilize after a kill
+        print("Running db_stress with pid=%d: %s\n\n"
+              % (child.pid, cmd))
+
+        while time.time() < killtime:
+            time.sleep(10)
 
-                while True:
-                    line = child.stderr.readline().strip()
-                    if line != '':
-                        run_had_errors = True
-                        print('***' + line + '^')
-                    else:
-                        break
-                if run_had_errors:
-                    sys.exit(2)
+        if child.poll() is not None:
+            print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                  % child.returncode)
+        else:
+            child.kill()
+            print("KILLED %d\n" % child.pid)
+            time.sleep(1)  # time to stabilize after a kill
+
+        while True:
+            line = child.stderr.readline().strip()
+            if line != '':
+                run_had_errors = True
+                print('***' + line + '^')
+            else:
                 break
 
-            time.sleep(1)  # time to stabilize before the next run
+        if run_had_errors:
+            sys.exit(2)
+
+        time.sleep(1)  # time to stabilize before the next run
 
 if __name__ == "__main__":
     sys.exit(main(sys.argv[1:]))
index a158574e65c86d7c8a79743aedac8cd029a74bfa..5830bbe0aae007f847b163ff7a6cce3bd2e719fb 100644 (file)
@@ -1,5 +1,6 @@
 #! /usr/bin/env python
 import os
+import re
 import sys
 import time
 import random
@@ -8,24 +9,22 @@ import logging
 import tempfile
 import subprocess
 
-# This python script runs db_stress multiple times with kill_random_test
-# that causes leveldb to crash at various points in code.
-# It also has test-batches-snapshot ON so that basic atomic/consistency
-# checks can be performed.
-#
+# This python script runs db_stress multiple times. Some runs use
+# kill_random_test, which causes leveldb to crash at various points in code.
+
 def main(argv):
     try:
         opts, args = getopt.getopt(argv, "hd:t:k:o:b:")
     except getopt.GetoptError:
         print str(getopt.GetoptError)
         print "db_crashtest2.py -d <duration_test> -t <#threads> " \
-            "-k <kills with prob 1/k> -o <ops_per_thread> "\
-            "-b <write_buffer_size>\n"
+              "-k <kills with prob 1/k> -o <ops_per_thread> "\
+              "-b <write_buffer_size>\n"
         sys.exit(2)
 
     # default values, will be overridden by cmdline args
     kill_random_test = 97  # kill with probability 1/97 by default
-    duration = 6000  # total time for this script to test db_stress
+    duration = 10000  # total time for this script to test db_stress
     threads = 32
     ops_per_thread = 200000
     write_buf_size = 4 * 1024 * 1024
@@ -33,93 +32,101 @@ def main(argv):
     for opt, arg in opts:
         if opt == '-h':
             print "db_crashtest2.py -d <duration_test> -t <#threads> " \
-                "-k <kills with prob 1/k> -o <ops_per_thread> "\
-                "-b <write_buffer_size>\n"
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
             sys.exit()
-        elif opt == ("-d"):
+        elif opt == "-d":
             duration = int(arg)
-        elif opt == ("-t"):
+        elif opt == "-t":
             threads = int(arg)
-        elif opt == ("-k"):
+        elif opt == "-k":
             kill_random_test = int(arg)
-        elif opt == ("-i"):
-            interval = int(arg)
-        elif opt == ("-o"):
+        elif opt == "-o":
             ops_per_thread = int(arg)
-        elif opt == ("-b"):
+        elif opt == "-b":
             write_buf_size = int(arg)
         else:
             print "unrecognized option " + str(opt) + "\n"
             print "db_crashtest2.py -d <duration_test> -t <#threads> " \
-                "-k <kills with prob 1/k> -o <ops_per_thread> " \
-                "-b <write_buffer_size>\n"
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
             sys.exit(2)
 
     exit_time = time.time() + duration
 
-    dirpath = tempfile.mkdtemp()
-
-    print("Running whitebox-crash-test with \ntotal-duration=" + str(duration)
-          + "\nthreads=" + str(threads) + "\nops_per_thread="
-          + str(ops_per_thread) + "\nwrite_buffer_size="
-          + str(write_buf_size) + "\n")
+    print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
+          + "\nthreads=" + str(threads) + "\nops_per_thread=" \
+          + str(ops_per_thread) + "\nwrite_buffer_size=" \
+          + str(write_buf_size) + "\n"
 
-    # kill in every alternate run. toggle tracks which run we are doing.
-    toggle = True
+    total_check_mode = 3
+    check_mode = 0
 
     while time.time() < exit_time:
-        run_had_errors = False
-        additional_opts = ' --disable_seek_compaction=' + \
-                          str(random.randint(0, 1)) + \
-                          ' --mmap_read=' + str(random.randint(0, 1)) + \
-                          ' --block_size=16384 ' + \
-                          ' --cache_size=1048576 ' + \
-                          ' --open_files=500000 ' + \
-                          ' --verify_checksum=1 ' + \
-                          ' --sync=' + str(random.randint(0, 1)) + \
-                          ' --disable_wal=0 ' + \
-                          ' --disable_data_sync=' + \
-                          str(random.randint(0, 1)) + \
-                          ' --target_file_size_base=2097152 ' + \
-                          ' --target_file_size_multiplier=2 ' + \
-                          ' --max_write_buffer_number=3 ' + \
-                          ' --max_background_compactions=20 ' + \
-                          ' --max_bytes_for_level_base=10485760 ' + \
-                          ' --filter_deletes=' + str(random.randint(0, 1))
-        print ("Running db_stress with additional options=\n"
-               + additional_opts + "\n")
-
-        if toggle:
-            # since we are going to kill anyway, use more ops per thread
-            new_ops_per_thread = 100 * ops_per_thread
-            killoption = '--kill_random_test=' + str(kill_random_test)
+        killoption = ""
+        if check_mode == 0:
+            # run with kill_random_test
+            killoption = " --kill_random_test=" + str(kill_random_test)
+            # use large ops per thread since we will kill it anyway
+            additional_opts = "--ops_per_thread=" + \
+                              str(100 * ops_per_thread) + killoption
+        elif check_mode == 1:
+            # normal run with universal compaction mode
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
+                              " --compaction_style=1"
         else:
-            new_ops_per_thread = ops_per_thread
-            killoption = ''
-
-        toggle = not toggle
-
-        cmd = ['./db_stress \
-                --test_batches_snapshots=1 \
-                --ops_per_thread=0' + str(new_ops_per_thread) + ' \
-                --threads=0' + str(threads) + ' \
-                --write_buffer_size=' + str(write_buf_size) + ' \
-                --destroy_db_initially=0 ' + killoption + ' \
-                --reopen=0 \
-                --readpercent=50 \
-                --prefixpercent=5 \
-                --writepercent=40 \
-                --delpercent=5 \
-                --db=' + dirpath + ' \
-                --max_key=100000000 ' + additional_opts]
-
-        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+            # normal run
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread)
+
+        cmd = re.sub('\s+', ' ', """
+            ./db_stress
+            --test_batches_snapshots=%s
+            --threads=%s
+            --write_buffer_size=%s
+            --destroy_db_initially=0
+            --reopen=0
+            --readpercent=50
+            --prefixpercent=5
+            --writepercent=40
+            --delpercent=5
+            --db=%s
+            --max_key=100000000
+            --disable_seek_compaction=%s
+            --mmap_read=%s
+            --block_size=16384
+            --cache_size=1048576
+            --open_files=500000
+            --verify_checksum=1
+            --sync=%s
+            --disable_wal=0
+            --disable_data_sync=%s
+            --target_file_size_base=2097152
+            --target_file_size_multiplier=2
+            --max_write_buffer_number=3
+            --max_background_compactions=20
+            --max_bytes_for_level_base=10485760
+            --filter_deletes=%s
+            %s
+            """ % (random.randint(0, 1),
+                   threads,
+                   write_buf_size,
+                   tempfile.mkdtemp(),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   random.randint(0, 1),
+                   additional_opts))
+
+        print "Running:" + cmd + "\n"
+
+        popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  shell=True)
         stdoutdata, stderrdata = popen.communicate()
         retncode = popen.returncode
-        msg = ("kill option = {0}, exitcode = {1}".format(
-               killoption, retncode))
+        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
+               check_mode, killoption, retncode))
         print msg
         print stdoutdata
 
@@ -146,6 +153,9 @@ def main(argv):
         if (stdoutdata.find('fail') >= 0):
             print "TEST FAILED. Output has 'fail'!!!\n"
             sys.exit(2)
+
+        check_mode = (check_mode + 1) % total_check_mode
+
         time.sleep(1)  # time to stabilize after a kill
 
 if __name__ == "__main__":