From ef17972992807f57ed6ac5dbe49f2902d0127d3d Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Fri, 27 Sep 2019 19:37:11 +0200 Subject: [PATCH] bloom-filter: Improve test cases The integer bloom filter test cases do not really match typical usage of the bloom filter in actual Ceph code. In particular: - the tests use consecutive integer ranges, while Ceph code uses hash values uniformly distributed over the uint32_t space; - the tests pass "int" to the insert and contains functions, which causes the generic C++ POD type overload to be selected instead of the uint32_t overload that is used by Ceph code. The POD overload is dependent on host byte order, and behaves actually different from the uint32_t overload on little-endian systems. To fix these issues, this patch changes the integer tests to always pass in uint32_t (instead of int), and to use results of a pseudo-random number generator instead of consecutive sequences. (We assume the period of the generator is long enough that all values generated within one test instance are distinct.) This not only makes the test pass on both big- and little-endian hosts now, but it also allows tightening of the allowable actual false positive rates, as they now match much closer the expected values. Signed-off-by: Ulrich Weigand --- src/test/common/test_bloom_filter.cc | 38 ++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index d360406e8ec..9eb45689b5a 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -28,7 +28,7 @@ TEST(BloomFilter, Basic) { TEST(BloomFilter, Empty) { bloom_filter bf; for (int i=0; i<100; ++i) { - ASSERT_FALSE(bf.contains(i)); + ASSERT_FALSE(bf.contains((uint32_t) i)); ASSERT_FALSE(bf.contains(stringify(i))); } } @@ -74,6 +74,7 @@ TEST(BloomFilter, Sweep) { } TEST(BloomFilter, SweepInt) { + unsigned int seed = 0; std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); std::cout.precision(5); std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl; @@ -87,13 +88,21 @@ TEST(BloomFilter, SweepInt) { ASSERT_TRUE(123); ASSERT_TRUE(456); + // In Ceph code, the uint32_t input routines to the bloom filter + // are used with hash values that are uniformly distributed over + // the uint32_t range. To model this behavior in the test, we + // pass in values generated by a pseudo-random generator. + // To make the test reproducible anyway, use a fixed seed here, + // but a different one in each instance. + srand(seed++); + for (int n = 0; n < max; n++) - bf.insert(n); + bf.insert((uint32_t) rand()); int test = max * 100; int hit = 0; for (int n = 0; n < test; n++) - if (bf.contains(100000 + n)) + if (bf.contains((uint32_t) rand())) hit++; ASSERT_TRUE(123); @@ -108,8 +117,8 @@ TEST(BloomFilter, SweepInt) { std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl; - ASSERT_TRUE(actual < fpp * 10); - ASSERT_TRUE(actual > fpp / 10); + ASSERT_TRUE(actual < fpp * 3); + ASSERT_TRUE(actual > fpp / 3); ASSERT_TRUE(bf.density() > 0.40); ASSERT_TRUE(bf.density() < 0.60); } @@ -118,6 +127,7 @@ TEST(BloomFilter, SweepInt) { TEST(BloomFilter, CompressibleSweep) { + unsigned int seed = 0; std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); std::cout.precision(5); std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n"; @@ -125,21 +135,29 @@ TEST(BloomFilter, CompressibleSweep) { int max = 1024; for (int div = 1; div < 10; div++) { compressible_bloom_filter bf(max, fpp, 1); + + // See the comment in SweepInt. + srand(seed++); + + std::vector values; int t = max/div; - for (int n = 0; n < t; n++) - bf.insert(n); + for (int n = 0; n < t; n++) { + uint32_t val = (uint32_t) rand(); + bf.insert(val); + values.push_back(val); + } unsigned est = bf.approx_unique_element_count(); if (div > 1) bf.compress(1.0 / div); - for (int n = 0; n < t; n++) - ASSERT_TRUE(bf.contains(n)); + for (auto val : values) + ASSERT_TRUE(bf.contains(val)); int test = max * 100; int hit = 0; for (int n = 0; n < test; n++) - if (bf.contains(100000 + n)) + if (bf.contains((uint32_t) rand())) hit++; double actual = (double)hit / (double)test; -- 2.39.5