From ef17972992807f57ed6ac5dbe49f2902d0127d3d Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Fri, 27 Sep 2019 19:37:11 +0200
Subject: [PATCH] bloom-filter: Improve test cases

The integer bloom filter test cases do not really match typical usage
of the bloom filter in actual Ceph code.  In particular:

- the tests use consecutive integer ranges, while Ceph code uses
  hash values uniformly distributed over the uint32_t space;

- the tests pass "int" to the insert and contains functions, which
  causes the generic C++ POD type overload to be selected instead
  of the uint32_t overload that is used by Ceph code.  The POD
  overload is dependent on host byte order, and behaves actually
  different from the uint32_t overload on little-endian systems.

To fix these issues, this patch changes the integer tests to
always pass in uint32_t (instead of int), and to use results
of a pseudo-random number generator instead of consecutive
sequences.  (We assume the period of the generator is long
enough that all values generated within one test instance
are distinct.)

This not only makes the test pass on both big- and little-endian
hosts now, but it also allows tightening of the allowable actual
false positive rates, as they now match much closer the expected
values.

Signed-off-by: Ulrich Weigand <ulrich.weigand@de.ibm.com>
---
 src/test/common/test_bloom_filter.cc | 38 ++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc
index d360406e8ec..9eb45689b5a 100644
--- a/src/test/common/test_bloom_filter.cc
+++ b/src/test/common/test_bloom_filter.cc
@@ -28,7 +28,7 @@ TEST(BloomFilter, Basic) {
 TEST(BloomFilter, Empty) {
   bloom_filter bf;
   for (int i=0; i<100; ++i) {
-    ASSERT_FALSE(bf.contains(i));
+    ASSERT_FALSE(bf.contains((uint32_t) i));
     ASSERT_FALSE(bf.contains(stringify(i)));
   }
 }
@@ -74,6 +74,7 @@ TEST(BloomFilter, Sweep) {
 }
 
 TEST(BloomFilter, SweepInt) {
+  unsigned int seed = 0;
   std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
   std::cout.precision(5);
   std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl;
@@ -87,13 +88,21 @@ TEST(BloomFilter, SweepInt) {
       ASSERT_TRUE(123);
       ASSERT_TRUE(456);
 
+      // In Ceph code, the uint32_t input routines to the bloom filter
+      // are used with hash values that are uniformly distributed over
+      // the uint32_t range.  To model this behavior in the test, we
+      // pass in values generated by a pseudo-random generator.
+      // To make the test reproducible anyway, use a fixed seed here,
+      // but a different one in each instance.
+      srand(seed++);
+
       for (int n = 0; n < max; n++)
-	bf.insert(n);
+	bf.insert((uint32_t) rand());
 
       int test = max * 100;
       int hit = 0;
       for (int n = 0; n < test; n++)
-	if (bf.contains(100000 + n))
+	if (bf.contains((uint32_t) rand()))
 	  hit++;
 
       ASSERT_TRUE(123);
@@ -108,8 +117,8 @@ TEST(BloomFilter, SweepInt) {
 
       std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert
 		<< "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl;
-      ASSERT_TRUE(actual < fpp * 10);
-      ASSERT_TRUE(actual > fpp / 10);
+      ASSERT_TRUE(actual < fpp * 3);
+      ASSERT_TRUE(actual > fpp / 3);
       ASSERT_TRUE(bf.density() > 0.40);
       ASSERT_TRUE(bf.density() < 0.60);
     }
@@ -118,6 +127,7 @@ TEST(BloomFilter, SweepInt) {
 
 
 TEST(BloomFilter, CompressibleSweep) {
+  unsigned int seed = 0;
   std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
   std::cout.precision(5);
   std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n";
@@ -125,21 +135,29 @@ TEST(BloomFilter, CompressibleSweep) {
   int max = 1024;
   for (int div = 1; div < 10; div++) {
     compressible_bloom_filter bf(max, fpp, 1);
+
+    // See the comment in SweepInt.
+    srand(seed++);
+
+    std::vector<uint32_t> values;
     int t = max/div;
-    for (int n = 0; n < t; n++)
-      bf.insert(n);
+    for (int n = 0; n < t; n++) {
+      uint32_t val = (uint32_t) rand();
+      bf.insert(val);
+      values.push_back(val);
+    }
 
     unsigned est = bf.approx_unique_element_count();
     if (div > 1)
       bf.compress(1.0 / div);
 
-    for (int n = 0; n < t; n++)
-      ASSERT_TRUE(bf.contains(n));
+    for (auto val : values)
+      ASSERT_TRUE(bf.contains(val));
 
     int test = max * 100;
     int hit = 0;
     for (int n = 0; n < test; n++)
-      if (bf.contains(100000 + n))
+      if (bf.contains((uint32_t) rand()))
 	hit++;
 
     double actual = (double)hit / (double)test;
-- 
2.39.5