crc32c: Add ppc64le fast zero optimized assembly.

author Andrew Solomon <asolomon@us.ibm.com>

Sun, 14 May 2017 04:52:11 +0000 (04:52 +0000)

committer Andrew Solomon <asolomon@us.ibm.com>

Wed, 31 May 2017 16:52:47 +0000 (16:52 +0000)
author Andrew Solomon <asolomon@us.ibm.com>
Sun, 14 May 2017 04:52:11 +0000 (04:52 +0000)
committer Andrew Solomon <asolomon@us.ibm.com>
Wed, 31 May 2017 16:52:47 +0000 (16:52 +0000)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index 746c4d973285ade10783005adc49f8416590de8c..649a27944ac583939954463d2243c2267a0d8dcb 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -471,6 +471,7 @@ set(libcommon_files
    ${async_rdma_common_srcs}
    ${dpdk_common_srcs}
    msg/msg_types.cc
+  common/reverse.c
    common/hobject.cc
    osd/OSDMap.cc
    osd/OSDMapMapping.cc
@@ -544,7 +545,8 @@ if(HAVE_INTEL)
  elseif(HAVE_POWER8)
    list(APPEND libcommon_files
      common/crc32c_ppc.c
-    common/crc32c_ppc_asm.S)
+    common/crc32c_ppc_asm.S
+    common/crc32c_ppc_fast_zero_asm.S)
  endif(HAVE_INTEL)
  
  if(LINUX)
diff --git a/src/common/crc32c_ppc.c b/src/common/crc32c_ppc.c

index e113ad8e1a6ae1d19a1bd0474351f1077d1071c9..43756e24ef81ce180841041a48ad5fe35b31a6b7 100644 (file)
--- a/src/common/crc32c_ppc.c
+++ b/src/common/crc32c_ppc.c
@@ -7,9 +7,12 @@
   * 2 of the License, or (at your option) any later version.
   */
  #define CRC_TABLE
+#define FAST_ZERO_TABLE
+
  #include "acconfig.h"
  #include "include/int_types.h"
  #include "crc32c_ppc_constants.h"
+#include "reverse.h"
  
  #include <stdlib.h>
  #include <strings.h>
@@ -35,8 +38,38 @@ static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
  }
  #endif
  
-
  #ifdef HAVE_POWER8
+static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b) {
+        vector unsigned int va = {a, 0, 0, 0};
+        vector unsigned int vb = {b, 0, 0, 0};
+        vector unsigned long vt;
+
+        __asm__("vpmsumw %0,%1,%2" : "=v"(vt) : "v"(va), "v"(vb));
+
+        return vt[0];
+}
+
+unsigned int barrett_reduction(unsigned long val);
+
+static inline unsigned int gf_multiply(unsigned int a, unsigned int b) {
+        return barrett_reduction(polynomial_multiply(a, b));
+}
+
+unsigned int append_zeros(unsigned int crc, unsigned long length) {
+        unsigned long i = 0;
+
+        while (length) {
+                if (length & 1) {
+                        crc = gf_multiply(crc, crc_zero[i]);
+                }
+                i++;
+                length /= 2;
+        }
+
+        return crc;
+}
+
+
  unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
                              unsigned long len);
  
@@ -79,19 +112,23 @@ out:
  }
  
  /* This wrapper function works around the fact that crc32_vpmsum 
- * does not gracefully handle the case where the data pointer is NULL.  There
- * may be room for performance improvement here.
+ * does not gracefully handle a NULL data pointer: NULL means "len zero bytes".
   */
  uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
  {
-  unsigned char *buf2;
-
    if (!data) {
-    buf2 = malloc(len);
-    bzero(buf2, len);
-    crc = crc32_vpmsum(crc, buf2, len);
-    free(buf2);
+    /* Handle the NULL buffer case. */
+#ifdef REFLECT
+    crc = reverse_bits(crc);
+#endif
+
+    crc = append_zeros(crc, len);
+
+#ifdef REFLECT
+    crc = reverse_bits(crc);
+#endif
    } else {
+    /* Handle the valid buffer case. */
      crc = crc32_vpmsum(crc, data, (unsigned long)len);
    }
    return crc;
diff --git a/src/common/crc32c_ppc_constants.h b/src/common/crc32c_ppc_constants.h

index 25864f1045efdb5c38b1d6c82a1c8272f0d11936..12a1e1d51fa2a899b7ecd2c084d3cfaa623f4406 100644 (file)
--- a/src/common/crc32c_ppc_constants.h
+++ b/src/common/crc32c_ppc_constants.h
@@ -78,6 +78,77 @@ static const unsigned int crc_table[] = {
         0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,};
  
  #endif
+
+#ifdef FAST_ZERO_TABLE
+/* fast zero table: crc_zero[i] = x^(8 * 2^i) mod P(x), i.e. appends 2^i zero bytes */
+unsigned int crc_zero[] = {
+       0x100,
+       0x10000,
+       0x1edc6f41,
+       0x3aab4576,
+       0x18571d18,
+       0x59a3508a,
+       0xaa97d41d,
+       0xe78dbf1d,
+       0x4ef6a711,
+       0x2506c32e,
+       0x68d4e827,
+       0x546ea6b0,
+       0x465cebac,
+       0x26a86214,
+       0x964aa2fd,
+       0x3b4c5747,
+       0x6702ee7f,
+       0xd086629f,
+       0xf1f2043c,
+       0xc761a1ca,
+       0xa8964e9a,
+       0x90cab2ce,
+       0xc6e3583d,
+       0x3344e0be,
+       0x7d53914b,
+       0x3d953297,
+       0xfcf2eda0,
+       0x42f878a5,
+       0x2,
+       0x4,
+       0x10,
+       0x100,
+       0x10000,
+       0x1edc6f41,
+       0x3aab4576,
+       0x18571d18,
+       0x59a3508a,
+       0xaa97d41d,
+       0xe78dbf1d,
+       0x4ef6a711,
+       0x2506c32e,
+       0x68d4e827,
+       0x546ea6b0,
+       0x465cebac,
+       0x26a86214,
+       0x964aa2fd,
+       0x3b4c5747,
+       0x6702ee7f,
+       0xd086629f,
+       0xf1f2043c,
+       0xc761a1ca,
+       0xa8964e9a,
+       0x90cab2ce,
+       0xc6e3583d,
+       0x3344e0be,
+       0x7d53914b,
+       0x3d953297,
+       0xfcf2eda0,
+       0x42f878a5,
+       0x2,
+       0x4,
+       0x10,
+       0x100,
+       0x10000
+};
+#endif
+
  #else
  #define MAX_SIZE       32768
  .constants:
diff --git a/src/common/crc32c_ppc_fast_zero_asm.S b/src/common/crc32c_ppc_fast_zero_asm.S

new file mode 100644 (file)

index 0000000..a53df1d
--- /dev/null
+++ b/src/common/crc32c_ppc_fast_zero_asm.S
@@ -0,0 +1,77 @@
+/*
+ * Use the fixed point version of Barrett reduction to compute a mod n
+ * over GF(2) for given n using POWER8 instructions. We use k = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ *  a) the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option)
+ *     any later version, or
+ *  b) the Apache License, Version 2.0
+ */
+#include <ppc-asm.h>
+#include "common/ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+       .section        .data
+.balign 16
+
+.barrett_fz_constants:
+       /* Barrett constant m = (4^32)/n */
+       .octa 0x0000000000000000000000011f91caf6        /* x^64 div p(x) */
+       /* Barrett constant n */
+       .octa 0x0000000000000000000000011edc6f41
+
+.text
+/* unsigned int barrett_reduction(unsigned long val) */
+FUNC_START(barrett_reduction)
+       addis   r4,r2,.barrett_fz_constants@toc@ha
+       addi    r4,r4,.barrett_fz_constants@toc@l
+
+       li      r5,16
+       vxor    v1,v1,v1        /* zero v1 */
+
+       /* Get a into v0 */
+       MTVRD(v0, r3)
+       vsldoi  v0,v1,v0,8      /* shift into bottom 64 bits, this is a */
+
+       /* Load constants */
+       lvx     v2,0,r4         /* m */
+       lvx     v3,r5,r4        /* n */
+
+       /*
+        * Now for the actual algorithm. The idea is to calculate q,
+        * the multiple of our polynomial that we need to subtract. By
+        * doing the computation 2x bits higher (ie 64 bits) and shifting the
+        * result back down 2x bits, we round down to the nearest multiple.
+        */
+       VPMSUMD(v4,v0,v2)       /* ma */
+       vsldoi  v4,v1,v4,8      /* q = floor(ma/(2^64)) */
+       VPMSUMD(v4,v4,v3)       /* qn */
+       vxor    v0,v0,v4        /* a - qn, subtraction is xor in GF(2) */
+
+       /*
+        * Get the result into r3. We need to shift it left 8 bytes:
+        * V0 [ 0 1 2 X ]
+        * V0 [ 0 X 2 3 ]
+        */
+       vsldoi  v0,v0,v1,8      /* shift result into top 64 bits of v0 */
+       MFVRD(r3, v0)
+
+       blr
+FUNC_END(barrett_reduction)
+       
diff --git a/src/common/hobject.h b/src/common/hobject.h

index 258d6a3b42bedcbfcbe06c3b865c075974a64c9d..9b3f38f4494697b85e9f2e01d0c6f30a021166de 100644 (file)
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -21,6 +21,8 @@
  #include "json_spirit/json_spirit_value.h"
  #include "include/assert.h"   // spirit clobbers it!
  
+#include "reverse.h"
+
  namespace ceph {
    class Formatter;
  }
@@ -197,27 +199,10 @@ public:
    }
  
    static uint32_t _reverse_bits(uint32_t v) {
-    if (v == 0)
-      return v;
-    // reverse bits
-    // swap odd and even bits
-    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-    // swap consecutive pairs
-    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-    // swap nibbles ...
-    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-    // swap bytes
-    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-    // swap 2-byte long pairs
-    v = ( v >> 16             ) | ( v               << 16);
-    return v;
+    return reverse_bits(v);
    }
    static uint32_t _reverse_nibbles(uint32_t retval) {
-    // reverse nibbles
-    retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4);
-    retval = ((retval & 0x00ff00ff) << 8) | ((retval & 0xff00ff00) >> 8);
-    retval = ((retval & 0x0000ffff) << 16) | ((retval & 0xffff0000) >> 16);
-    return retval;
+    return reverse_nibbles(retval);
    }
  
    /**
diff --git a/src/common/reverse.c b/src/common/reverse.c

new file mode 100644 (file)

index 0000000..f65540d
--- /dev/null
+++ b/src/common/reverse.c
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "reverse.h"
+
+uint32_t reverse_bits(uint32_t v) {
+  if (v == 0)
+    return v;
+
+  /* Reverse the bit order of v in five swap passes.
+   * First, swap odd and even bits.
+   */
+  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+  /* swap consecutive pairs */
+  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+  /* swap nibbles ... */
+  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+  /* swap bytes */
+  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+  /* swap 2-byte long pairs */
+  v = ( v >> 16             ) | ( v               << 16);                                
+  return v;
+}
+
+uint32_t reverse_nibbles(uint32_t retval) {
+  /* reverse nibbles */
+  retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4);
+  retval = ((retval & 0x00ff00ff) << 8) | ((retval & 0xff00ff00) >> 8);
+  retval = ((retval & 0x0000ffff) << 16) | ((retval & 0xffff0000) >> 16);
+  return retval;
+}
diff --git a/src/common/reverse.h b/src/common/reverse.h

new file mode 100644 (file)

index 0000000..9a199a8
--- /dev/null
+++ b/src/common/reverse.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CEPH_OS_REVERSE_H
+#define __CEPH_OS_REVERSE_H
+
+#include "include/int_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t reverse_bits(uint32_t v);
+extern uint32_t reverse_nibbles(uint32_t retval);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif    
diff --git a/src/include/crc32c.h b/src/include/crc32c.h

index 86d9c8d229caaeac57808b567b4bbae2d94bf394..dd4ede666ec1dca7cb346615aaa955a510fb1a98 100644 (file)
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -20,7 +20,10 @@ extern ceph_crc32c_func_t ceph_choose_crc32(void);
  /**
   * calculate crc32c for data that is entirely 0 (ZERO)
   *
- * Note: works the same as \ref ceph_crc32c for data == nullptr, but faster
+ * Note: works the same as ceph_crc32c_func for data == nullptr,
+ * but faster than the optimized assembly on certain architectures.
+ * This is faster than the Intel optimized assembly, but not as fast as
+ * the ppc64le optimized assembly.
   *
   * @param crc initial value
   * @param length length of buffer
@@ -39,9 +42,12 @@ uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
   */
  static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
  {
+#ifndef HAVE_POWER8
    if (!data && length > 16)
      return ceph_crc32c_zeros(crc, length);
-       return ceph_crc32c_func(crc, data, length);
+#endif /* HAVE_POWER8 */
+
+  return ceph_crc32c_func(crc, data, length);
  }
  
  #ifdef __cplusplus
diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc

index c51006732e85c3e355d0ac8ba7243bae6a53d25f..7071728bb468d6ac3c468802cceb898f162390e6 100644 (file)
--- a/src/test/common/test_crc32c.cc
+++ b/src/test/common/test_crc32c.cc
@@ -319,12 +319,23 @@ TEST(Crc32c, zeros_performance_compare) {
  
      pre_start = ceph_clock_now();
      start = ceph_clock_now();
+#ifdef HAVE_POWER8
+    uint32_t crc_b = ceph_crc32c_zeros(111, size);
+#else
      uint32_t crc_b = ceph_crc32c_func(111, nullptr, size);
+#endif
      end = ceph_clock_now();
      time_adjusted = (end - start) - (start - pre_start);
+#ifdef HAVE_POWER8
+    std::cout << "ceph_crc32c_zeros method. size=" << size << " time=" 
+        << (double)(end-start) << " at " << (double)size/(1024*1024)/(time_adjusted) 
+        << " MB/sec" << " error=" << resolution / time_adjusted * 100 << "%" 
+        << std::endl;
+#else
      std::cout << "fallback method. size=" << size << " time=" << (double)(end-start)
          << " at " << (double)size/(1024*1024)/(time_adjusted) << " MB/sec"
          << " error=" << resolution / time_adjusted * 100 << "%" << std::endl;
+#endif
      EXPECT_EQ(crc_a, crc_b);
    }
  }
@@ -336,10 +347,12 @@ TEST(Crc32c, zeros_performance) {
  
    start = ceph_clock_now();
    for (size_t i=0; i<ITER; i++)
-  for (size_t scale=1; scale < 31; scale++)
    {
-    size_t size = (1<<scale) + rand() % (1<<scale);
-    ceph_crc32c(rand(), nullptr, size);
+    for (size_t scale=1; scale < 31; scale++)
+    {
+      size_t size = (1<<scale) + rand() % (1<<scale);
+      ceph_crc32c(rand(), nullptr, size);
+    }
    }
    end = ceph_clock_now();
    std::cout << "iterations="<< ITER*31 << " time=" << (double)(end-start) << std::endl;
author	Andrew Solomon <asolomon@us.ibm.com>
	Sun, 14 May 2017 04:52:11 +0000 (04:52 +0000)
committer	Andrew Solomon <asolomon@us.ibm.com>
	Wed, 31 May 2017 16:52:47 +0000 (16:52 +0000)
src/CMakeLists.txt		patch \| blob \| history
src/common/crc32c_ppc.c		patch \| blob \| history
src/common/crc32c_ppc_constants.h		patch \| blob \| history
src/common/crc32c_ppc_fast_zero_asm.S	[new file with mode: 0644]	patch \| blob
src/common/hobject.h		patch \| blob \| history
src/common/reverse.c	[new file with mode: 0644]	patch \| blob
src/common/reverse.h	[new file with mode: 0644]	patch \| blob
src/include/crc32c.h		patch \| blob \| history
src/test/common/test_crc32c.cc		patch \| blob \| history