]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crc32c: optimize aarch64 crc32c implementation 12977/head
authorwei xiao <wei.xiao@linaro.org>
Fri, 13 Jan 2017 04:36:39 +0000 (04:36 +0000)
committerwei xiao <wei.xiao@linaro.org>
Sat, 4 Feb 2017 06:20:50 +0000 (06:20 +0000)
ARMv8 defines PMULL crypto instruction.
This patch optimizes crc32c calculate with the instruction when
available rather than original linear crc instructions.

ceph crc32c performance unit test shows that the optimization get
~ x3.90 speedups on ThunderX ARM Core@2.0GHz (Cavium)
~ x1.45 speedups on ARM Cortex-A57@2.1GHz (Huaiwei)
~ x1.16 speedups on ARM Cortex-A57@2.0GHz (Softiron)

Jira: ENTLLT-358
Change-Id: I657422cd20c9ca78237cd060210a5383f4122575
Signed-off-by: wei xiao <wei.xiao@linaro.org>
cmake/modules/SIMDExt.cmake
src/CMakeLists.txt
src/common/crc32c_aarch64.c
src/include/config-h.in.cmake

index a984b51e689d46ddaff1e97c43481f1a69a0ca2d..a879ef3470fecc40402440f18d959be26cfbc57b 100644 (file)
@@ -27,22 +27,37 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
       return ret;
     }
     int main() { foo(0); }" HAVE_ARMV8_CRC)
+    check_cxx_source_compiles("
+    asm(\".arch_extension crypto\");
+    unsigned int foo(unsigned int ret) {
+      __asm__(\"pmull  v2.1q,          v2.1d,  v1.1d\");
+      return ret;
+    }
+    int main() { foo(0); }" HAVE_ARMV8_CRYPTO)
   set(CMAKE_REQUIRED_QUIET ${save_quiet})
   if(HAVE_ARMV8_CRC)
     message(STATUS " aarch64 crc extensions supported")
   endif()
+  if(HAVE_ARMV8_CRYPTO)
+    message(STATUS " aarch64 crypto extensions supported")
+  endif()
+  CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
+  if(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
+    message(STATUS " aarch64 crc+crypto intrinsics supported")
+    set(ARMV8_CRC_COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS} -march=armv8-a+crc+crypto")
+  endif()
   CHECK_C_COMPILER_FLAG(-march=armv8-a+simd HAVE_ARMV8_SIMD)
   if(HAVE_ARMV8_SIMD)
     set(SIMD_COMPILE_FLAGS "${SIMD_COMPILE_FLAGS} -march=armv8-a+simd")
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|ARM")
-  set(HAVE_ARM 1) 
+  set(HAVE_ARM 1)
   CHECK_C_COMPILER_FLAG(-mfpu=neon HAVE_ARM_NEON)
   if(HAVE_ARM_NEON)
     set(SIMD_COMPILE_FLAGS "${SIMD_COMPILE_FLAGS} -mfpu=neon")
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|amd64|x86_64|AMD64")
-  set(HAVE_INTEL 1) 
+  set(HAVE_INTEL 1)
   CHECK_C_COMPILER_FLAG(-msse HAVE_INTEL_SSE)
   if(HAVE_INTEL_SSE)
     set(SIMD_COMPILE_FLAGS "${SIMD_COMPILE_FLAGS} -msse")
index 5565268ba94c3562a1dc6e65cd828f04ebabeea9..b48718dfc7b1965bd36b3f8cb3ad27d0277b0a22 100644 (file)
@@ -578,6 +578,7 @@ set_source_files_properties(${CMAKE_SOURCE_DIR}/src/ceph_ver.c
 include(SIMDExt)
 if(HAVE_ARMV8_CRC)
   add_library(common_crc_aarch64 STATIC common/crc32c_aarch64.c)
+  set_target_properties(common_crc_aarch64 PROPERTIES COMPILE_FLAGS "${CMAKE_C_FLAGS} ${ARMV8_CRC_COMPILE_FLAGS}")
   list(APPEND ceph_common_deps common_crc_aarch64)
 endif(HAVE_ARMV8_CRC)
 
index d2be6ddd85a76b329e4d2ee6a5eb1ef7136ffb1b..e4fbf23a7602a73846a9dc13c8043a42e398bd98 100644 (file)
 #include "include/int_types.h"
 #include "common/crc32c_aarch64.h"
 
+#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
 /* Request crc extension capabilities from the assembler */
 asm(".arch_extension crc");
 
+#ifdef HAVE_ARMV8_CRYPTO
+/* Request crypto extension capabilities from the assembler */
+asm(".arch_extension crypto");
+#endif
+
 #define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
 #define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
 #define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
 #define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
 
+#define CRC32C3X8(ITR) \
+       __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
+       __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
+       __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
+
+#define CRC32C3X8_ZERO \
+       __asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
+
+#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
+#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
+#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
+#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
+
+#define CRC32C3X8(ITR) \
+       crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+       crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+       crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+
+#define CRC32C3X8_ZERO \
+       crc0 = __crc32cd(crc0, (const uint64_t)0);
+
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+#define CRC32C7X3X8(ITR) do {\
+       CRC32C3X8((ITR)*7+0) \
+       CRC32C3X8((ITR)*7+1) \
+       CRC32C3X8((ITR)*7+2) \
+       CRC32C3X8((ITR)*7+3) \
+       CRC32C3X8((ITR)*7+4) \
+       CRC32C3X8((ITR)*7+5) \
+       CRC32C3X8((ITR)*7+6) \
+       } while(0)
+
+#define CRC32C7X3X8_ZERO do {\
+       CRC32C3X8_ZERO \
+       CRC32C3X8_ZERO \
+       CRC32C3X8_ZERO \
+       CRC32C3X8_ZERO \
+       CRC32C3X8_ZERO \
+       CRC32C3X8_ZERO \
+       CRC32C3X8_ZERO \
+       } while(0)
+
+#define PREF4X64L1(PREF_OFFSET, ITR) \
+       __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+       __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+       __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+       __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL1(PREF_OFFSET) \
+       PREF4X64L1((PREF_OFFSET), 0) \
+       PREF4X64L1((PREF_OFFSET), 4) \
+       PREF4X64L1((PREF_OFFSET), 8) \
+       PREF4X64L1((PREF_OFFSET), 12)
+
+#define PREF4X64L2(PREF_OFFSET, ITR) \
+       __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+       __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+       __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+       __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL2(PREF_OFFSET) \
+       PREF4X64L2((PREF_OFFSET), 0) \
+       PREF4X64L2((PREF_OFFSET), 4) \
+       PREF4X64L2((PREF_OFFSET), 8) \
+       PREF4X64L2((PREF_OFFSET), 12)
+
+
 uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
 {
        int64_t length = len;
+       uint32_t crc0, crc1, crc2;
 
-       if (!buffer) {
+       if (buffer) {
+#ifdef HAVE_ARMV8_CRYPTO
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+               /* Calculate reflected crc with PMULL Instruction */
+               const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+               uint64_t t0, t1;
 
-               while ((length -= sizeof(uint64_t)) >= 0)
-                       CRC32CX(crc, 0);
+               /* crc done "by 3" for fixed input block size of 1024 bytes */
+               while ((length -= 1024) >= 0) {
+                       /* Prefetch data for following block to avoid cache miss */
+                       PREF1KL2(1024*3);
+                       /* Do first 8 bytes here for better pipelining */
+                       crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
+                       crc1 = 0;
+                       crc2 = 0;
+                       buffer += sizeof(uint64_t);
 
-               /* The following is more efficient than the straight loop */
-               if (length & sizeof(uint32_t))
-                       CRC32CW(crc, 0);
+                       /* Process block inline
+                       Process crc0 last to avoid dependency with above */
+                       CRC32C7X3X8(0);
+                       CRC32C7X3X8(1);
+                       CRC32C7X3X8(2);
+                       CRC32C7X3X8(3);
+                       CRC32C7X3X8(4);
+                       CRC32C7X3X8(5);
 
-               if (length & sizeof(uint16_t))
-                       CRC32CH(crc, 0);
+                       buffer += 42*3*sizeof(uint64_t);
+                       /* Prefetch data for following block to avoid cache miss */
+                       PREF1KL1(1024);
 
-               if (length & sizeof(uint8_t))
-                       CRC32CB(crc, 0);
-       } else {
+                       /* Merge crc0 and crc1 into crc2
+                          crc1 multiply by K2
+                          crc0 multiply by K1 */
+
+                       t1 = (uint64_t)vmull_p64(crc1, k2);
+                       t0 = (uint64_t)vmull_p64(crc0, k1);
+                       crc = __crc32cd(crc2, *(const uint64_t *)buffer);
+                       crc1 = __crc32cd(0, t1);
+                       crc ^= crc1;
+                       crc0 = __crc32cd(0, t0);
+                       crc ^= crc0;
+
+                       buffer += sizeof(uint64_t);
+               }
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+               __asm__("mov    x16,            #0xf38a         \n\t"
+                       "movk   x16,            #0xe417, lsl 16 \n\t"
+                       "mov    v1.2d[0],       x16             \n\t"
+                       "mov    x16,            #0x8014         \n\t"
+                       "movk   x16,            #0x8f15, lsl 16 \n\t"
+                       "mov    v0.2d[0],       x16             \n\t"
+                       :::"x16");
+
+               while ((length -= 1024) >= 0) {
+                       PREF1KL2(1024*3);
+                       __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
+                               :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
+                       crc1 = 0;
+                       crc2 = 0;
+                       buffer += sizeof(uint64_t);
+
+                       CRC32C7X3X8(0);
+                       CRC32C7X3X8(1);
+                       CRC32C7X3X8(2);
+                       CRC32C7X3X8(3);
+                       CRC32C7X3X8(4);
+                       CRC32C7X3X8(5);
+
+                       buffer += 42*3*sizeof(uint64_t);
+                       PREF1KL1(1024);
+                       __asm__("mov            v2.2d[0],       %x[c1]          \n\t"
+                               "pmull          v2.1q,          v2.1d,  v0.1d   \n\t"
+                               "mov            v3.2d[0],       %x[c0]          \n\t"
+                               "pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
+                               "crc32cx        %w[c],          %w[c2], %x[v]   \n\t"
+                               "mov            %x[c1],         v2.2d[0]        \n\t"
+                               "crc32cx        %w[c1],         wzr,    %x[c1]  \n\t"
+                               "eor            %w[c],          %w[c],  %w[c1]  \n\t"
+                               "mov            %x[c0],         v3.2d[0]        \n\t"
+                               "crc32cx        %w[c0],         wzr,    %x[c0]  \n\t"
+                               "eor            %w[c],          %w[c],  %w[c0]  \n\t"
+                               :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
+                               :[v]"r"(*((const uint64_t *)buffer)));
+                       buffer += sizeof(uint64_t);
+               }
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+               if(!(length += 1024))
+                       return crc;
+
+#endif /* HAVE_ARMV8_CRYPTO */
                while ((length -= sizeof(uint64_t)) >= 0) {
                        CRC32CX(crc, *(uint64_t *)buffer);
                        buffer += sizeof(uint64_t);
@@ -45,6 +201,69 @@ uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned
                }
                if (length & sizeof(uint8_t))
                        CRC32CB(crc, *buffer);
+       } else {
+#ifdef HAVE_ARMV8_CRYPTO
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+               const poly64_t k1 = 0xe417f38a;
+               uint64_t t0;
+
+               while ((length -= 1024) >= 0) {
+                       crc0 = __crc32cd(crc, 0);
+
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+
+                       /* Merge crc0 into crc: crc0 multiply by K1 */
+
+                       t0 = (uint64_t)vmull_p64(crc0, k1);
+                       crc = __crc32cd(0, t0);
+               }
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+               __asm__("mov    x16,            #0xf38a         \n\t"
+                       "movk   x16,            #0xe417, lsl 16 \n\t"
+                       "mov    v1.2d[0],       x16             \n\t"
+                       :::"x16");
+
+               while ((length -= 1024) >= 0) {
+                       __asm__("crc32cx %w[c0], %w[c], xzr\n\t"
+                               :[c0]"=r"(crc0):[c]"r"(crc));
+
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+                       CRC32C7X3X8_ZERO;
+
+                       __asm__("mov            v3.2d[0],       %x[c0]          \n\t"
+                               "pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
+                               "mov            %x[c0],         v3.2d[0]        \n\t"
+                               "crc32cx        %w[c],          wzr,    %x[c0]  \n\t"
+                               :[c]"=r"(crc)
+                               :[c0]"r"(crc0));
+               }
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+               if(!(length += 1024))
+                       return crc;
+
+#endif /* HAVE_ARMV8_CRYPTO */
+               while ((length -= sizeof(uint64_t)) >= 0)
+                       CRC32CX(crc, 0);
+
+               /* The following is more efficient than the straight loop */
+               if (length & sizeof(uint32_t))
+                       CRC32CW(crc, 0);
+
+               if (length & sizeof(uint16_t))
+                       CRC32CH(crc, 0);
+
+               if (length & sizeof(uint8_t))
+                       CRC32CB(crc, 0);
        }
        return crc;
 }
index d731a9b6c1f0e4c9caf42db216b7ff97572d8cdc..77065c9dae1f17c1c8c4795d4878966feb505805 100644 (file)
 /* Support ARMv8 CRC instructions */
 #cmakedefine HAVE_ARMV8_CRC
 
+/* Support ARMv8 CRYPTO instructions */
+#cmakedefine HAVE_ARMV8_CRYPTO
+
+/* Support ARMv8 CRC and CRYPTO intrinsics */
+#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
 /* Define if you have struct stat.st_mtimespec.tv_nsec */
 #cmakedefine HAVE_STAT_ST_MTIMESPEC_TV_NSEC