From 66b2e34d368d0bc9d03e6a5dc666159815d800bf Mon Sep 17 00:00:00 2001 From: chenxuqiang Date: Thu, 18 May 2023 14:34:19 +0800 Subject: [PATCH] erasure-code/isa/xor_op: add neon-based region_xor implementation The load instruction of NEON can load 128 bits. Generally, the CPU has two load channels. Therefore, the 32-byte Region_xor can be implemented. According to the test by ceph_erasure_code_benchmark, the performance is improved by more than 20% ~ 50% on average. loop = 10000 (k, m, size) | base(s) | neon(s) ------------------------------------------ (4, 1, 16384) | 0.018 | 0.015 ------------------------------------------ (4, 1, 65536) | 0.043 | 0.037 ------------------------------------------ (4, 1, 102400) | 0.058 | 0.049 ------------------------------------------ (8, 1, 32768) | 0.034 | 0.029 ------------------------------------------ (8, 1, 65536) | 0.052 | 0.045 ------------------------------------------ (8, 1, 102400) | 0.068 | 0.061 ------------------------------------------ (8, 1, 524288) | 0.631 | 0.420 ------------------------------------------ (8, 1, 1048576) | 1.561 | 0.931 ------------------------------------------ (8, 1, 8388608) | 16.70 | 8.244 ------------------------------------------ Signed-off-by: chenxuqiang --- src/erasure-code/isa/xor_op.cc | 54 ++++++++++++++++++++++++++++++++++ src/erasure-code/isa/xor_op.h | 11 ++++++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/src/erasure-code/isa/xor_op.cc b/src/erasure-code/isa/xor_op.cc index 2b56e977c7f..4f507c9b262 100644 --- a/src/erasure-code/isa/xor_op.cc +++ b/src/erasure-code/isa/xor_op.cc @@ -15,6 +15,11 @@ #include #include #include "arch/intel.h" +#include "arch/arm.h" + +#if defined(__aarch64__) && defined(__ARM_NEON) + #include +#endif #include "include/ceph_assert.h" @@ -101,6 +106,16 @@ region_xor(unsigned char** src, // 64-byte region xor region_sse2_xor((char**) src, (char*) parity, src_size, region_size); } else +#elif defined (__aarch64__) && defined(__ARM_NEON) + if (ceph_arch_neon) { + // ----------------------------- + // use NEON region xor function + // ----------------------------- + unsigned region_size = + (size / EC_ISA_VECTOR_NEON_WORDSIZE) * EC_ISA_VECTOR_NEON_WORDSIZE; + size_left -= region_size; + region_neon_xor((char**) src, (char *) parity, src_size, region_size); + } else #endif { // -------------------------------------------- @@ -181,3 +196,42 @@ region_sse2_xor(char** src, #endif // __x86_64__ return; } + +void +// ----------------------------------------------------------------------------- +region_neon_xor(char **src, + char *parity, + int src_size, + unsigned size) +// ----------------------------------------------------------------------------- +{ +#if defined(__aarch64__) && defined(__ARM_NEON) + ceph_assert(!(size % EC_ISA_VECTOR_NEON_WORDSIZE)); + unsigned char *p = (unsigned char *)parity; + unsigned char *vbuf[256] = { NULL }; + for (int v = 0; v < src_size; v++) { + vbuf[v] = (unsigned char *)src[v]; + } + + // ---------------------------------------------------------------------------------------- + // NEON load instructions can load 128bits of data each time, and there are 2 load channels + // ---------------------------------------------------------------------------------------- + for (unsigned i = 0; i < size; i += EC_ISA_VECTOR_NEON_WORDSIZE) { + uint64x2_t d0_1 = vld1q_u64((uint64_t *)(&(vbuf[0][i]))); + uint64x2_t d0_2 = vld1q_u64((uint64_t *)(&(vbuf[0][i + 16]))); + + for (int d = 1; d < src_size; d++) { + uint64x2_t di_1 = vld1q_u64((uint64_t *)(&(vbuf[d][i]))); + uint64x2_t di_2 = vld1q_u64((uint64_t *)(&(vbuf[d][i + 16]))); + + d0_1 = veorq_u64(d0_1, di_1); + d0_2 = veorq_u64(d0_2, di_2); + } + + vst1q_u64((uint64_t *)p, d0_1); + vst1q_u64((uint64_t *)(p + 16), d0_2); + p += EC_ISA_VECTOR_NEON_WORDSIZE; + } +#endif // __aarch64__ && __ARM_NEON + return; +} diff --git a/src/erasure-code/isa/xor_op.h b/src/erasure-code/isa/xor_op.h index 978b9a95358..46304eee1cc 100644 --- a/src/erasure-code/isa/xor_op.h +++ b/src/erasure-code/isa/xor_op.h @@ -27,7 +27,7 @@ #define EC_ISA_ADDRESS_ALIGNMENT 32u #define EC_ISA_VECTOR_SSE2_WORDSIZE 64u - +#define EC_ISA_VECTOR_NEON_WORDSIZE 32u #if __GNUC__ > 4 || \ ( (__GNUC__ == 4) && (__GNUC_MINOR__ >= 4) ) ||\ (__clang__ == 1 ) @@ -83,5 +83,14 @@ region_sse2_xor(char** src /* array of 64-byte aligned source pointer to xor */, int src_size /* size of the source pointer array */, unsigned size /* size of the region to xor */); +// ------------------------------------------------------------------------- +// compute region XOR like parity = src[0] ^ src[1] ... ^ src[src_size-1] +// using NEON 32-byte operations +// ------------------------------------------------------------------------- +void +region_neon_xor(char** src /* array of 64-byte aligned source pointer to xor */, + char* parity /* 32-byte aligned output pointer containing the parity */, + int src_size /* size of the source pointer array */, + unsigned size /* size of the region to xor */); #endif // EC_ISA_XOR_OP_H -- 2.39.5