From: Tyler Stachecki Date: Sun, 5 May 2024 14:37:34 +0000 (-0400) Subject: common: Leverage a better CRC32C implementation X-Git-Tag: testing/wip-rishabh-testing-20240628.135345-debug~10^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=948392a41511f5a04b13a8bad43ddb6d2731a197;p=ceph-ci.git common: Leverage a better CRC32C implementation ISA-L provides a few different CRC32C implementations, of which Ceph has only ever linked against one (crc32_iscsi_00). The second implementation of CRC32C provided by ISA-L (crc32_iscsi_01) improves upon the first as it is used by Ceph in a couple of ways: 1) crc32_iscsi_01 explicitly handles and checks for < 8 byte buffers and computes the CRC32C value using the hardware-accelerated CRC32 instruction. In comparison, crc32_iscsi_00 prefetches too far in cases of small buffers, requiring the Ceph code to explicitly check and handle this case differently in software. This software-fallback implementation of CRC32 also comes with a different set of LUTs (look up tables) and is less efficient as it does not make use of the CRC32 instruction. 2) crc32_iscsi_00 makes use of large LUTs (look up tables) to effectively perform the modular reduction required to produce the CRC32C value. In contrast, crc32_iscsi_01 uses the PCLMUL instruction set to perform reductions 128 bits at a time with smaller LUTs, resulting in greater throughput and less data cache pollution. 
Fixes: https://tracker.ceph.com/issues/65791 Signed-off-by: Tyler Stachecki --- diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 3deecc3f3d6..3da83cd9ed1 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -215,6 +215,7 @@ if(HAVE_INTEL) set(CMAKE_ASM_FLAGS "-i ${PROJECT_SOURCE_DIR}/src/isa-l/include/ ${CMAKE_ASM_FLAGS}") list(APPEND crc32_srcs ${PROJECT_SOURCE_DIR}/src/isa-l/crc/crc32_iscsi_00.asm + ${PROJECT_SOURCE_DIR}/src/isa-l/crc/crc32_iscsi_01.asm crc32c_intel_fast_zero_asm.s) endif(HAVE_NASM_X64) elseif(HAVE_POWER8) diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc index e4a77ae99ae..2793432fdd4 100644 --- a/src/common/crc32c.cc +++ b/src/common/crc32c.cc @@ -24,6 +24,9 @@ ceph_crc32c_func_t ceph_choose_crc32(void) // use that. #if defined(__i386__) || defined(__x86_64__) if (ceph_arch_intel_sse42 && ceph_crc32c_intel_fast_exists()) { + if (ceph_arch_intel_pclmul) { + return ceph_crc32c_intel_fast_pclmul; + } return ceph_crc32c_intel_fast; } #elif defined(__arm__) || defined(__aarch64__) diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c index 28bd9341651..3fbb63e2812 100644 --- a/src/common/crc32c_intel_fast.c +++ b/src/common/crc32c_intel_fast.c @@ -2,10 +2,25 @@ #include "common/crc32c_intel_baseline.h" extern unsigned int crc32_iscsi_00(unsigned char const *buffer, uint64_t len, uint64_t crc) asm("crc32_iscsi_00"); +extern unsigned int crc32_iscsi_01(unsigned char const *buffer, uint64_t len, uint64_t crc) asm("crc32_iscsi_01"); extern unsigned int crc32_iscsi_zero_00(unsigned char const *buffer, uint64_t len, uint64_t crc) asm("crc32_iscsi_zero_00"); #ifdef HAVE_NASM_X64 +uint32_t ceph_crc32c_intel_fast_pclmul(uint32_t crc, unsigned char const *buffer, unsigned len) +{ + if (!buffer) + { + return crc32_iscsi_zero_00(buffer, len, crc); + } + + /* Unlike crc32_iscsi_00, crc32_iscsi_01 handles the case where the + * input buffer is less than 8 bytes in its prelude, and does not 
+ * prefetch beyond said buffer. + */ + return crc32_iscsi_01(buffer, len, crc); +} + uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len) { uint32_t v; @@ -43,6 +58,11 @@ int ceph_crc32c_intel_fast_exists(void) return 0; } +uint32_t ceph_crc32c_intel_fast_pclmul(uint32_t crc, unsigned char const *buffer, unsigned len) +{ + return 0; +} + uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len) { return 0; diff --git a/src/common/crc32c_intel_fast.h b/src/common/crc32c_intel_fast.h index 26a444f6061..81c6e494f0c 100644 --- a/src/common/crc32c_intel_fast.h +++ b/src/common/crc32c_intel_fast.h @@ -10,10 +10,16 @@ extern int ceph_crc32c_intel_fast_exists(void); #ifdef __x86_64__ +extern uint32_t ceph_crc32c_intel_fast_pclmul(uint32_t crc, unsigned char const *buffer, unsigned len); extern uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len); #else +static inline uint32_t ceph_crc32c_intel_fast_pclmul(uint32_t crc, unsigned char const *buffer, unsigned len) +{ + return 0; +} + static inline uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len) { return 0;