--- /dev/null
+/* Copyright (C) 2026 ZTE Corporation
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define SEED a0
+#define BUF a1
+#define LEN a2
+#define POLY a3
+#define MU a4
+#define K1 t5
+#define K2 t6
+#define K3 t5
+#define K4 t6
+#define K5 t5
+#define K6 t6
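+/* K1/K3/K5 and K2/K4/K6 deliberately alias t5/t6: the fold-by-four, */
+/* fold-by-one and reduction constants are never live at the same time */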
+
+#define X3HIGH t0
+#define HIGH t0
+#define X3LOW t1
+#define LOW t1
+
+#define X2HIGH t2
+#define X2LOW a5
+#define X1HIGH a6
+#define X1LOW a7
+#define X0HIGH t3
+#define X0LOW t4
+
+#define BUF3HIGH s4
+#define BUF3LOW s5
+#define BUF2HIGH s6
+#define BUF2LOW s7
+#define BUF1HIGH s8
+#define BUF1LOW s9
+#define BUF0HIGH s10
+#define BUF0LOW s11
+
+#define X3K1LOW ra
+#define X3K2HIGH gp
+#define X2K1LOW tp
+#define X2K2HIGH s0
+#define X1K1LOW s1
+#define X1K2HIGH a0
+#define X0K1LOW s2
+#define X0K2HIGH s3
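+
+/* the fold-by-four loop needs more scratch than the temporaries provide, */
+/* so ra/gp/tp and the s-registers are pressed into service (saved on the */
+/* stack around the loop); a0/SEED is already folded in by then and reused */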
+
+/* repeated fold-by-four followed by fold-by-one */
+/* expects SEED (a0), BUF (a1) and LEN (a2) to hold those values */
+/* expects BUF to be doubleword-aligned */
+/* returns 128-bit result in HIGH:LOW (t0:t1) */
+/* returns updated buffer ptr & length in BUF and LEN */
+/* trashes all caller-saved registers except POLY and MU (a3/a4) */
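+/* */
+/* folding relies on the identity that multiplying a 128-bit chunk X by */
+/* x^N (mod P) splits into two carryless multiplies: the high doubleword */
+/* by x^(N+64) mod P and the low doubleword by x^N mod P, with the two */
+/* 128-bit products xored into the data N bits ahead; .Lk1/.Lk2 */
+/* presumably hold (reflected variants of) those constants for N = 512 */
+/* and .Lk3/.Lk4 for N = 128 */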
+.macro crc_fold_loop
+
+	/* is there enough buffer for a fold-by-four? we need 64 bytes for */
+	/* the initial load plus at least one full 64-byte loop iteration */
+ li t0, 128
+ bltu LEN, t0, .Lfold_1
+
+	/* save what must survive the loop: a3 (POLY), ra, the platform */
+	/* registers gp/tp and the s-registers used as fold temporaries */
+ addi sp, sp, -128
+ sd a3, 120(sp)
+ sd ra, 112(sp)
+ sd gp, 104(sp)
+ sd tp, 96(sp)
+ sd s0, 88(sp)
+ sd s1, 80(sp)
+ sd s2, 72(sp)
+ sd s3, 64(sp)
+ sd s4, 56(sp)
+ sd s5, 48(sp)
+ sd s6, 40(sp)
+ sd s7, 32(sp)
+ sd s8, 24(sp)
+ sd s9, 16(sp)
+ sd s10, 8(sp)
+ sd s11, 0(sp)
+
+ /* load initial 4 128-bit chunks */
+ ld X3HIGH, 0(BUF)
+ ld X3LOW, 8(BUF)
+ ld X2HIGH, 16(BUF)
+ ld X2LOW, 24(BUF)
+ ld X1HIGH, 32(BUF)
+ ld X1LOW, 40(BUF)
+ ld X0HIGH, 48(BUF)
+ ld X0LOW, 56(BUF)
+
+ addi BUF, BUF, 64
+ addi LEN, LEN, -64
+
+	/* xor the seed into the first doubleword; CRC is linear, so this */
+	/* seeds the entire message */
+ xor X3HIGH, X3HIGH, SEED
+
+ /* load constants */
+ ld K1, .Lk1
+ ld K2, .Lk2
+
+	/* compute the address we'll fold up to and leave the leftover */
+	/* byte count in LEN */
+	srli a3, LEN, 6
+	slli a3, a3, 6
+	add a3, BUF, a3
+	andi LEN, LEN, 0x3f
+
+.align 3
+.Lfold_4_loop:
+	/* carryless multiply each high doubleword by k1, get 128-bit results, */
+	/* interleaving loads of the next 4 128-bit chunks */
+ clmulh X3K1LOW, K1, X3HIGH
+ ld BUF3HIGH, 0(BUF)
+ clmulh X2K1LOW, K1, X2HIGH
+ ld BUF3LOW, 8(BUF)
+ clmulh X1K1LOW, K1, X1HIGH
+ ld BUF2HIGH, 16(BUF)
+ clmulh X0K1LOW, K1, X0HIGH
+ ld BUF2LOW, 24(BUF)
+ clmul X3HIGH, K1, X3HIGH
+ ld BUF1HIGH, 32(BUF)
+ clmul X2HIGH, K1, X2HIGH
+ ld BUF1LOW, 40(BUF)
+ clmul X1HIGH, K1, X1HIGH
+ ld BUF0HIGH, 48(BUF)
+ clmul X0HIGH, K1, X0HIGH
+ ld BUF0LOW, 56(BUF)
+
+ addi BUF, BUF, 64
+
+ /* carryless multiply each low doubleword by k2 */
+ clmul X3K2HIGH, K2, X3LOW
+ clmul X2K2HIGH, K2, X2LOW
+ clmul X1K2HIGH, K2, X1LOW
+ clmul X0K2HIGH, K2, X0LOW
+ clmulh X3LOW, K2, X3LOW
+ clmulh X2LOW, K2, X2LOW
+ clmulh X1LOW, K2, X1LOW
+ clmulh X0LOW, K2, X0LOW
+
+	/* xor results together: new X = data xor k1*high xor k2*low */
+	/* (carryless products, split across the HIGH/LOW halves) */
+ xor BUF3LOW, BUF3LOW, X3K1LOW
+ xor BUF2LOW, BUF2LOW, X2K1LOW
+ xor BUF1LOW, BUF1LOW, X1K1LOW
+ xor BUF0LOW, BUF0LOW, X0K1LOW
+ xor X3HIGH, BUF3HIGH, X3HIGH
+ xor X2HIGH, BUF2HIGH, X2HIGH
+ xor X1HIGH, BUF1HIGH, X1HIGH
+ xor X0HIGH, BUF0HIGH, X0HIGH
+ xor X3LOW, X3LOW, BUF3LOW
+ xor X2LOW, X2LOW, BUF2LOW
+ xor X1LOW, X1LOW, BUF1LOW
+ xor X0LOW, X0LOW, BUF0LOW
+ xor X3HIGH, X3K2HIGH, X3HIGH
+ xor X2HIGH, X2K2HIGH, X2HIGH
+ xor X1HIGH, X1K2HIGH, X1HIGH
+ xor X0HIGH, X0K2HIGH, X0HIGH
+
+ bne BUF, a3, .Lfold_4_loop
+
+	/* we've folded by four as far as we can; fold the four 128-bit */
+	/* accumulators still in registers down by one */
+	/* load fold-by-one constants */
+ ld K3, .Lk3
+ ld K4, .Lk4
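+	/* each step below folds the running HIGH:LOW into the next */
+	/* accumulator: X3 into X2, then the result into X1, then into X0 */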
+
+ clmul s0, K3, X3HIGH
+ clmulh s1, K3, X3HIGH
+ clmul s2, K4, X3LOW
+ clmulh s3, K4, X3LOW
+ xor HIGH, X2HIGH, s0
+ xor HIGH, HIGH, s2
+ xor LOW, X2LOW, s1
+ xor LOW, LOW, s3
+
+ clmul s0, K3, HIGH
+ clmulh s1, K3, HIGH
+ clmul s2, K4, LOW
+ clmulh s3, K4, LOW
+ xor HIGH, X1HIGH, s0
+ xor HIGH, HIGH, s2
+ xor LOW, X1LOW, s1
+ xor LOW, LOW, s3
+
+ clmul s0, K3, HIGH
+ clmulh s1, K3, HIGH
+ clmul s2, K4, LOW
+ clmulh s3, K4, LOW
+ xor HIGH, X0HIGH, s0
+ xor HIGH, HIGH, s2
+ xor LOW, X0LOW, s1
+ xor LOW, LOW, s3
+
+ /* pop register values saved on stack */
+ ld a3, 120(sp)
+ ld ra, 112(sp)
+ ld gp, 104(sp)
+ ld tp, 96(sp)
+ ld s0, 88(sp)
+ ld s1, 80(sp)
+ ld s2, 72(sp)
+ ld s3, 64(sp)
+ ld s4, 56(sp)
+ ld s5, 48(sp)
+ ld s6, 40(sp)
+ ld s7, 32(sp)
+ ld s8, 24(sp)
+ ld s9, 16(sp)
+ ld s10, 8(sp)
+ ld s11, 0(sp)
+ addi sp, sp, 128
+
+ /* load fold loop constant, check if any more 1-folding to do */
+ li t4, 16
+ bgeu LEN, t4, .Lfold_1_loop
+ /* else jump straight to end */
+ j .Lfold_1_cleanup
+
+.Lfold_1:
+ li t4, 16 /* kept throughout loop */
+ /* handle case where not enough buffer to do any fold */
+ /* .Lfold_1_done must be defined by the crc32/64 fold reduction macro */
+ bltu LEN, t4, .Lfold_1_done
+
+ /* load in initial values and xor with seed */
+ ld HIGH, 0(BUF)
+ xor HIGH, HIGH, SEED
+
+ ld LOW, 8(BUF)
+
+ addi LEN, LEN, -16
+ addi BUF, BUF, 16
+
+	bltu LEN, t4, .Lfold_1_cleanup
+
+ /* precomputed constants */
+ ld K3, .Lk3
+ ld K4, .Lk4
+.Lfold_1_loop:
+	/* multiply high and low by constants to get two 128-bit results */
+ clmul t2, K3, HIGH
+ clmulh t3, K3, HIGH
+ clmul a5, K4, LOW
+ clmulh a6, K4, LOW
+
+ /* load next 128-bits of buffer */
+ ld HIGH, 0(BUF)
+ ld LOW, 8(BUF)
+
+ addi LEN, LEN, -16
+ addi BUF, BUF, 16
+
+ /* fold in values with xor */
+ xor HIGH, HIGH, t2
+ xor HIGH, HIGH, a5
+ xor LOW, LOW, t3
+ xor LOW, LOW, a6
+
+ bgeu LEN, t4, .Lfold_1_loop
+
+.Lfold_1_cleanup:
+.endm
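+
+/* the macros in this file are presumably composed by entry points */
+/* defined elsewhere, in roughly this order (hypothetical sketch): */
+/* */
+/*	crc32_refl_align		align BUF, folding leading bytes into SEED */
+/*	crc_fold_loop			bulk fold, 128-bit result in HIGH:LOW */
+/*	crc32_refl_fold_reduction	reduce to the final CRC in SEED (a0) */
+/*	crc32_refl_excess		Barrett-reduce the trailing <16 bytes */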
+
+/* final reduction of the reflected fold */
+/* expects 128-bit value in HIGH:LOW (t0:t1), puts return value in SEED (a0) */
+/* trashes t0, t1, t3 and t5, t6 (K4/K5) */
+.macro crc32_refl_fold_reduction
+ /* load precalculated constants */
+ ld K4, .Lk4
+ ld K5, .Lk5
+
+ /* fold remaining 128 bits into 96 */
+ clmul t3, K4, t0
+ xor t1, t3, t1
+ clmulh t0, K4, t0
+
+ /* high = (low >> 32) | (high << 32) */
+ slli t0, t0, 32
+ srli t3, t1, 32
+ or t0, t0, t3
+
+ /* fold last 96 bits into 64 */
+ slli t1, t1, 32
+ srli t1, t1, 32
+ clmul t1, K5, t1
+ xor t1, t1, t0
+
+	/* Barrett-reduce the final 64 bits: estimate the quotient with mu, */
+	/* multiply it back by the polynomial and xor so only the 32-bit */
+	/* remainder survives in the upper half */
+ clmul t0, MU, t1
+ slli t0, t0, 32
+ srli t0, t0, 32
+ clmul t0, POLY, t0
+ xor t0, t1, t0
+ srli SEED, t0, 32
+
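+	/* crc_fold_loop branches here when there were fewer than 16 bytes */
+	/* to fold at all; SEED is untouched and crc32_refl_excess handles */
+	/* the entire remainder (the normal path falls through from above) */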
+.Lfold_1_done:
+.endm
+
+/* Barrett reduction of a \bits-bit value, returning the result in \seed */
+/* bits must be 64, 32, 16 or 8 */
+/* value and seed must be zero-extended; trashes t0 and t1 */
+.macro barrett_reduce seed:req, value:req, bits:req
+ /* combine value with seed */
+ xor t0, \seed, \value
+.if (\bits < 64)
+ slli t0, t0, (64 - \bits)
+.endif
+
+ /* multiply by mu, which is 2^96 divided by our polynomial */
+ clmul t0, t0, MU
+
+.if (\bits == 16) || (\bits == 8)
+ clmulh t0, t0, POLY
+	/* the seed bits above \bits pass through unchanged; shift them */
+	/* down and xor them with the reduced chunk */
+ srli t1, \seed, \bits
+ xor \seed, t0, t1
+.else
+ clmulh \seed, t0, POLY
+.endif
+
+.endm
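+
+/* for example, the byte-at-a-time update used by crc32_refl_align below, */
+/* */
+/*	lbu t1, (BUF) */
+/*	barrett_reduce SEED, t1, 8 */
+/* */
+/* is the clmul analogue of the table-driven step */
+/* crc = (crc >> 8) ^ table[(crc ^ byte) & 0xff] */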
+
+/* align buffer to a 64-bit boundary, updating seed */
+/* expects SEED (a0), BUF (a1), LEN (a2), POLY (a3), MU (a4) to hold values */
+/* expects crc32_refl_excess to be instantiated later: the early-out */
+/* branches below target its .Lexcess_* labels */
+/* trashes t0 and t1 */
+.macro crc32_refl_align
+	/* is the buffer already 64-bit aligned? */
+ andi t0, BUF, 0b111
+ beqz t0, .Lalign_done
+
+.Lalign_8:
+	/* is any buffer left? */
+	beqz LEN, .Lexcess_done
+
+ /* is buffer misaligned by one byte? */
+ andi t0, BUF, 0b001
+ beqz t0, .Lalign_16
+
+	/* Barrett-reduce one byte into the seed */
+ lbu t1, (BUF)
+ barrett_reduce SEED, t1, 8
+ addi LEN, LEN, -1
+ addi BUF, BUF, 1
+
+.Lalign_16:
+ li t0, 2
+ bltu LEN, t0, .Lexcess_8
+
+ andi t0, BUF, 0b010
+ beqz t0, .Lalign_32
+
+ lhu t1, (BUF)
+ barrett_reduce SEED, t1, 16
+ addi LEN, LEN, -2
+ addi BUF, BUF, 2
+
+.Lalign_32:
+ li t0, 4
+ bltu LEN, t0, .Lexcess_16
+
+ andi t0, BUF, 0b100
+ beqz t0, .Lalign_done
+
+ lwu t1, (BUF)
+ barrett_reduce SEED, t1, 32
+ addi LEN, LEN, -4
+ addi BUF, BUF, 4
+
+.Lalign_done:
+.endm
+
+/* Barrett-reduce excess buffer left after the fold */
+/* expects SEED (a0), BUF (a1), LEN (a2), POLY (a3), MU (a4) to hold values */
+/* expects fewer than 16 bytes to be left in the doubleword-aligned buffer */
+/* trashes t0 and t1 */
+.macro crc32_refl_excess
+ /* do we have any excess left? */
+ beqz LEN, .Lexcess_done
+
+	/* Barrett-reduce the remaining excess */
+	/* at most 15 bytes are left */
+.Lexcess_64:
+ andi t0, LEN, 0b1000
+ beqz t0, .Lexcess_32
+ ld t1, (BUF)
+ barrett_reduce SEED, t1, 64
+ addi BUF, BUF, 8
+
+.Lexcess_32:
+ andi t0, LEN, 0b0100
+ beqz t0, .Lexcess_16
+ lwu t1, (BUF)
+ barrett_reduce SEED, t1, 32
+ addi BUF, BUF, 4
+
+.Lexcess_16:
+ andi t0, LEN, 0b0010
+ beqz t0, .Lexcess_8
+ lhu t1, (BUF)
+ barrett_reduce SEED, t1, 16
+ addi BUF, BUF, 2
+
+.Lexcess_8:
+ andi t0, LEN, 0b0001
+ beqz t0, .Lexcess_done
+ lbu t1, (BUF)
+ barrett_reduce SEED, t1, 8
+
+.Lexcess_done:
+.endm