lib/crypto: x86/polyval: Migrate optimized code into library

author Eric Biggers <ebiggers@kernel.org>

Sun, 9 Nov 2025 23:47:20 +0000 (15:47 -0800)

committer Eric Biggers <ebiggers@kernel.org>

Tue, 11 Nov 2025 19:03:38 +0000 (11:03 -0800)
author Eric Biggers <ebiggers@kernel.org>
Sun, 9 Nov 2025 23:47:20 +0000 (15:47 -0800)
committer Eric Biggers <ebiggers@kernel.org>
Tue, 11 Nov 2025 19:03:38 +0000 (11:03 -0800)
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig

index 48d3076b60538f31858c1463ef7011853b413447..3fd2423d3cf8fef62e8a93c078abc349382ab089 100644 (file)
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2
           Architecture: x86_64 using:
           - AVX2 (Advanced Vector Extensions 2)
  
-config CRYPTO_POLYVAL_CLMUL_NI
-       tristate "Hash functions: POLYVAL (CLMUL-NI)"
-       depends on 64BIT
-       select CRYPTO_POLYVAL
-       help
-         POLYVAL hash function for HCTR2
-
-         Architecture: x86_64 using:
-         - CLMUL-NI (carry-less multiplication new instructions)
-
  config CRYPTO_SM3_AVX_X86_64
         tristate "Hash functions: SM3 (AVX)"
         depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile

index 2d30d5d361458f782c3b789b5297ae52a3581b09..4a24dd38da50aade72025c3009745033ac4bacf3 100644 (file)
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -52,9 +52,6 @@ aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
  obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
  ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
  
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
  obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
  nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
  obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S

deleted file mode 100644 (file)

index a6ebe4e..0000000
--- a/arch/x86/crypto/polyval-clmulni_asm.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
- * instructions. It works on 8 blocks at a time, by precomputing the first 8
- * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
- * allows us to split finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process  only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STRIDE_BLOCKS 8
-
-#define GSTAR %xmm7
-#define PL %xmm8
-#define PH %xmm9
-#define TMP_XMM %xmm11
-#define LO %xmm12
-#define HI %xmm13
-#define MI %xmm14
-#define SUM %xmm15
-
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
-#define TMP %rax
-
-.section    .rodata.cst16.gstar, "aM", @progbits, 16
-.align 16
-
-.Lgstar:
-       .quad 0xc200000000000000, 0xc200000000000000
-
-.text
-
-/*
- * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
- * count pointed to by MSG and KEY_POWERS.
- */
-.macro schoolbook1 count
-       .set i, 0
-       .rept (\count)
-               schoolbook1_iteration i 0
-               .set i, (i +1)
-       .endr
-.endm
-
-/*
- * Computes the product of two 128-bit polynomials at the memory locations
- * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
- * the 256-bit product into LO, MI, HI.
- *
- * Given:
- *   X = [X_1 : X_0]
- *   Y = [Y_1 : Y_0]
- *
- * We compute:
- *   LO += X_0 * Y_0
- *   MI += X_0 * Y_1 + X_1 * Y_0
- *   HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- *   [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * If xor_sum == 1, then also XOR the value of SUM into m_0.  This avoids an
- * extra multiplication of SUM and h^8.
- */
-.macro schoolbook1_iteration i xor_sum
-       movups (16*\i)(MSG), %xmm0
-       .if (\i == 0 && \xor_sum == 1)
-               pxor SUM, %xmm0
-       .endif
-       vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
-       vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
-       vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
-       vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
-       vpxor %xmm2, MI, MI
-       vpxor %xmm1, LO, LO
-       vpxor %xmm4, HI, HI
-       vpxor %xmm3, MI, MI
-.endm
-
-/*
- * Performs the same computation as schoolbook1_iteration, except we expect the
- * arguments to already be loaded into xmm0 and xmm1 and we set the result
- * registers LO, MI, and HI directly rather than XOR'ing into them.
- */
-.macro schoolbook1_noload
-       vpclmulqdq $0x01, %xmm0, %xmm1, MI
-       vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
-       vpclmulqdq $0x00, %xmm0, %xmm1, LO
-       vpclmulqdq $0x11, %xmm0, %xmm1, HI
-       vpxor %xmm2, MI, MI
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- *   [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- */
-.macro schoolbook2
-       vpslldq $8, MI, PL
-       vpsrldq $8, MI, PH
-       pxor LO, PL
-       pxor HI, PH
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
- * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128.  To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- *   T = T_1 : T_0 = g*(x) * P_0
- *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
-       vpclmulqdq $0x00, PL, GSTAR, TMP_XMM    # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
-       pshufd $0b01001110, TMP_XMM, TMP_XMM    # TMP_XMM = T_0 : T_1
-       pxor PL, TMP_XMM                        # TMP_XMM = P_1 + T_0 : P_0 + T_1
-       pxor TMP_XMM, PH                        # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
-       pclmulqdq $0x11, GSTAR, TMP_XMM         # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
-       vpxor TMP_XMM, PH, \dest
-.endm
-
-/*
- * Compute schoolbook multiplication for 8 blocks
- * m_0h^8 + ... + m_7h^1
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- */
-.macro full_stride reduce
-       pxor LO, LO
-       pxor HI, HI
-       pxor MI, MI
-
-       schoolbook1_iteration 7 0
-       .if \reduce
-               vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
-       .endif
-
-       schoolbook1_iteration 6 0
-       .if \reduce
-               pshufd $0b01001110, TMP_XMM, TMP_XMM
-       .endif
-
-       schoolbook1_iteration 5 0
-       .if \reduce
-               pxor PL, TMP_XMM
-       .endif
-
-       schoolbook1_iteration 4 0
-       .if \reduce
-               pxor TMP_XMM, PH
-       .endif
-
-       schoolbook1_iteration 3 0
-       .if \reduce
-               pclmulqdq $0x11, GSTAR, TMP_XMM
-       .endif
-
-       schoolbook1_iteration 2 0
-       .if \reduce
-               vpxor TMP_XMM, PH, SUM
-       .endif
-
-       schoolbook1_iteration 1 0
-
-       schoolbook1_iteration 0 1
-
-       addq $(8*16), MSG
-       schoolbook2
-.endm
-
-/*
- * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
- */
-.macro partial_stride
-       mov BLOCKS_LEFT, TMP
-       shlq $4, TMP
-       addq $(16*STRIDE_BLOCKS), KEY_POWERS
-       subq TMP, KEY_POWERS
-
-       movups (MSG), %xmm0
-       pxor SUM, %xmm0
-       movaps (KEY_POWERS), %xmm1
-       schoolbook1_noload
-       dec BLOCKS_LEFT
-       addq $16, MSG
-       addq $16, KEY_POWERS
-
-       test $4, BLOCKS_LEFT
-       jz .Lpartial4BlocksDone
-       schoolbook1 4
-       addq $(4*16), MSG
-       addq $(4*16), KEY_POWERS
-.Lpartial4BlocksDone:
-       test $2, BLOCKS_LEFT
-       jz .Lpartial2BlocksDone
-       schoolbook1 2
-       addq $(2*16), MSG
-       addq $(2*16), KEY_POWERS
-.Lpartial2BlocksDone:
-       test $1, BLOCKS_LEFT
-       jz .LpartialDone
-       schoolbook1 1
-.LpartialDone:
-       schoolbook2
-       montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(clmul_polyval_mul)
-       FRAME_BEGIN
-       vmovdqa .Lgstar(%rip), GSTAR
-       movups (%rdi), %xmm0
-       movups (%rsi), %xmm1
-       schoolbook1_noload
-       schoolbook2
-       montgomery_reduction SUM
-       movups SUM, (%rdi)
-       FRAME_END
-       RET
-SYM_FUNC_END(clmul_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL.  This computes:
- *     h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
- *
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- *     const u8 *in, size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(clmul_polyval_update)
-       FRAME_BEGIN
-       vmovdqa .Lgstar(%rip), GSTAR
-       movups (ACCUMULATOR), SUM
-       subq $STRIDE_BLOCKS, BLOCKS_LEFT
-       js .LstrideLoopExit
-       full_stride 0
-       subq $STRIDE_BLOCKS, BLOCKS_LEFT
-       js .LstrideLoopExitReduce
-.LstrideLoop:
-       full_stride 1
-       subq $STRIDE_BLOCKS, BLOCKS_LEFT
-       jns .LstrideLoop
-.LstrideLoopExitReduce:
-       montgomery_reduction SUM
-.LstrideLoopExit:
-       add $STRIDE_BLOCKS, BLOCKS_LEFT
-       jz .LskipPartial
-       partial_stride
-.LskipPartial:
-       movups SUM, (ACCUMULATOR)
-       FRAME_END
-       RET
-SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c

deleted file mode 100644 (file)

index 6b46686..0000000
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ /dev/null
@@ -1,180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN  16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS 8
-
-struct polyval_tfm_ctx {
-       /*
-        * These powers must be in the order h^8, ..., h^1.
-        */
-       u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
-       u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
-       const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
-       return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
-       const u8 *in, size_t nblocks, u8 *accumulator)
-{
-       kernel_fpu_begin();
-       clmul_polyval_update(keys, in, nblocks, accumulator);
-       kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
-       kernel_fpu_begin();
-       clmul_polyval_mul(op1, op2);
-       kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
-                       const u8 *key, unsigned int keylen)
-{
-       struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
-       int i;
-
-       if (keylen != POLYVAL_BLOCK_SIZE)
-               return -EINVAL;
-
-       memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
-       for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
-               memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
-               internal_polyval_mul(tctx->key_powers[i],
-                                    tctx->key_powers[i+1]);
-       }
-
-       return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
-       struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
-       memset(dctx, 0, sizeof(*dctx));
-
-       return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
-                        const u8 *src, unsigned int srclen)
-{
-       struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-       const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-       unsigned int nblocks;
-
-       do {
-               /* Allow rescheduling every 4K bytes. */
-               nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
-               internal_polyval_update(tctx, src, nblocks, dctx->buffer);
-               srclen -= nblocks * POLYVAL_BLOCK_SIZE;
-               src += nblocks * POLYVAL_BLOCK_SIZE;
-       } while (srclen >= POLYVAL_BLOCK_SIZE);
-
-       return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
-                            unsigned int len, u8 *dst)
-{
-       struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-       const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
-       if (len) {
-               crypto_xor(dctx->buffer, src, len);
-               internal_polyval_mul(dctx->buffer,
-                                    tctx->key_powers[NUM_KEY_POWERS-1]);
-       }
-
-       memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
-       return 0;
-}
-
-static struct shash_alg polyval_alg = {
-       .digestsize     = POLYVAL_DIGEST_SIZE,
-       .init           = polyval_x86_init,
-       .update         = polyval_x86_update,
-       .finup          = polyval_x86_finup,
-       .setkey         = polyval_x86_setkey,
-       .descsize       = sizeof(struct polyval_desc_ctx),
-       .base           = {
-               .cra_name               = "polyval",
-               .cra_driver_name        = "polyval-clmulni",
-               .cra_priority           = 200,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-               .cra_blocksize          = POLYVAL_BLOCK_SIZE,
-               .cra_ctxsize            = POLYVAL_CTX_SIZE,
-               .cra_module             = THIS_MODULE,
-       },
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
-       X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
-       {}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
-       if (!x86_match_cpu(pcmul_cpu_id))
-               return -ENODEV;
-
-       if (!boot_cpu_has(X86_FEATURE_AVX))
-               return -ENODEV;
-
-       return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
-       crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h

index f8aaf4275fbdab98cbb3332934bf3a3bf72b3a0d..b28b8ef113538c456f554e2308698d0c7500e69b 100644 (file)
--- a/include/crypto/polyval.h
+++ b/include/crypto/polyval.h
@@ -48,6 +48,9 @@ struct polyval_key {
  #ifdef CONFIG_ARM64
         /** @h_powers: Powers of the hash key H^8 through H^1 */
         struct polyval_elem h_powers[8];
+#elif defined(CONFIG_X86)
+       /** @h_powers: Powers of the hash key H^8 through H^1 */
+       struct polyval_elem h_powers[8];
  #else
  #error "Unhandled arch"
  #endif
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig

index 4307239941421dfdbae6e29a58342b1bf91787b8..9d04b3771ce21d8882ff5aef032665b0b3b197af 100644 (file)
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -145,6 +145,7 @@ config CRYPTO_LIB_POLYVAL_ARCH
         bool
         depends on CRYPTO_LIB_POLYVAL && !UML
         default y if ARM64 && KERNEL_MODE_NEON
+       default y if X86_64
  
  config CRYPTO_LIB_CHACHA20POLY1305
         tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile

index 2efa96afcb4b22063d1ae6ffb9797829326938ef..6580991f8e12dc1152d4c2ffaf6fc6d9c26da459 100644 (file)
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -203,6 +203,7 @@ libpolyval-y := polyval.o
  ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
  CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
  libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
  endif
  
  ################################################################################
diff --git a/lib/crypto/x86/polyval-pclmul-avx.S b/lib/crypto/x86/polyval-pclmul-avx.S

new file mode 100644 (file)

index 0000000..7f73946
--- /dev/null
+++ b/lib/crypto/x86/polyval-pclmul-avx.S
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * key powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two-step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
+#define TMP %rax
+
+.section    .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+       .quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+       .set i, 0
+       .rept (\count)
+               schoolbook1_iteration i 0
+               .set i, (i +1)
+       .endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ *   X = [X_1 : X_0]
+ *   Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ *   LO += X_0 * Y_0
+ *   MI += X_0 * Y_1 + X_1 * Y_0
+ *   HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ *   [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0.  This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+       movups (16*\i)(MSG), %xmm0
+       .if (\i == 0 && \xor_sum == 1)
+               pxor SUM, %xmm0
+       .endif
+       vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+       vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+       vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+       vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+       vpxor %xmm2, MI, MI
+       vpxor %xmm1, LO, LO
+       vpxor %xmm4, HI, HI
+       vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+       vpclmulqdq $0x01, %xmm0, %xmm1, MI
+       vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+       vpclmulqdq $0x00, %xmm0, %xmm1, LO
+       vpclmulqdq $0x11, %xmm0, %xmm1, HI
+       vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ *   [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+       vpslldq $8, MI, PL
+       vpsrldq $8, MI, PH
+       pxor LO, PL
+       pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
+ * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128.  To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ *   T = T_1 : T_0 = g*(x) * P_0
+ *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+       vpclmulqdq $0x00, PL, GSTAR, TMP_XMM    # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+       pshufd $0b01001110, TMP_XMM, TMP_XMM    # TMP_XMM = T_0 : T_1
+       pxor PL, TMP_XMM                        # TMP_XMM = P_1 + T_0 : P_0 + T_1
+       pxor TMP_XMM, PH                        # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+       pclmulqdq $0x11, GSTAR, TMP_XMM         # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+       vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the Montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+       pxor LO, LO
+       pxor HI, HI
+       pxor MI, MI
+
+       schoolbook1_iteration 7 0
+       .if \reduce
+               vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+       .endif
+
+       schoolbook1_iteration 6 0
+       .if \reduce
+               pshufd $0b01001110, TMP_XMM, TMP_XMM
+       .endif
+
+       schoolbook1_iteration 5 0
+       .if \reduce
+               pxor PL, TMP_XMM
+       .endif
+
+       schoolbook1_iteration 4 0
+       .if \reduce
+               pxor TMP_XMM, PH
+       .endif
+
+       schoolbook1_iteration 3 0
+       .if \reduce
+               pclmulqdq $0x11, GSTAR, TMP_XMM
+       .endif
+
+       schoolbook1_iteration 2 0
+       .if \reduce
+               vpxor TMP_XMM, PH, SUM
+       .endif
+
+       schoolbook1_iteration 1 0
+
+       schoolbook1_iteration 0 1
+
+       addq $(8*16), MSG
+       schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+       mov BLOCKS_LEFT, TMP
+       shlq $4, TMP
+       addq $(16*STRIDE_BLOCKS), KEY_POWERS
+       subq TMP, KEY_POWERS
+
+       movups (MSG), %xmm0
+       pxor SUM, %xmm0
+       movups (KEY_POWERS), %xmm1
+       schoolbook1_noload
+       dec BLOCKS_LEFT
+       addq $16, MSG
+       addq $16, KEY_POWERS
+
+       test $4, BLOCKS_LEFT
+       jz .Lpartial4BlocksDone
+       schoolbook1 4
+       addq $(4*16), MSG
+       addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+       test $2, BLOCKS_LEFT
+       jz .Lpartial2BlocksDone
+       schoolbook1 2
+       addq $(2*16), MSG
+       addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+       test $1, BLOCKS_LEFT
+       jz .LpartialDone
+       schoolbook1 1
+.LpartialDone:
+       schoolbook2
+       montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ *                            const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pclmul_avx)
+       FRAME_BEGIN
+       vmovdqa .Lgstar(%rip), GSTAR
+       movups (%rdi), %xmm0
+       movups (%rsi), %xmm1
+       schoolbook1_noload
+       schoolbook2
+       montgomery_reduction SUM
+       movups SUM, (%rdi)
+       FRAME_END
+       RET
+SYM_FUNC_END(polyval_mul_pclmul_avx)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL.  This computes:
+ *     h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
+ *
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ *                               const struct polyval_key *key,
+ *                               const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
+       FRAME_BEGIN
+       vmovdqa .Lgstar(%rip), GSTAR
+       movups (ACCUMULATOR), SUM
+       subq $STRIDE_BLOCKS, BLOCKS_LEFT
+       js .LstrideLoopExit
+       full_stride 0
+       subq $STRIDE_BLOCKS, BLOCKS_LEFT
+       js .LstrideLoopExitReduce
+.LstrideLoop:
+       full_stride 1
+       subq $STRIDE_BLOCKS, BLOCKS_LEFT
+       jns .LstrideLoop
+.LstrideLoopExitReduce:
+       montgomery_reduction SUM
+.LstrideLoopExit:
+       add $STRIDE_BLOCKS, BLOCKS_LEFT
+       jz .LskipPartial
+       partial_stride
+.LskipPartial:
+       movups SUM, (ACCUMULATOR)
+       FRAME_END
+       RET
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h

new file mode 100644 (file)

index 0000000..ef87975
--- /dev/null
+++ b/lib/crypto/x86/polyval.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+                                      const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+                                         const struct polyval_key *key,
+                                         const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+                                   const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+       static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+       memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+       if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+               kernel_fpu_begin();
+               for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+                       key->h_powers[i] = key->h_powers[i + 1];
+                       polyval_mul_pclmul_avx(
+                               &key->h_powers[i],
+                               &key->h_powers[NUM_H_POWERS - 1]);
+               }
+               kernel_fpu_end();
+       } else {
+               for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+                       key->h_powers[i] = key->h_powers[i + 1];
+                       polyval_mul_generic(&key->h_powers[i],
+                                           &key->h_powers[NUM_H_POWERS - 1]);
+               }
+       }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+                            const struct polyval_key *key)
+{
+       if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+               kernel_fpu_begin();
+               polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+               kernel_fpu_end();
+       } else {
+               polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+       }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+                               const struct polyval_key *key,
+                               const u8 *data, size_t nblocks)
+{
+       if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+               do {
+                       /* Allow rescheduling every 4 KiB. */
+                       size_t n = min_t(size_t, nblocks,
+                                        4096 / POLYVAL_BLOCK_SIZE);
+
+                       kernel_fpu_begin();
+                       polyval_blocks_pclmul_avx(acc, key, data, n);
+                       kernel_fpu_end();
+                       data += n * POLYVAL_BLOCK_SIZE;
+                       nblocks -= n;
+               } while (nblocks);
+       } else {
+               polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+                                      data, nblocks);
+       }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+       if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+           boot_cpu_has(X86_FEATURE_AVX))
+               static_branch_enable(&have_pclmul_avx);
+}
author	Eric Biggers <ebiggers@kernel.org>
	Sun, 9 Nov 2025 23:47:20 +0000 (15:47 -0800)
committer	Eric Biggers <ebiggers@kernel.org>
	Tue, 11 Nov 2025 19:03:38 +0000 (11:03 -0800)
arch/x86/crypto/Kconfig		patch \| blob \| history
arch/x86/crypto/Makefile		patch \| blob \| history
arch/x86/crypto/polyval-clmulni_asm.S	[deleted file]	patch \| blob \| history
arch/x86/crypto/polyval-clmulni_glue.c	[deleted file]	patch \| blob \| history
include/crypto/polyval.h		patch \| blob \| history
lib/crypto/Kconfig		patch \| blob \| history
lib/crypto/Makefile		patch \| blob \| history
lib/crypto/x86/polyval-pclmul-avx.S	[new file with mode: 0644]	patch \| blob
lib/crypto/x86/polyval.h	[new file with mode: 0644]	patch \| blob