Architecture: x86_64 using:
- AVX2 (Advanced Vector Extensions 2)
-config CRYPTO_POLYVAL_CLMUL_NI
- tristate "Hash functions: POLYVAL (CLMUL-NI)"
- depends on 64BIT
- select CRYPTO_POLYVAL
- help
- POLYVAL hash function for HCTR2
-
- Architecture: x86_64 using:
- - CLMUL-NI (carry-less multiplication new instructions)
-
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
depends on 64BIT
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
- * instructions. It works on 8 blocks at a time, by precomputing the first 8
- * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
- * allows us to split finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STRIDE_BLOCKS 8
-
-#define GSTAR %xmm7
-#define PL %xmm8
-#define PH %xmm9
-#define TMP_XMM %xmm11
-#define LO %xmm12
-#define HI %xmm13
-#define MI %xmm14
-#define SUM %xmm15
-
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
-#define TMP %rax
-
-.section .rodata.cst16.gstar, "aM", @progbits, 16
-.align 16
-
-.Lgstar:
- .quad 0xc200000000000000, 0xc200000000000000
-
-.text
-
-/*
- * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
- * count pointed to by MSG and KEY_POWERS.
- */
-.macro schoolbook1 count
- .set i, 0
- .rept (\count)
- schoolbook1_iteration i 0
- .set i, (i +1)
- .endr
-.endm
-
-/*
- * Computes the product of two 128-bit polynomials at the memory locations
- * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
- * the 256-bit product into LO, MI, HI.
- *
- * Given:
- * X = [X_1 : X_0]
- * Y = [Y_1 : Y_0]
- *
- * We compute:
- * LO += X_0 * Y_0
- * MI += X_0 * Y_1 + X_1 * Y_0
- * HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
- * extra multiplication of SUM and h^8.
- */
-.macro schoolbook1_iteration i xor_sum
- movups (16*\i)(MSG), %xmm0
- .if (\i == 0 && \xor_sum == 1)
- pxor SUM, %xmm0
- .endif
- vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
- vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
- vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
- vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
- vpxor %xmm2, MI, MI
- vpxor %xmm1, LO, LO
- vpxor %xmm4, HI, HI
- vpxor %xmm3, MI, MI
-.endm
-
-/*
- * Performs the same computation as schoolbook1_iteration, except we expect the
- * arguments to already be loaded into xmm0 and xmm1 and we set the result
- * registers LO, MI, and HI directly rather than XOR'ing into them.
- */
-.macro schoolbook1_noload
- vpclmulqdq $0x01, %xmm0, %xmm1, MI
- vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
- vpclmulqdq $0x00, %xmm0, %xmm1, LO
- vpclmulqdq $0x11, %xmm0, %xmm1, HI
- vpxor %xmm2, MI, MI
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- */
-.macro schoolbook2
- vpslldq $8, MI, PL
- vpsrldq $8, MI, PH
- pxor LO, PL
- pxor HI, PH
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form. We need to reduce it
- * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128. To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- * T = T_1 : T_0 = g*(x) * P_0
- * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
- vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
- pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
- pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
- pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
- pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
- vpxor TMP_XMM, PH, \dest
-.endm
-
-/*
- * Compute schoolbook multiplication for 8 blocks
- * m_0h^8 + ... + m_7h^1
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- */
-.macro full_stride reduce
- pxor LO, LO
- pxor HI, HI
- pxor MI, MI
-
- schoolbook1_iteration 7 0
- .if \reduce
- vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
- .endif
-
- schoolbook1_iteration 6 0
- .if \reduce
- pshufd $0b01001110, TMP_XMM, TMP_XMM
- .endif
-
- schoolbook1_iteration 5 0
- .if \reduce
- pxor PL, TMP_XMM
- .endif
-
- schoolbook1_iteration 4 0
- .if \reduce
- pxor TMP_XMM, PH
- .endif
-
- schoolbook1_iteration 3 0
- .if \reduce
- pclmulqdq $0x11, GSTAR, TMP_XMM
- .endif
-
- schoolbook1_iteration 2 0
- .if \reduce
- vpxor TMP_XMM, PH, SUM
- .endif
-
- schoolbook1_iteration 1 0
-
- schoolbook1_iteration 0 1
-
- addq $(8*16), MSG
- schoolbook2
-.endm
-
-/*
- * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
- */
-.macro partial_stride
- mov BLOCKS_LEFT, TMP
- shlq $4, TMP
- addq $(16*STRIDE_BLOCKS), KEY_POWERS
- subq TMP, KEY_POWERS
-
- movups (MSG), %xmm0
- pxor SUM, %xmm0
- movaps (KEY_POWERS), %xmm1
- schoolbook1_noload
- dec BLOCKS_LEFT
- addq $16, MSG
- addq $16, KEY_POWERS
-
- test $4, BLOCKS_LEFT
- jz .Lpartial4BlocksDone
- schoolbook1 4
- addq $(4*16), MSG
- addq $(4*16), KEY_POWERS
-.Lpartial4BlocksDone:
- test $2, BLOCKS_LEFT
- jz .Lpartial2BlocksDone
- schoolbook1 2
- addq $(2*16), MSG
- addq $(2*16), KEY_POWERS
-.Lpartial2BlocksDone:
- test $1, BLOCKS_LEFT
- jz .LpartialDone
- schoolbook1 1
-.LpartialDone:
- schoolbook2
- montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(clmul_polyval_mul)
- FRAME_BEGIN
- vmovdqa .Lgstar(%rip), GSTAR
- movups (%rdi), %xmm0
- movups (%rsi), %xmm1
- schoolbook1_noload
- schoolbook2
- montgomery_reduction SUM
- movups SUM, (%rdi)
- FRAME_END
- RET
-SYM_FUNC_END(clmul_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL. This computes:
- * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
- *
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- * const u8 *in, size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(clmul_polyval_update)
- FRAME_BEGIN
- vmovdqa .Lgstar(%rip), GSTAR
- movups (ACCUMULATOR), SUM
- subq $STRIDE_BLOCKS, BLOCKS_LEFT
- js .LstrideLoopExit
- full_stride 0
- subq $STRIDE_BLOCKS, BLOCKS_LEFT
- js .LstrideLoopExitReduce
-.LstrideLoop:
- full_stride 1
- subq $STRIDE_BLOCKS, BLOCKS_LEFT
- jns .LstrideLoop
-.LstrideLoopExitReduce:
- montgomery_reduction SUM
-.LstrideLoopExit:
- add $STRIDE_BLOCKS, BLOCKS_LEFT
- jz .LskipPartial
- partial_stride
-.LskipPartial:
- movups SUM, (ACCUMULATOR)
- FRAME_END
- RET
-SYM_FUNC_END(clmul_polyval_update)
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN 16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS 8
-
-struct polyval_tfm_ctx {
- /*
- * These powers must be in the order h^8, ..., h^1.
- */
- u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
- u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
- return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator)
-{
- kernel_fpu_begin();
- clmul_polyval_update(keys, in, nblocks, accumulator);
- kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
- kernel_fpu_begin();
- clmul_polyval_mul(op1, op2);
- kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
- const u8 *key, unsigned int keylen)
-{
- struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
- int i;
-
- if (keylen != POLYVAL_BLOCK_SIZE)
- return -EINVAL;
-
- memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
- for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
- memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
- internal_polyval_mul(tctx->key_powers[i],
- tctx->key_powers[i+1]);
- }
-
- return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
- memset(dctx, 0, sizeof(*dctx));
-
- return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
- const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
- unsigned int nblocks;
-
- do {
- /* Allow rescheduling every 4K bytes. */
- nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
- internal_polyval_update(tctx, src, nblocks, dctx->buffer);
- srclen -= nblocks * POLYVAL_BLOCK_SIZE;
- src += nblocks * POLYVAL_BLOCK_SIZE;
- } while (srclen >= POLYVAL_BLOCK_SIZE);
-
- return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *dst)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
- const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
- if (len) {
- crypto_xor(dctx->buffer, src, len);
- internal_polyval_mul(dctx->buffer,
- tctx->key_powers[NUM_KEY_POWERS-1]);
- }
-
- memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
- return 0;
-}
-
-static struct shash_alg polyval_alg = {
- .digestsize = POLYVAL_DIGEST_SIZE,
- .init = polyval_x86_init,
- .update = polyval_x86_update,
- .finup = polyval_x86_finup,
- .setkey = polyval_x86_setkey,
- .descsize = sizeof(struct polyval_desc_ctx),
- .base = {
- .cra_name = "polyval",
- .cra_driver_name = "polyval-clmulni",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = POLYVAL_BLOCK_SIZE,
- .cra_ctxsize = POLYVAL_CTX_SIZE,
- .cra_module = THIS_MODULE,
- },
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
- X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
- if (!x86_match_cpu(pcmul_cpu_id))
- return -ENODEV;
-
- if (!boot_cpu_has(X86_FEATURE_AVX))
- return -ENODEV;
-
- return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
- crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
#ifdef CONFIG_ARM64
/** @h_powers: Powers of the hash key H^8 through H^1 */
struct polyval_elem h_powers[8];
+#elif defined(CONFIG_X86)
+ /** @h_powers: Powers of the hash key H^8 through H^1 */
+ struct polyval_elem h_powers[8];
#else
#error "Unhandled arch"
#endif
bool
depends on CRYPTO_LIB_POLYVAL && !UML
default y if ARM64 && KERNEL_MODE_NEON
+ default y if X86_64
config CRYPTO_LIB_CHACHA20POLY1305
tristate
ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
endif
################################################################################
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
+#define TMP %rax
+
+.section .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+ .set i, 0
+ .rept (\count)
+ schoolbook1_iteration i 0
+ .set i, (i +1)
+ .endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += X_0 * Y_1 + X_1 * Y_0
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+ movups (16*\i)(MSG), %xmm0
+ .if (\i == 0 && \xor_sum == 1)
+ pxor SUM, %xmm0
+ .endif
+ vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+ vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+ vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+ vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+ vpxor %xmm2, MI, MI
+ vpxor %xmm1, LO, LO
+ vpxor %xmm4, HI, HI
+ vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+ vpclmulqdq $0x01, %xmm0, %xmm1, MI
+ vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+ vpclmulqdq $0x00, %xmm0, %xmm1, LO
+ vpclmulqdq $0x11, %xmm0, %xmm1, HI
+ vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+ vpslldq $8, MI, PL
+ vpsrldq $8, MI, PH
+ pxor LO, PL
+ pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+ pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
+ pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
+ pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+ vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+ pxor LO, LO
+ pxor HI, HI
+ pxor MI, MI
+
+ schoolbook1_iteration 7 0
+ .if \reduce
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 6 0
+ .if \reduce
+ pshufd $0b01001110, TMP_XMM, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 5 0
+ .if \reduce
+ pxor PL, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 4 0
+ .if \reduce
+ pxor TMP_XMM, PH
+ .endif
+
+ schoolbook1_iteration 3 0
+ .if \reduce
+ pclmulqdq $0x11, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 2 0
+ .if \reduce
+ vpxor TMP_XMM, PH, SUM
+ .endif
+
+ schoolbook1_iteration 1 0
+
+ schoolbook1_iteration 0 1
+
+ addq $(8*16), MSG
+ schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+ mov BLOCKS_LEFT, TMP
+ shlq $4, TMP
+ addq $(16*STRIDE_BLOCKS), KEY_POWERS
+ subq TMP, KEY_POWERS
+
+ movups (MSG), %xmm0
+ pxor SUM, %xmm0
+ movups (KEY_POWERS), %xmm1
+ schoolbook1_noload
+ dec BLOCKS_LEFT
+ addq $16, MSG
+ addq $16, KEY_POWERS
+
+ test $4, BLOCKS_LEFT
+ jz .Lpartial4BlocksDone
+ schoolbook1 4
+ addq $(4*16), MSG
+ addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+ test $2, BLOCKS_LEFT
+ jz .Lpartial2BlocksDone
+ schoolbook1 2
+ addq $(2*16), MSG
+ addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+ test $1, BLOCKS_LEFT
+ jz .LpartialDone
+ schoolbook1 1
+.LpartialDone:
+ schoolbook2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ * const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (%rdi), %xmm0
+ movups (%rsi), %xmm1
+ schoolbook1_noload
+ schoolbook2
+ montgomery_reduction SUM
+ movups SUM, (%rdi)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_mul_pclmul_avx)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
+ *
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (ACCUMULATOR), SUM
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExit
+ full_stride 0
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ jns .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ add $STRIDE_BLOCKS, BLOCKS_LEFT
+ jz .LskipPartial
+ partial_stride
+.LskipPartial:
+ movups SUM, (ACCUMULATOR)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pclmul_avx(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ kernel_fpu_end();
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ kernel_fpu_end();
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ polyval_blocks_pclmul_avx(acc, key, data, n);
+ kernel_fpu_end();
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ static_branch_enable(&have_pclmul_avx);
+}