lib/crypto: arm64/sha3: Migrate optimized code into library
author    Eric Biggers <ebiggers@kernel.org>
          Sun, 26 Oct 2025 05:50:26 +0000 (22:50 -0700)
committer Eric Biggers <ebiggers@kernel.org>
          Thu, 6 Nov 2025 04:02:35 +0000 (20:02 -0800)
Instead of exposing the arm64-optimized SHA-3 code via arm64-specific
crypto_shash algorithms, just implement the sha3_absorb_blocks() and
sha3_keccakf() library functions.  This is much simpler, it makes the
SHA-3 library functions arm64-optimized, and it fixes the longstanding
issue where the arm64-optimized SHA-3 code was disabled by default.
SHA-3 remains available through crypto_shash, but individual
architectures no longer need to handle it.

Note: to see the diff from arch/arm64/crypto/sha3-ce-glue.c to
lib/crypto/arm64/sha3.h, view this commit with 'git show -M10'.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251026055032.1413733-10-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
arch/arm64/configs/defconfig
arch/arm64/crypto/Kconfig
arch/arm64/crypto/Makefile
arch/arm64/crypto/sha3-ce-core.S [deleted file]
arch/arm64/crypto/sha3-ce-glue.c [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/arm64/sha3-ce-core.S [new file with mode: 0644]
lib/crypto/arm64/sha3.h [new file with mode: 0644]
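Before the per-file diffs, a note on what the new library hooks do.  Absorbing a
block means XORing block_size bytes into the low lanes of the 25x64-bit Keccak
state and then applying the Keccak-f[1600] permutation; the arm64
sha3_ce_transform() below fuses both steps into a single NEON pass per block.
For reference, here is a minimal C sketch of the generic absorb path.  It is
illustrative only: the *_sketch names are hypothetical, and the kernel's
sha3_absorb_blocks_generic() uses its own unaligned/endian helpers rather than
a bare memcpy.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct sha3_state_sketch {
	uint64_t st[25];		/* 1600-bit Keccak state, 25 lanes */
};

void keccakf_sketch(uint64_t st[25]);	/* permutation; a sketch appears at the end of this page */

static void sha3_absorb_blocks_sketch(struct sha3_state_sketch *state,
				      const uint8_t *data, size_t nblocks,
				      size_t block_size)
{
	while (nblocks--) {
		/* XOR the block into the first block_size/8 lanes */
		for (size_t i = 0; i < block_size / 8; i++) {
			uint64_t lane;

			memcpy(&lane, data + 8 * i, sizeof(lane));	/* assumes a little-endian host */
			state->st[i] ^= lane;
		}
		keccakf_sketch(state->st);	/* then permute the whole state */
		data += block_size;
	}
}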

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index e3a2d37bd10423b028f59dc40d6e8ee1c610d6b8..20dd3a39faead28e8f632c9d3f3073106e6ed93d 100644 (file)
@@ -1783,10 +1783,10 @@ CONFIG_CRYPTO_CHACHA20=m
 CONFIG_CRYPTO_BENCHMARK=m
 CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_SHA3=m
 CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_GHASH_ARM64_CE=y
-CONFIG_CRYPTO_SHA3_ARM64=m
 CONFIG_CRYPTO_SM3_ARM64_CE=m
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
 CONFIG_CRYPTO_AES_ARM64_BS=m
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 91f3093eee6ab51c5259c9ec1fb33c00c0a40799..376d6b50743fff170d78a37ac60b83264eef73ba 100644 (file)
@@ -25,17 +25,6 @@ config CRYPTO_NHPOLY1305_NEON
          Architecture: arm64 using:
          - NEON (Advanced SIMD) extensions
 
-config CRYPTO_SHA3_ARM64
-       tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)"
-       depends on KERNEL_MODE_NEON
-       select CRYPTO_HASH
-       select CRYPTO_SHA3
-       help
-         SHA-3 secure hash algorithms (FIPS 202)
-
-         Architecture: arm64 using:
-         - ARMv8.2 Crypto Extensions
-
 config CRYPTO_SM3_NEON
        tristate "Hash functions: SM3 (NEON)"
        depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index a8b2cdbe202c16befaf1ebd8a97406fea0cf9a34..fd3d590fa113797dd270a71475d83998276e16cb 100644 (file)
@@ -5,9 +5,6 @@
 # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 #
 
-obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
-sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
-
 obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o
 sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o
 
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
deleted file mode 100644 (file)
index b62bd71..0000000
+++ /dev/null
@@ -1,213 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       .irp    b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-       .set    .Lv\b\().2d, \b
-       .set    .Lv\b\().16b, \b
-       .endr
-
-       /*
-        * ARMv8.2 Crypto Extensions instructions
-        */
-       .macro  eor3, rd, rn, rm, ra
-       .inst   0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
-       .endm
-
-       .macro  rax1, rd, rn, rm
-       .inst   0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
-       .endm
-
-       .macro  bcax, rd, rn, rm, ra
-       .inst   0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
-       .endm
-
-       .macro  xar, rd, rn, rm, imm6
-       .inst   0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
-       .endm
-
-       /*
-        * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
-        *                          size_t nblocks, size_t block_size)
-        *
-        * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
-        * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
-        */
-       .text
-SYM_FUNC_START(sha3_ce_transform)
-       /* load state */
-       add     x8, x0, #32
-       ld1     { v0.1d- v3.1d}, [x0]
-       ld1     { v4.1d- v7.1d}, [x8], #32
-       ld1     { v8.1d-v11.1d}, [x8], #32
-       ld1     {v12.1d-v15.1d}, [x8], #32
-       ld1     {v16.1d-v19.1d}, [x8], #32
-       ld1     {v20.1d-v23.1d}, [x8], #32
-       ld1     {v24.1d}, [x8]
-
-0:     sub     x2, x2, #1
-       mov     w8, #24
-       adr_l   x9, .Lsha3_rcon
-
-       /* load input */
-       ld1     {v25.8b-v28.8b}, [x1], #32
-       ld1     {v29.8b}, [x1], #8
-       eor     v0.8b, v0.8b, v25.8b
-       eor     v1.8b, v1.8b, v26.8b
-       eor     v2.8b, v2.8b, v27.8b
-       eor     v3.8b, v3.8b, v28.8b
-       eor     v4.8b, v4.8b, v29.8b
-
-       ld1     {v25.8b-v28.8b}, [x1], #32
-       eor     v5.8b, v5.8b, v25.8b
-       eor     v6.8b, v6.8b, v26.8b
-       eor     v7.8b, v7.8b, v27.8b
-       eor     v8.8b, v8.8b, v28.8b
-       cmp     x3, #72
-       b.eq    3f      /* SHA3-512 (block_size=72)? */
-
-       ld1     {v25.8b-v28.8b}, [x1], #32
-       eor     v9.8b, v9.8b, v25.8b
-       eor     v10.8b, v10.8b, v26.8b
-       eor     v11.8b, v11.8b, v27.8b
-       eor     v12.8b, v12.8b, v28.8b
-       cmp     x3, #104
-       b.eq    3f      /* SHA3-384 (block_size=104)? */
-
-       ld1     {v25.8b-v28.8b}, [x1], #32
-       eor     v13.8b, v13.8b, v25.8b
-       eor     v14.8b, v14.8b, v26.8b
-       eor     v15.8b, v15.8b, v27.8b
-       eor     v16.8b, v16.8b, v28.8b
-       cmp     x3, #144
-       b.lt    3f      /* SHA3-256 or SHAKE256 (block_size=136)? */
-       b.eq    2f      /* SHA3-224 (block_size=144)? */
-
-       /* SHAKE128 (block_size=168) */
-       ld1     {v25.8b-v28.8b}, [x1], #32
-       eor     v17.8b, v17.8b, v25.8b
-       eor     v18.8b, v18.8b, v26.8b
-       eor     v19.8b, v19.8b, v27.8b
-       eor     v20.8b, v20.8b, v28.8b
-       b       3f
-2:
-       /* SHA3-224 (block_size=144) */
-       ld1     {v25.8b}, [x1], #8
-       eor     v17.8b, v17.8b, v25.8b
-
-3:     sub     w8, w8, #1
-
-       eor3    v29.16b,  v4.16b,  v9.16b, v14.16b
-       eor3    v26.16b,  v1.16b,  v6.16b, v11.16b
-       eor3    v28.16b,  v3.16b,  v8.16b, v13.16b
-       eor3    v25.16b,  v0.16b,  v5.16b, v10.16b
-       eor3    v27.16b,  v2.16b,  v7.16b, v12.16b
-       eor3    v29.16b, v29.16b, v19.16b, v24.16b
-       eor3    v26.16b, v26.16b, v16.16b, v21.16b
-       eor3    v28.16b, v28.16b, v18.16b, v23.16b
-       eor3    v25.16b, v25.16b, v15.16b, v20.16b
-       eor3    v27.16b, v27.16b, v17.16b, v22.16b
-
-       rax1    v30.2d, v29.2d, v26.2d  // bc[0]
-       rax1    v26.2d, v26.2d, v28.2d  // bc[2]
-       rax1    v28.2d, v28.2d, v25.2d  // bc[4]
-       rax1    v25.2d, v25.2d, v27.2d  // bc[1]
-       rax1    v27.2d, v27.2d, v29.2d  // bc[3]
-
-       eor      v0.16b,  v0.16b, v30.16b
-       xar      v29.2d,   v1.2d,  v25.2d, (64 - 1)
-       xar       v1.2d,   v6.2d,  v25.2d, (64 - 44)
-       xar       v6.2d,   v9.2d,  v28.2d, (64 - 20)
-       xar       v9.2d,  v22.2d,  v26.2d, (64 - 61)
-       xar      v22.2d,  v14.2d,  v28.2d, (64 - 39)
-       xar      v14.2d,  v20.2d,  v30.2d, (64 - 18)
-       xar      v31.2d,   v2.2d,  v26.2d, (64 - 62)
-       xar       v2.2d,  v12.2d,  v26.2d, (64 - 43)
-       xar      v12.2d,  v13.2d,  v27.2d, (64 - 25)
-       xar      v13.2d,  v19.2d,  v28.2d, (64 - 8)
-       xar      v19.2d,  v23.2d,  v27.2d, (64 - 56)
-       xar      v23.2d,  v15.2d,  v30.2d, (64 - 41)
-       xar      v15.2d,   v4.2d,  v28.2d, (64 - 27)
-       xar      v28.2d,  v24.2d,  v28.2d, (64 - 14)
-       xar      v24.2d,  v21.2d,  v25.2d, (64 - 2)
-       xar       v8.2d,   v8.2d,  v27.2d, (64 - 55)
-       xar       v4.2d,  v16.2d,  v25.2d, (64 - 45)
-       xar      v16.2d,   v5.2d,  v30.2d, (64 - 36)
-       xar       v5.2d,   v3.2d,  v27.2d, (64 - 28)
-       xar      v27.2d,  v18.2d,  v27.2d, (64 - 21)
-       xar       v3.2d,  v17.2d,  v26.2d, (64 - 15)
-       xar      v25.2d,  v11.2d,  v25.2d, (64 - 10)
-       xar      v26.2d,   v7.2d,  v26.2d, (64 - 6)
-       xar      v30.2d,  v10.2d,  v30.2d, (64 - 3)
-
-       bcax    v20.16b, v31.16b, v22.16b,  v8.16b
-       bcax    v21.16b,  v8.16b, v23.16b, v22.16b
-       bcax    v22.16b, v22.16b, v24.16b, v23.16b
-       bcax    v23.16b, v23.16b, v31.16b, v24.16b
-       bcax    v24.16b, v24.16b,  v8.16b, v31.16b
-
-       ld1r    {v31.2d}, [x9], #8
-
-       bcax    v17.16b, v25.16b, v19.16b,  v3.16b
-       bcax    v18.16b,  v3.16b, v15.16b, v19.16b
-       bcax    v19.16b, v19.16b, v16.16b, v15.16b
-       bcax    v15.16b, v15.16b, v25.16b, v16.16b
-       bcax    v16.16b, v16.16b,  v3.16b, v25.16b
-
-       bcax    v10.16b, v29.16b, v12.16b, v26.16b
-       bcax    v11.16b, v26.16b, v13.16b, v12.16b
-       bcax    v12.16b, v12.16b, v14.16b, v13.16b
-       bcax    v13.16b, v13.16b, v29.16b, v14.16b
-       bcax    v14.16b, v14.16b, v26.16b, v29.16b
-
-       bcax     v7.16b, v30.16b,  v9.16b,  v4.16b
-       bcax     v8.16b,  v4.16b,  v5.16b,  v9.16b
-       bcax     v9.16b,  v9.16b,  v6.16b,  v5.16b
-       bcax     v5.16b,  v5.16b, v30.16b,  v6.16b
-       bcax     v6.16b,  v6.16b,  v4.16b, v30.16b
-
-       bcax     v3.16b, v27.16b,  v0.16b, v28.16b
-       bcax     v4.16b, v28.16b,  v1.16b,  v0.16b
-       bcax     v0.16b,  v0.16b,  v2.16b,  v1.16b
-       bcax     v1.16b,  v1.16b, v27.16b,  v2.16b
-       bcax     v2.16b,  v2.16b, v28.16b, v27.16b
-
-       eor      v0.16b,  v0.16b, v31.16b
-
-       cbnz    w8, 3b
-       cond_yield 4f, x8, x9
-       cbnz    x2, 0b
-
-       /* save state */
-4:     st1     { v0.1d- v3.1d}, [x0], #32
-       st1     { v4.1d- v7.1d}, [x0], #32
-       st1     { v8.1d-v11.1d}, [x0], #32
-       st1     {v12.1d-v15.1d}, [x0], #32
-       st1     {v16.1d-v19.1d}, [x0], #32
-       st1     {v20.1d-v23.1d}, [x0], #32
-       st1     {v24.1d}, [x0]
-       mov     x0, x2
-       ret
-SYM_FUNC_END(sha3_ce_transform)
-
-       .section        ".rodata", "a"
-       .align          8
-.Lsha3_rcon:
-       .quad   0x0000000000000001, 0x0000000000008082, 0x800000000000808a
-       .quad   0x8000000080008000, 0x000000000000808b, 0x0000000080000001
-       .quad   0x8000000080008081, 0x8000000000008009, 0x000000000000008a
-       .quad   0x0000000000000088, 0x0000000080008009, 0x000000008000000a
-       .quad   0x000000008000808b, 0x800000000000008b, 0x8000000000008089
-       .quad   0x8000000000008003, 0x8000000000008002, 0x8000000000000080
-       .quad   0x000000000000800a, 0x800000008000000a, 0x8000000080008081
-       .quad   0x8000000000008080, 0x0000000080000001, 0x8000000080008008
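The eor3/rax1/xar/bcax macros above hand-encode the ARMv8.2-SHA3 vector
instructions via .inst so the file also assembles with toolchains that predate
the mnemonics.  Below is a scalar C sketch of their per-64-bit-lane semantics
(the real instructions operate on a full 128-bit vector; the *_lane helper
names are hypothetical).  It also shows why the rho rotations in the code use
(64 - r) immediates: xar only rotates right, so a left rotation by r is
encoded as a right rotation by 64 - r.

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* EOR3: three-way XOR, used for the theta column parities */
static inline uint64_t eor3_lane(uint64_t n, uint64_t m, uint64_t a)
{
	return n ^ m ^ a;
}

/* RAX1: XOR with the second operand rotated left by 1, also theta */
static inline uint64_t rax1_lane(uint64_t n, uint64_t m)
{
	return n ^ ror64(m, 63);
}

/* XAR: XOR then rotate right by imm6, used for the rho/pi rotations */
static inline uint64_t xar_lane(uint64_t n, uint64_t m, unsigned int imm6)
{
	return ror64(n ^ m, imm6);
}

/* BCAX: bit clear and XOR, n ^ (m & ~a), used for the chi step */
static inline uint64_t bcax_lane(uint64_t n, uint64_t m, uint64_t a)
{
	return n ^ (m & ~a);
}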
diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
deleted file mode 100644 (file)
index 250f4fb..0000000
+++ /dev/null
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha3.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha3-224");
-MODULE_ALIAS_CRYPTO("sha3-256");
-MODULE_ALIAS_CRYPTO("sha3-384");
-MODULE_ALIAS_CRYPTO("sha3-512");
-
-asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
-                                   size_t nblocks, size_t block_size);
-
-static int arm64_sha3_update(struct shash_desc *desc, const u8 *data,
-                            unsigned int len)
-{
-       struct sha3_state *sctx = shash_desc_ctx(desc);
-       struct crypto_shash *tfm = desc->tfm;
-       unsigned int bs;
-       int blocks;
-
-       bs = crypto_shash_blocksize(tfm);
-       blocks = len / bs;
-       len -= blocks * bs;
-       do {
-               int rem;
-
-               kernel_neon_begin();
-               rem = sha3_ce_transform(sctx, data, blocks, bs);
-               kernel_neon_end();
-               data += (blocks - rem) * bs;
-               blocks = rem;
-       } while (blocks);
-       return len;
-}
-
-static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len,
-                     u8 *out)
-{
-       struct sha3_state *sctx = shash_desc_ctx(desc);
-       struct crypto_shash *tfm = desc->tfm;
-       __le64 *digest = (__le64 *)out;
-       u8 block[SHA3_224_BLOCK_SIZE];
-       unsigned int bs, ds;
-       int i;
-
-       ds = crypto_shash_digestsize(tfm);
-       bs = crypto_shash_blocksize(tfm);
-       memcpy(block, src, len);
-
-       block[len++] = 0x06;
-       memset(block + len, 0, bs - len);
-       block[bs - 1] |= 0x80;
-
-       kernel_neon_begin();
-       sha3_ce_transform(sctx, block, 1, bs);
-       kernel_neon_end();
-       memzero_explicit(block , sizeof(block));
-
-       for (i = 0; i < ds / 8; i++)
-               put_unaligned_le64(sctx->st[i], digest++);
-
-       if (ds & 4)
-               put_unaligned_le32(sctx->st[i], (__le32 *)digest);
-
-       return 0;
-}
-
-static struct shash_alg algs[] = { {
-       .digestsize             = SHA3_224_DIGEST_SIZE,
-       .init                   = crypto_sha3_init,
-       .update                 = arm64_sha3_update,
-       .finup                  = sha3_finup,
-       .descsize               = SHA3_STATE_SIZE,
-       .base.cra_name          = "sha3-224",
-       .base.cra_driver_name   = "sha3-224-ce",
-       .base.cra_flags         = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-       .base.cra_blocksize     = SHA3_224_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-       .base.cra_priority      = 200,
-}, {
-       .digestsize             = SHA3_256_DIGEST_SIZE,
-       .init                   = crypto_sha3_init,
-       .update                 = arm64_sha3_update,
-       .finup                  = sha3_finup,
-       .descsize               = SHA3_STATE_SIZE,
-       .base.cra_name          = "sha3-256",
-       .base.cra_driver_name   = "sha3-256-ce",
-       .base.cra_flags         = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-       .base.cra_blocksize     = SHA3_256_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-       .base.cra_priority      = 200,
-}, {
-       .digestsize             = SHA3_384_DIGEST_SIZE,
-       .init                   = crypto_sha3_init,
-       .update                 = arm64_sha3_update,
-       .finup                  = sha3_finup,
-       .descsize               = SHA3_STATE_SIZE,
-       .base.cra_name          = "sha3-384",
-       .base.cra_driver_name   = "sha3-384-ce",
-       .base.cra_flags         = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-       .base.cra_blocksize     = SHA3_384_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-       .base.cra_priority      = 200,
-}, {
-       .digestsize             = SHA3_512_DIGEST_SIZE,
-       .init                   = crypto_sha3_init,
-       .update                 = arm64_sha3_update,
-       .finup                  = sha3_finup,
-       .descsize               = SHA3_STATE_SIZE,
-       .base.cra_name          = "sha3-512",
-       .base.cra_driver_name   = "sha3-512-ce",
-       .base.cra_flags         = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-       .base.cra_blocksize     = SHA3_512_BLOCK_SIZE,
-       .base.cra_module        = THIS_MODULE,
-       .base.cra_priority      = 200,
-} };
-
-static int __init sha3_neon_mod_init(void)
-{
-       return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha3_neon_mod_fini(void)
-{
-       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA3, sha3_neon_mod_init);
-module_exit(sha3_neon_mod_fini);
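The finup path removed above also documents the SHA-3 padding rule: the
fixed-output SHA-3 functions append the two domain-separation bits 01 together
with the first pad bit as the byte 0x06, zero-fill to the end of the rate, and
set the top bit of the final rate byte for the closing '1' of pad10*1 (a block
that is full except for one byte therefore gets the combined value 0x86).  A
hedged C sketch of just that padding step, with a hypothetical name and not
the library's actual finalization code:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Pad the last, partially filled block in place; len < block_size. */
static void sha3_pad_final_block_sketch(uint8_t *block, size_t len,
					size_t block_size)
{
	block[len] = 0x06;			/* suffix 01 + first pad bit; SHAKE would use 0x1f */
	memset(block + len + 1, 0, block_size - len - 1);
	block[block_size - 1] |= 0x80;		/* closing '1' bit of pad10*1 */
}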
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index a05f5a349cd8cebf38b2675a62e045077e8279e5..587490ca65654940c85370f57dae1a7577a5c9ae 100644 (file)
@@ -202,6 +202,11 @@ config CRYPTO_LIB_SHA3
          The SHA3 library functions.  Select this if your module uses any of
          the functions from <crypto/sha3.h>.
 
+config CRYPTO_LIB_SHA3_ARCH
+       bool
+       depends on CRYPTO_LIB_SHA3 && !UML
+       default y if ARM64 && KERNEL_MODE_NEON
+
 config CRYPTO_LIB_SM3
        tristate
 
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 0cfdb511f32b6d85f5327525fab05bb09d7cd907..5515e73bfd5e33ba90a44c32648fe130c6785c19 100644 (file)
@@ -281,6 +281,11 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
 obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
 libsha3-y := sha3.o
 
+ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y)
+CFLAGS_sha3.o += -I$(src)/$(SRCARCH)
+libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o
+endif # CONFIG_CRYPTO_LIB_SHA3_ARCH
+
 ################################################################################
 
 obj-$(CONFIG_MPILIB) += mpi/
diff --git a/lib/crypto/arm64/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S
new file mode 100644 (file)
index 0000000..b62bd71
--- /dev/null
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .irp    b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+       .set    .Lv\b\().2d, \b
+       .set    .Lv\b\().16b, \b
+       .endr
+
+       /*
+        * ARMv8.2 Crypto Extensions instructions
+        */
+       .macro  eor3, rd, rn, rm, ra
+       .inst   0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+       .endm
+
+       .macro  rax1, rd, rn, rm
+       .inst   0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+       .endm
+
+       .macro  bcax, rd, rn, rm, ra
+       .inst   0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+       .endm
+
+       .macro  xar, rd, rn, rm, imm6
+       .inst   0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+       .endm
+
+       /*
+        * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+        *                          size_t nblocks, size_t block_size)
+        *
+        * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
+        * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
+        */
+       .text
+SYM_FUNC_START(sha3_ce_transform)
+       /* load state */
+       add     x8, x0, #32
+       ld1     { v0.1d- v3.1d}, [x0]
+       ld1     { v4.1d- v7.1d}, [x8], #32
+       ld1     { v8.1d-v11.1d}, [x8], #32
+       ld1     {v12.1d-v15.1d}, [x8], #32
+       ld1     {v16.1d-v19.1d}, [x8], #32
+       ld1     {v20.1d-v23.1d}, [x8], #32
+       ld1     {v24.1d}, [x8]
+
+0:     sub     x2, x2, #1
+       mov     w8, #24
+       adr_l   x9, .Lsha3_rcon
+
+       /* load input */
+       ld1     {v25.8b-v28.8b}, [x1], #32
+       ld1     {v29.8b}, [x1], #8
+       eor     v0.8b, v0.8b, v25.8b
+       eor     v1.8b, v1.8b, v26.8b
+       eor     v2.8b, v2.8b, v27.8b
+       eor     v3.8b, v3.8b, v28.8b
+       eor     v4.8b, v4.8b, v29.8b
+
+       ld1     {v25.8b-v28.8b}, [x1], #32
+       eor     v5.8b, v5.8b, v25.8b
+       eor     v6.8b, v6.8b, v26.8b
+       eor     v7.8b, v7.8b, v27.8b
+       eor     v8.8b, v8.8b, v28.8b
+       cmp     x3, #72
+       b.eq    3f      /* SHA3-512 (block_size=72)? */
+
+       ld1     {v25.8b-v28.8b}, [x1], #32
+       eor     v9.8b, v9.8b, v25.8b
+       eor     v10.8b, v10.8b, v26.8b
+       eor     v11.8b, v11.8b, v27.8b
+       eor     v12.8b, v12.8b, v28.8b
+       cmp     x3, #104
+       b.eq    3f      /* SHA3-384 (block_size=104)? */
+
+       ld1     {v25.8b-v28.8b}, [x1], #32
+       eor     v13.8b, v13.8b, v25.8b
+       eor     v14.8b, v14.8b, v26.8b
+       eor     v15.8b, v15.8b, v27.8b
+       eor     v16.8b, v16.8b, v28.8b
+       cmp     x3, #144
+       b.lt    3f      /* SHA3-256 or SHAKE256 (block_size=136)? */
+       b.eq    2f      /* SHA3-224 (block_size=144)? */
+
+       /* SHAKE128 (block_size=168) */
+       ld1     {v25.8b-v28.8b}, [x1], #32
+       eor     v17.8b, v17.8b, v25.8b
+       eor     v18.8b, v18.8b, v26.8b
+       eor     v19.8b, v19.8b, v27.8b
+       eor     v20.8b, v20.8b, v28.8b
+       b       3f
+2:
+       /* SHA3-224 (block_size=144) */
+       ld1     {v25.8b}, [x1], #8
+       eor     v17.8b, v17.8b, v25.8b
+
+3:     sub     w8, w8, #1
+
+       eor3    v29.16b,  v4.16b,  v9.16b, v14.16b
+       eor3    v26.16b,  v1.16b,  v6.16b, v11.16b
+       eor3    v28.16b,  v3.16b,  v8.16b, v13.16b
+       eor3    v25.16b,  v0.16b,  v5.16b, v10.16b
+       eor3    v27.16b,  v2.16b,  v7.16b, v12.16b
+       eor3    v29.16b, v29.16b, v19.16b, v24.16b
+       eor3    v26.16b, v26.16b, v16.16b, v21.16b
+       eor3    v28.16b, v28.16b, v18.16b, v23.16b
+       eor3    v25.16b, v25.16b, v15.16b, v20.16b
+       eor3    v27.16b, v27.16b, v17.16b, v22.16b
+
+       rax1    v30.2d, v29.2d, v26.2d  // bc[0]
+       rax1    v26.2d, v26.2d, v28.2d  // bc[2]
+       rax1    v28.2d, v28.2d, v25.2d  // bc[4]
+       rax1    v25.2d, v25.2d, v27.2d  // bc[1]
+       rax1    v27.2d, v27.2d, v29.2d  // bc[3]
+
+       eor      v0.16b,  v0.16b, v30.16b
+       xar      v29.2d,   v1.2d,  v25.2d, (64 - 1)
+       xar       v1.2d,   v6.2d,  v25.2d, (64 - 44)
+       xar       v6.2d,   v9.2d,  v28.2d, (64 - 20)
+       xar       v9.2d,  v22.2d,  v26.2d, (64 - 61)
+       xar      v22.2d,  v14.2d,  v28.2d, (64 - 39)
+       xar      v14.2d,  v20.2d,  v30.2d, (64 - 18)
+       xar      v31.2d,   v2.2d,  v26.2d, (64 - 62)
+       xar       v2.2d,  v12.2d,  v26.2d, (64 - 43)
+       xar      v12.2d,  v13.2d,  v27.2d, (64 - 25)
+       xar      v13.2d,  v19.2d,  v28.2d, (64 - 8)
+       xar      v19.2d,  v23.2d,  v27.2d, (64 - 56)
+       xar      v23.2d,  v15.2d,  v30.2d, (64 - 41)
+       xar      v15.2d,   v4.2d,  v28.2d, (64 - 27)
+       xar      v28.2d,  v24.2d,  v28.2d, (64 - 14)
+       xar      v24.2d,  v21.2d,  v25.2d, (64 - 2)
+       xar       v8.2d,   v8.2d,  v27.2d, (64 - 55)
+       xar       v4.2d,  v16.2d,  v25.2d, (64 - 45)
+       xar      v16.2d,   v5.2d,  v30.2d, (64 - 36)
+       xar       v5.2d,   v3.2d,  v27.2d, (64 - 28)
+       xar      v27.2d,  v18.2d,  v27.2d, (64 - 21)
+       xar       v3.2d,  v17.2d,  v26.2d, (64 - 15)
+       xar      v25.2d,  v11.2d,  v25.2d, (64 - 10)
+       xar      v26.2d,   v7.2d,  v26.2d, (64 - 6)
+       xar      v30.2d,  v10.2d,  v30.2d, (64 - 3)
+
+       bcax    v20.16b, v31.16b, v22.16b,  v8.16b
+       bcax    v21.16b,  v8.16b, v23.16b, v22.16b
+       bcax    v22.16b, v22.16b, v24.16b, v23.16b
+       bcax    v23.16b, v23.16b, v31.16b, v24.16b
+       bcax    v24.16b, v24.16b,  v8.16b, v31.16b
+
+       ld1r    {v31.2d}, [x9], #8
+
+       bcax    v17.16b, v25.16b, v19.16b,  v3.16b
+       bcax    v18.16b,  v3.16b, v15.16b, v19.16b
+       bcax    v19.16b, v19.16b, v16.16b, v15.16b
+       bcax    v15.16b, v15.16b, v25.16b, v16.16b
+       bcax    v16.16b, v16.16b,  v3.16b, v25.16b
+
+       bcax    v10.16b, v29.16b, v12.16b, v26.16b
+       bcax    v11.16b, v26.16b, v13.16b, v12.16b
+       bcax    v12.16b, v12.16b, v14.16b, v13.16b
+       bcax    v13.16b, v13.16b, v29.16b, v14.16b
+       bcax    v14.16b, v14.16b, v26.16b, v29.16b
+
+       bcax     v7.16b, v30.16b,  v9.16b,  v4.16b
+       bcax     v8.16b,  v4.16b,  v5.16b,  v9.16b
+       bcax     v9.16b,  v9.16b,  v6.16b,  v5.16b
+       bcax     v5.16b,  v5.16b, v30.16b,  v6.16b
+       bcax     v6.16b,  v6.16b,  v4.16b, v30.16b
+
+       bcax     v3.16b, v27.16b,  v0.16b, v28.16b
+       bcax     v4.16b, v28.16b,  v1.16b,  v0.16b
+       bcax     v0.16b,  v0.16b,  v2.16b,  v1.16b
+       bcax     v1.16b,  v1.16b, v27.16b,  v2.16b
+       bcax     v2.16b,  v2.16b, v28.16b, v27.16b
+
+       eor      v0.16b,  v0.16b, v31.16b
+
+       cbnz    w8, 3b
+       cond_yield 4f, x8, x9
+       cbnz    x2, 0b
+
+       /* save state */
+4:     st1     { v0.1d- v3.1d}, [x0], #32
+       st1     { v4.1d- v7.1d}, [x0], #32
+       st1     { v8.1d-v11.1d}, [x0], #32
+       st1     {v12.1d-v15.1d}, [x0], #32
+       st1     {v16.1d-v19.1d}, [x0], #32
+       st1     {v20.1d-v23.1d}, [x0], #32
+       st1     {v24.1d}, [x0]
+       mov     x0, x2
+       ret
+SYM_FUNC_END(sha3_ce_transform)
+
+       .section        ".rodata", "a"
+       .align          8
+.Lsha3_rcon:
+       .quad   0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+       .quad   0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+       .quad   0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+       .quad   0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+       .quad   0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+       .quad   0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+       .quad   0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+       .quad   0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h
new file mode 100644 (file)
index 0000000..6dd5183
--- /dev/null
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+
+asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+                                   size_t nblocks, size_t block_size);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+                              size_t nblocks, size_t block_size)
+{
+       if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+               do {
+                       size_t rem;
+
+                       kernel_neon_begin();
+                       rem = sha3_ce_transform(state, data, nblocks,
+                                               block_size);
+                       kernel_neon_end();
+                       data += (nblocks - rem) * block_size;
+                       nblocks = rem;
+               } while (nblocks);
+       } else {
+               sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+       }
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+       if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+               /*
+                * Passing zeroes into sha3_ce_transform() gives the plain
+                * Keccak-f permutation, which is what we want here.  Any
+                * supported block size may be used.  Use SHA3_512_BLOCK_SIZE
+                * since it's the shortest.
+                */
+               static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+               kernel_neon_begin();
+               sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
+               kernel_neon_end();
+       } else {
+               sha3_keccakf_generic(state);
+       }
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+       if (cpu_have_named_feature(SHA3))
+               static_branch_enable(&have_sha3);
+}