From: Eric Biggers Date: Sat, 12 Jul 2025 23:23:04 +0000 (-0700) Subject: lib/crypto: x86/sha1: Migrate optimized code into library X-Git-Tag: ceph-for-6.17-rc6~328^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f3d6cb3dc0394b866bc0d1e15157ce45844cf3d3;p=ceph-client.git lib/crypto: x86/sha1: Migrate optimized code into library Instead of exposing the x86-optimized SHA-1 code via x86-specific crypto_shash algorithms, instead just implement the sha1_blocks() library function. This is much simpler, it makes the SHA-1 library functions be x86-optimized, and it fixes the longstanding issue where the x86-optimized SHA-1 code was disabled by default. SHA-1 still remains available through crypto_shash, but individual architectures no longer need to handle it. To match sha1_blocks(), change the type of the nblocks parameter of the assembly functions from int to size_t. The assembly functions actually already treated it as size_t. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-14-ebiggers@kernel.org Signed-off-by: Eric Biggers --- diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index eb641a300154..94016c60561e 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -376,20 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI Architecture: x86_64 using: - CLMUL-NI (carry-less multiplication new instructions) -config CRYPTO_SHA1_SSSE3 - tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)" - depends on 64BIT - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - - SHA-NI (SHA Extensions New Instructions) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index d31348be8370..d402963d6b57 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -51,9 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o endif -obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S deleted file mode 100644 index 4b49bdc95265..000000000000 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Implement fast SHA-1 with AVX2 instructions. (x86_64) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Ilya Albrekht - * Maxim Locktyukhin - * Ronen Zohar - * Chandramouli Narayanan - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. - * - *This implementation is based on the previous SSSE3 release: - *Visit http://software.intel.com/en-us/articles/ - *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ - * - *Updates 20-byte SHA-1 record at start of 'state', from 'input', for - *even number of 'blocks' consecutive 64-byte blocks. - * - *extern "C" void sha1_transform_avx2( - * struct sha1_state *state, const u8* input, int blocks ); - */ - -#include - -#define CTX %rdi /* arg1 */ -#define BUF %rsi /* arg2 */ -#define CNT %rdx /* arg3 */ - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %eax -#define REG_E %edx -#define REG_TB %ebx -#define REG_TA %r12d -#define REG_RA %rcx -#define REG_RB %rsi -#define REG_RC %rdi -#define REG_RD %rax -#define REG_RE %rdx -#define REG_RTA %r12 -#define REG_RTB %rbx -#define REG_T1 %r11d -#define xmm_mov vmovups -#define avx2_zeroupper vzeroupper -#define RND_F1 1 -#define RND_F2 2 -#define RND_F3 3 - -.macro REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set TB, REG_TB - .set TA, REG_TA - - .set RA, REG_RA - .set RB, REG_RB - .set RC, REG_RC - .set RD, REG_RD - .set RE, REG_RE - - .set RTA, REG_RTA - .set RTB, REG_RTB - - .set T1, REG_T1 -.endm - -#define HASH_PTR %r9 -#define BLOCKS_CTR %r8 -#define BUFFER_PTR %r10 -#define BUFFER_PTR2 %r13 - -#define PRECALC_BUF %r14 -#define WK_BUF %r15 - -#define W_TMP %xmm0 -#define WY_TMP %ymm0 -#define WY_TMP2 %ymm9 - -# AVX2 variables -#define WY0 %ymm3 -#define WY4 %ymm5 -#define WY08 %ymm7 -#define WY12 %ymm8 -#define WY16 %ymm12 -#define WY20 %ymm13 -#define WY24 %ymm14 -#define WY28 %ymm15 - -#define YMM_SHUFB_BSWAP %ymm10 - -/* - * Keep 2 iterations precalculated at a time: - * - 80 DWORDs per iteration * 2 - */ -#define W_SIZE (80*2*2 +16) - -#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) -#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) - - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -.macro PRECALC_RESET_WY - .set WY_00, WY0 - .set WY_04, WY4 - .set WY_08, WY08 - .set WY_12, WY12 - .set WY_16, WY16 - .set WY_20, WY20 - .set WY_24, WY24 - .set WY_28, WY28 - .set WY_32, WY_00 -.endm - -.macro PRECALC_ROTATE_WY - /* Rotate macros */ - .set WY_32, WY_28 - .set WY_28, WY_24 - .set WY_24, WY_20 - .set WY_20, WY_16 - .set WY_16, WY_12 - .set WY_12, WY_08 - .set WY_08, WY_04 - .set WY_04, WY_00 - .set WY_00, WY_32 - - /* Define register aliases */ - .set WY, WY_00 - .set WY_minus_04, WY_04 - .set WY_minus_08, WY_08 - .set WY_minus_12, WY_12 - .set WY_minus_16, WY_16 - .set WY_minus_20, WY_20 - .set WY_minus_24, WY_24 - .set WY_minus_28, WY_28 - .set WY_minus_32, WY -.endm - -.macro PRECALC_00_15 - .if (i == 0) # Initialize and rotate registers - PRECALC_RESET_WY - PRECALC_ROTATE_WY - .endif - - /* message scheduling pre-compute for rounds 0-15 */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vmovdqu (i * 2)(BUFFER_PTR), W_TMP - .elseif ((i & 7) == 1) - vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ - WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY - .elseif ((i & 7) == 4) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - .elseif ((i & 7) == 7) - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC_16_31 - /* - * message scheduling pre-compute for rounds 16-31 - * calculating last 32 w[i] values in 8 XMM registers - * pre-calculate K+w[i] values and store to mem - * for later load by ALU add instruction - * - * "brute force" vectorization for rounds 16-31 only - * due to w[i]->w[i-3] dependency - */ - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - /* w[i-14] */ - vpalignr $8, WY_minus_16, WY_minus_12, WY - vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ - .elseif ((i & 7) == 1) - vpxor WY_minus_08, WY, WY - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 2) - vpxor WY_TMP, WY, WY - vpslldq $12, WY, WY_TMP2 - .elseif ((i & 7) == 3) - vpslld $1, WY, WY_TMP - vpsrld $31, WY, WY - .elseif ((i & 7) == 4) - vpor WY, WY_TMP, WY_TMP - vpslld $2, WY_TMP2, WY - .elseif ((i & 7) == 5) - vpsrld $30, WY_TMP2, WY_TMP2 - vpxor WY, WY_TMP, WY_TMP - .elseif ((i & 7) == 7) - vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC_32_79 - /* - * in SHA-1 specification: - * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: - * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization - * since w[i]=>w[i-3] dependency is broken - */ - - .if ((i & 7) == 0) - /* - * blended AVX2 and ALU instruction scheduling - * 1 vector iteration per 8 rounds - */ - vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP - .elseif ((i & 7) == 1) - /* W is W_minus_32 before xor */ - vpxor WY_minus_28, WY, WY - .elseif ((i & 7) == 2) - vpxor WY_minus_16, WY_TMP, WY_TMP - .elseif ((i & 7) == 3) - vpxor WY_TMP, WY, WY - .elseif ((i & 7) == 4) - vpslld $2, WY, WY_TMP - .elseif ((i & 7) == 5) - vpsrld $30, WY, WY - vpor WY, WY_TMP, WY - .elseif ((i & 7) == 7) - vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP - vmovdqu WY_TMP, PRECALC_WK(i&~7) - - PRECALC_ROTATE_WY - .endif -.endm - -.macro PRECALC r, s - .set i, \r - - .if (i < 40) - .set K_XMM, 32*0 - .elseif (i < 80) - .set K_XMM, 32*1 - .elseif (i < 120) - .set K_XMM, 32*2 - .else - .set K_XMM, 32*3 - .endif - - .if (i<32) - PRECALC_00_15 \s - .elseif (i<64) - PRECALC_16_31 \s - .elseif (i < 160) - PRECALC_32_79 \s - .endif -.endm - -.macro ROTATE_STATE - .set T_REG, E - .set E, D - .set D, C - .set C, B - .set B, TB - .set TB, A - .set A, T_REG - - .set T_REG, RE - .set RE, RD - .set RD, RC - .set RC, RB - .set RB, RTB - .set RTB, RA - .set RA, T_REG -.endm - -/* Macro relies on saved ROUND_Fx */ - -.macro RND_FUN f, r - .if (\f == RND_F1) - ROUND_F1 \r - .elseif (\f == RND_F2) - ROUND_F2 \r - .elseif (\f == RND_F3) - ROUND_F3 \r - .endif -.endm - -.macro RR r - .set round_id, (\r % 80) - - .if (round_id == 0) /* Precalculate F for first round */ - .set ROUND_FUNC, RND_F1 - mov B, TB - - rorx $(32-30), B, B /* b>>>2 */ - andn D, TB, T1 - and C, TB - xor T1, TB - .endif - - RND_FUN ROUND_FUNC, \r - ROTATE_STATE - - .if (round_id == 18) - .set ROUND_FUNC, RND_F2 - .elseif (round_id == 38) - .set ROUND_FUNC, RND_F3 - .elseif (round_id == 58) - .set ROUND_FUNC, RND_F2 - .endif - - .set round_id, ( (\r+1) % 80) - - RND_FUN ROUND_FUNC, (\r+1) - ROTATE_STATE -.endm - -.macro ROUND_F1 r - add WK(\r), E - - andn C, A, T1 /* ~b&d */ - lea (RE,RTB), E /* Add F from the previous round */ - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30),A, TB /* b>>>2 for next round */ - - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - /* - * Calculate F for the next round - * (b & c) ^ andn[b, d] - */ - and B, A /* b&c */ - xor T1, A /* F1 = (b&c) ^ (~b&d) */ - - lea (RE,RTA), E /* E += A >>> 5 */ -.endm - -.macro ROUND_F2 r - add WK(\r), E - lea (RE,RTB), E /* Add F from the previous round */ - - /* Calculate F for the next round */ - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - .if ((round_id) < 79) - rorx $(32-30), A, TB /* b>>>2 for next round */ - .endif - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - .if ((round_id) < 79) - xor B, A - .endif - - add TA, E /* E += A >>> 5 */ - - .if ((round_id) < 79) - xor C, A - .endif -.endm - -.macro ROUND_F3 r - add WK(\r), E - PRECALC (\r) /* msg scheduling for next 2 blocks */ - - lea (RE,RTB), E /* Add F from the previous round */ - - mov B, T1 - or A, T1 - - rorx $(32-5), A, TA /* T2 = A >>> 5 */ - rorx $(32-30), A, TB /* b>>>2 for next round */ - - /* Calculate F for the next round - * (b and c) or (d and (b or c)) - */ - and C, T1 - and B, A - or T1, A - - add TA, E /* E += A >>> 5 */ - -.endm - -/* Add constant only if (%2 > %3) condition met (uses RTA as temp) - * %1 + %2 >= %3 ? %4 : 0 - */ -.macro ADD_IF_GE a, b, c, d - mov \a, RTA - add $\d, RTA - cmp $\c, \b - cmovge RTA, \a -.endm - -/* - * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining - */ -.macro SHA1_PIPELINED_MAIN_BODY - - REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - mov %rsp, PRECALC_BUF - lea (2*4*80+32)(%rsp), WK_BUF - - # Precalc WK for first 2 blocks - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 - .set i, 0 - .rept 160 - PRECALC i - .set i, i + 1 - .endr - - /* Go to next block if needed */ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - xchg WK_BUF, PRECALC_BUF - - .align 32 -.L_loop: - /* - * code loops through more than one block - * we use K_BASE value as a signal of a last block, - * it is set below by: cmovae BUFFER_PTR, K_BASE - */ - test BLOCKS_CTR, BLOCKS_CTR - jnz .L_begin - .align 32 - jmp .L_end - .align 32 -.L_begin: - - /* - * Do first block - * rounds: 0,2,4,6,8 - */ - .set j, 0 - .rept 5 - RR j - .set j, j+2 - .endr - - /* - * rounds: - * 10,12,14,16,18 - * 20,22,24,26,28 - * 30,32,34,36,38 - * 40,42,44,46,48 - * 50,52,54,56,58 - */ - .rept 25 - RR j - .set j, j+2 - .endr - - /* Update Counter */ - sub $1, BLOCKS_CTR - /* Move to the next block only if needed*/ - ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 - /* - * rounds - * 60,62,64,66,68 - * 70,72,74,76,78 - */ - .rept 10 - RR j - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - test BLOCKS_CTR, BLOCKS_CTR - jz .L_loop - - mov TB, B - - /* Process second block */ - /* - * rounds - * 0+80, 2+80, 4+80, 6+80, 8+80 - * 10+80,12+80,14+80,16+80,18+80 - */ - - .set j, 0 - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 20+80,22+80,24+80,26+80,28+80 - * 30+80,32+80,34+80,36+80,38+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* - * rounds - * 40+80,42+80,44+80,46+80,48+80 - * 50+80,52+80,54+80,56+80,58+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - /* update counter */ - sub $1, BLOCKS_CTR - /* Move to the next block only if needed*/ - ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - - /* - * rounds - * 60+80,62+80,64+80,66+80,68+80 - * 70+80,72+80,74+80,76+80,78+80 - */ - .rept 10 - RR j+80 - .set j, j+2 - .endr - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), TB - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - /* Reset state for AVX2 reg permutation */ - mov A, TA - mov TB, A - mov C, TB - mov E, C - mov D, B - mov TA, D - - REGALLOC - - xchg WK_BUF, PRECALC_BUF - - jmp .L_loop - - .align 32 -.L_end: - -.endm -/* - * macro implements SHA-1 function's body for several 64-byte blocks - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_FUNC_START(\name) - - push %rbx - push %r12 - push %r13 - push %r14 - push %r15 - - RESERVE_STACK = (W_SIZE*4 + 8+24) - - /* Align stack */ - push %rbp - mov %rsp, %rbp - and $~(0x20-1), %rsp - sub $RESERVE_STACK, %rsp - - avx2_zeroupper - - /* Setup initial values */ - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - mov BUF, BUFFER_PTR2 - mov CNT, BLOCKS_CTR - - xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP - - SHA1_PIPELINED_MAIN_BODY - - avx2_zeroupper - - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - - RET - - SYM_FUNC_END(\name) -.endm - -.section .rodata - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.align 128 -K_XMM_AR: - .long K1, K1, K1, K1 - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f -.text - -SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S deleted file mode 100644 index cade913d4882..000000000000 --- a/arch/x86/crypto/sha1_ni_asm.S +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley - * Tim Chen - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include -#include - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -/* gcc conversion */ -#define FRAME_SIZE 32 /* space for 2x16 bytes */ - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define SHUF_MASK %xmm7 - - -/* - * Intel SHA Extensions optimized implementation of a SHA-1 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. - * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. - * - * void sha1_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks) - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ -.text -SYM_TYPED_FUNC_START(sha1_ni_transform) - push %rbp - mov %rsp, %rbp - sub $FRAME_SIZE, %rsp - and $~0xF, %rsp - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* load initial hash values */ - pinsrd $3, 1*16(DIGEST_PTR), E0 - movdqu 0*16(DIGEST_PTR), ABCD - pand UPPER_WORD_MASK(%rip), E0 - pshufd $0x1B, ABCD, ABCD - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa E0, (0*16)(%rsp) - movdqa ABCD, (1*16)(%rsp) - - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG0 - pshufb SHUF_MASK, MSG0 - paddd MSG0, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG1 - pshufb SHUF_MASK, MSG1 - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG2 - pshufb SHUF_MASK, MSG2 - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG3 - pshufb SHUF_MASK, MSG3 - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - pxor MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - pxor MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - pxor MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - movdqa ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - pxor MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - movdqa ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - pxor MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - movdqa ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - movdqa ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte (0*16)(%rsp), E0 - paddd (1*16)(%rsp), ABCD - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - pshufd $0x1B, ABCD, ABCD - movdqu ABCD, 0*16(DIGEST_PTR) - pextrd $3, E0, 1*16(DIGEST_PTR) - -.Ldone_hash: - mov %rbp, %rsp - pop %rbp - - RET -SYM_FUNC_END(sha1_ni_transform) - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 -.align 16 -UPPER_WORD_MASK: - .octa 0xFFFFFFFF000000000000000000000000 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S deleted file mode 100644 index f54988c80eb4..000000000000 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ /dev/null @@ -1,554 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental - * SSE3 instruction set extensions introduced in Intel Core Microarchitecture - * processors. CPUs supporting Intel(R) AVX extensions will get an additional - * boost. - * - * This work was inspired by the vectorized implementation of Dean Gaudet. - * Additional information on it can be found at: - * http://www.arctic.org/~dean/crypto/sha1.html - * - * It was improved upon with more efficient vectorization of the message - * scheduling. This implementation has also been optimized for all current and - * several future generations of Intel CPUs. - * - * See this article for more information about the implementation details: - * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ - * - * Copyright (C) 2010, Intel Corp. - * Authors: Maxim Locktyukhin - * Ronen Zohar - * - * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: - * Author: Mathias Krause - */ - -#include -#include - -#define CTX %rdi // arg1 -#define BUF %rsi // arg2 -#define CNT %rdx // arg3 - -#define REG_A %ecx -#define REG_B %esi -#define REG_C %edi -#define REG_D %r12d -#define REG_E %edx - -#define REG_T1 %eax -#define REG_T2 %ebx - -#define K_BASE %r8 -#define HASH_PTR %r9 -#define BUFFER_PTR %r10 -#define BUFFER_END %r11 - -#define W_TMP1 %xmm0 -#define W_TMP2 %xmm9 - -#define W0 %xmm1 -#define W4 %xmm2 -#define W8 %xmm3 -#define W12 %xmm4 -#define W16 %xmm5 -#define W20 %xmm6 -#define W24 %xmm7 -#define W28 %xmm8 - -#define XMM_SHUFB_BSWAP %xmm10 - -/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ -#define WK(t) (((t) & 15) * 4)(%rsp) -#define W_PRECALC_AHEAD 16 - -/* - * This macro implements the SHA-1 function's body for single 64-byte block - * param: function's name - */ -.macro SHA1_VECTOR_ASM name - SYM_TYPED_FUNC_START(\name) - - push %rbx - push %r12 - push %rbp - mov %rsp, %rbp - - sub $64, %rsp # allocate workspace - and $~15, %rsp # align stack - - mov CTX, HASH_PTR - mov BUF, BUFFER_PTR - - shl $6, CNT # multiply by 64 - add BUF, CNT - mov CNT, BUFFER_END - - lea K_XMM_AR(%rip), K_BASE - xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP - - SHA1_PIPELINED_MAIN_BODY - - # cleanup workspace - mov $8, %ecx - mov %rsp, %rdi - xor %eax, %eax - rep stosq - - mov %rbp, %rsp # deallocate workspace - pop %rbp - pop %r12 - pop %rbx - RET - - SYM_FUNC_END(\name) -.endm - -/* - * This macro implements 80 rounds of SHA-1 for one 64-byte block - */ -.macro SHA1_PIPELINED_MAIN_BODY - INIT_REGALLOC - - mov (HASH_PTR), A - mov 4(HASH_PTR), B - mov 8(HASH_PTR), C - mov 12(HASH_PTR), D - mov 16(HASH_PTR), E - - .set i, 0 - .rept W_PRECALC_AHEAD - W_PRECALC i - .set i, (i+1) - .endr - -.align 4 -1: - RR F1,A,B,C,D,E,0 - RR F1,D,E,A,B,C,2 - RR F1,B,C,D,E,A,4 - RR F1,E,A,B,C,D,6 - RR F1,C,D,E,A,B,8 - - RR F1,A,B,C,D,E,10 - RR F1,D,E,A,B,C,12 - RR F1,B,C,D,E,A,14 - RR F1,E,A,B,C,D,16 - RR F1,C,D,E,A,B,18 - - RR F2,A,B,C,D,E,20 - RR F2,D,E,A,B,C,22 - RR F2,B,C,D,E,A,24 - RR F2,E,A,B,C,D,26 - RR F2,C,D,E,A,B,28 - - RR F2,A,B,C,D,E,30 - RR F2,D,E,A,B,C,32 - RR F2,B,C,D,E,A,34 - RR F2,E,A,B,C,D,36 - RR F2,C,D,E,A,B,38 - - RR F3,A,B,C,D,E,40 - RR F3,D,E,A,B,C,42 - RR F3,B,C,D,E,A,44 - RR F3,E,A,B,C,D,46 - RR F3,C,D,E,A,B,48 - - RR F3,A,B,C,D,E,50 - RR F3,D,E,A,B,C,52 - RR F3,B,C,D,E,A,54 - RR F3,E,A,B,C,D,56 - RR F3,C,D,E,A,B,58 - - add $64, BUFFER_PTR # move to the next 64-byte block - cmp BUFFER_END, BUFFER_PTR # if the current is the last one use - cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun - - RR F4,A,B,C,D,E,60 - RR F4,D,E,A,B,C,62 - RR F4,B,C,D,E,A,64 - RR F4,E,A,B,C,D,66 - RR F4,C,D,E,A,B,68 - - RR F4,A,B,C,D,E,70 - RR F4,D,E,A,B,C,72 - RR F4,B,C,D,E,A,74 - RR F4,E,A,B,C,D,76 - RR F4,C,D,E,A,B,78 - - UPDATE_HASH (HASH_PTR), A - UPDATE_HASH 4(HASH_PTR), B - UPDATE_HASH 8(HASH_PTR), C - UPDATE_HASH 12(HASH_PTR), D - UPDATE_HASH 16(HASH_PTR), E - - RESTORE_RENAMED_REGS - cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end - jne 1b -.endm - -.macro INIT_REGALLOC - .set A, REG_A - .set B, REG_B - .set C, REG_C - .set D, REG_D - .set E, REG_E - .set T1, REG_T1 - .set T2, REG_T2 -.endm - -.macro RESTORE_RENAMED_REGS - # order is important (REG_C is where it should be) - mov B, REG_B - mov D, REG_D - mov A, REG_A - mov E, REG_E -.endm - -.macro SWAP_REG_NAMES a, b - .set _T, \a - .set \a, \b - .set \b, _T -.endm - -.macro F1 b, c, d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - xor \d, T1 - and \b, T1 - xor \d, T1 -.endm - -.macro F2 b, c, d - mov \d, T1 - SWAP_REG_NAMES \d, T1 - xor \c, T1 - xor \b, T1 -.endm - -.macro F3 b, c ,d - mov \c, T1 - SWAP_REG_NAMES \c, T1 - mov \b, T2 - or \b, T1 - and \c, T2 - and \d, T1 - or T2, T1 -.endm - -.macro F4 b, c, d - F2 \b, \c, \d -.endm - -.macro UPDATE_HASH hash, val - add \hash, \val - mov \val, \hash -.endm - -/* - * RR does two rounds of SHA-1 back to back with W[] pre-calc - * t1 = F(b, c, d); e += w(i) - * e += t1; b <<= 30; d += w(i+1); - * t1 = F(a, b, c); - * d += t1; a <<= 5; - * e += a; - * t1 = e; a >>= 7; - * t1 <<= 5; - * d += t1; - */ -.macro RR F, a, b, c, d, e, round - add WK(\round), \e - \F \b, \c, \d # t1 = F(b, c, d); - W_PRECALC (\round + W_PRECALC_AHEAD) - rol $30, \b - add T1, \e - add WK(\round + 1), \d - - \F \a, \b, \c - W_PRECALC (\round + W_PRECALC_AHEAD + 1) - rol $5, \a - add \a, \e - add T1, \d - ror $7, \a # (a <>r 7) => a <= 80) && (i < (80 + W_PRECALC_AHEAD)))) - .set i, ((\r) % 80) # pre-compute for the next iteration - .if (i == 0) - W_PRECALC_RESET - .endif - W_PRECALC_00_15 - .elseif (i<32) - W_PRECALC_16_31 - .elseif (i < 80) // rounds 32-79 - W_PRECALC_32_79 - .endif -.endm - -.macro W_PRECALC_RESET - .set W, W0 - .set W_minus_04, W4 - .set W_minus_08, W8 - .set W_minus_12, W12 - .set W_minus_16, W16 - .set W_minus_20, W20 - .set W_minus_24, W24 - .set W_minus_28, W28 - .set W_minus_32, W -.endm - -.macro W_PRECALC_ROTATE - .set W_minus_32, W_minus_28 - .set W_minus_28, W_minus_24 - .set W_minus_24, W_minus_20 - .set W_minus_20, W_minus_16 - .set W_minus_16, W_minus_12 - .set W_minus_12, W_minus_08 - .set W_minus_08, W_minus_04 - .set W_minus_04, W - .set W, W_minus_32 -.endm - -.macro W_PRECALC_SSSE3 - -.macro W_PRECALC_00_15 - W_PRECALC_00_15_SSSE3 -.endm -.macro W_PRECALC_16_31 - W_PRECALC_16_31_SSSE3 -.endm -.macro W_PRECALC_32_79 - W_PRECALC_32_79_SSSE3 -.endm - -/* message scheduling pre-compute for rounds 0-15 */ -.macro W_PRECALC_00_15_SSSE3 - .if ((i & 3) == 0) - movdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - pshufb XMM_SHUFB_BSWAP, W_TMP1 - movdqa W_TMP1, W - .elseif ((i & 3) == 2) - paddd (K_BASE), W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 16-31 - * - * - calculating last 32 w[i] values in 8 XMM registers - * - pre-calculate K+w[i] values and store to mem, for later load by ALU add - * instruction - * - * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] - * dependency, but improves for 32-79 - */ -.macro W_PRECALC_16_31_SSSE3 - # blended scheduling of vector and scalar instruction streams, one 4-wide - # vector iteration / 4 scalar rounds - .if ((i & 3) == 0) - movdqa W_minus_12, W - palignr $8, W_minus_16, W # w[i-14] - movdqa W_minus_04, W_TMP1 - psrldq $4, W_TMP1 # w[i-3] - pxor W_minus_08, W - .elseif ((i & 3) == 1) - pxor W_minus_16, W_TMP1 - pxor W_TMP1, W - movdqa W, W_TMP2 - movdqa W, W_TMP1 - pslldq $12, W_TMP2 - .elseif ((i & 3) == 2) - psrld $31, W - pslld $1, W_TMP1 - por W, W_TMP1 - movdqa W_TMP2, W - psrld $30, W_TMP2 - pslld $2, W - .elseif ((i & 3) == 3) - pxor W, W_TMP1 - pxor W_TMP2, W_TMP1 - movdqa W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -/* message scheduling pre-compute for rounds 32-79 - * - * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 - * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 - * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken - */ -.macro W_PRECALC_32_79_SSSE3 - .if ((i & 3) == 0) - movdqa W_minus_04, W_TMP1 - pxor W_minus_28, W # W is W_minus_32 before xor - palignr $8, W_minus_08, W_TMP1 - .elseif ((i & 3) == 1) - pxor W_minus_16, W - pxor W_TMP1, W - movdqa W, W_TMP1 - .elseif ((i & 3) == 2) - psrld $30, W - pslld $2, W_TMP1 - por W, W_TMP1 - .elseif ((i & 3) == 3) - movdqa W_TMP1, W - paddd K_XMM(K_BASE), W_TMP1 - movdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_SSSE3 - - -#define K1 0x5a827999 -#define K2 0x6ed9eba1 -#define K3 0x8f1bbcdc -#define K4 0xca62c1d6 - -.section .rodata -.align 16 - -K_XMM_AR: - .long K1, K1, K1, K1 - .long K2, K2, K2, K2 - .long K3, K3, K3, K3 - .long K4, K4, K4, K4 - -BSWAP_SHUFB_CTL: - .long 0x00010203 - .long 0x04050607 - .long 0x08090a0b - .long 0x0c0d0e0f - - -.section .text - -W_PRECALC_SSSE3 -.macro xmm_mov a, b - movdqu \a,\b -.endm - -/* - * SSSE3 optimized implementation: - * - * extern "C" void sha1_transform_ssse3(struct sha1_state *state, - * const u8 *data, int blocks); - * - * Note that struct sha1_state is assumed to begin with u32 state[5]. - */ -SHA1_VECTOR_ASM sha1_transform_ssse3 - -.macro W_PRECALC_AVX - -.purgem W_PRECALC_00_15 -.macro W_PRECALC_00_15 - W_PRECALC_00_15_AVX -.endm -.purgem W_PRECALC_16_31 -.macro W_PRECALC_16_31 - W_PRECALC_16_31_AVX -.endm -.purgem W_PRECALC_32_79 -.macro W_PRECALC_32_79 - W_PRECALC_32_79_AVX -.endm - -.macro W_PRECALC_00_15_AVX - .if ((i & 3) == 0) - vmovdqu (i*4)(BUFFER_PTR), W_TMP1 - .elseif ((i & 3) == 1) - vpshufb XMM_SHUFB_BSWAP, W_TMP1, W - .elseif ((i & 3) == 2) - vpaddd (K_BASE), W, W_TMP1 - .elseif ((i & 3) == 3) - vmovdqa W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_16_31_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] - vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] - vpxor W_minus_08, W, W - vpxor W_minus_16, W_TMP1, W_TMP1 - .elseif ((i & 3) == 1) - vpxor W_TMP1, W, W - vpslldq $12, W, W_TMP2 - vpslld $1, W, W_TMP1 - .elseif ((i & 3) == 2) - vpsrld $31, W, W - vpor W, W_TMP1, W_TMP1 - vpslld $2, W_TMP2, W - vpsrld $30, W_TMP2, W_TMP2 - .elseif ((i & 3) == 3) - vpxor W, W_TMP1, W_TMP1 - vpxor W_TMP2, W_TMP1, W - vpaddd K_XMM(K_BASE), W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.macro W_PRECALC_32_79_AVX - .if ((i & 3) == 0) - vpalignr $8, W_minus_08, W_minus_04, W_TMP1 - vpxor W_minus_28, W, W # W is W_minus_32 before xor - .elseif ((i & 3) == 1) - vpxor W_minus_16, W_TMP1, W_TMP1 - vpxor W_TMP1, W, W - .elseif ((i & 3) == 2) - vpslld $2, W, W_TMP1 - vpsrld $30, W, W - vpor W, W_TMP1, W - .elseif ((i & 3) == 3) - vpaddd K_XMM(K_BASE), W, W_TMP1 - vmovdqu W_TMP1, WK(i&~3) - W_PRECALC_ROTATE - .endif -.endm - -.endm // W_PRECALC_AVX - -W_PRECALC_AVX -.purgem xmm_mov -.macro xmm_mov a, b - vmovdqu \a,\b -.endm - - -/* AVX optimized implementation: - * extern "C" void sha1_transform_avx(struct sha1_state *state, - * const u8 *data, int blocks); - */ -SHA1_VECTOR_ASM sha1_transform_avx diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c deleted file mode 100644 index 826579a7473c..000000000000 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ /dev/null @@ -1,324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * Glue code for the SHA1 Secure Hash Algorithm assembler implementations - * using SSSE3, AVX, AVX2, and SHA-NI instructions. - * - * This file is based on sha1_generic.c - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - * Copyright (c) Mathias Krause - * Copyright (c) Chandramouli Narayanan - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include - -static const struct x86_cpu_id module_cpu_ids[] = { - X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static inline int sha1_update_x86(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_block_fn *sha1_xform) -{ - int remain; - - /* - * Make sure struct sha1_state begins directly with the SHA1 - * 160-bit internal state, as this is what the asm functions expect. - */ - BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - - kernel_fpu_begin(); - remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return remain; -} - -static inline int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, - sha1_block_fn *sha1_xform) -{ - kernel_fpu_begin(); - sha1_base_do_finup(desc, data, len, sha1_xform); - kernel_fpu_end(); - - return sha1_base_finish(desc, out); -} - -asmlinkage void sha1_transform_ssse3(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_transform_ssse3); -} - -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_transform_ssse3); -} - -static struct shash_alg sha1_ssse3_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ssse3_update, - .finup = sha1_ssse3_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ssse3", - .cra_priority = 150, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shash(&sha1_ssse3_alg); - return 0; -} - -static void unregister_sha1_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shash(&sha1_ssse3_alg); -} - -asmlinkage void sha1_transform_avx(struct sha1_state *state, - const u8 *data, int blocks); - -static int sha1_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_transform_avx); -} - -static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_transform_avx); -} - -static struct shash_alg sha1_avx_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_avx_update, - .finup = sha1_avx_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx", - .cra_priority = 160, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int register_sha1_avx(void) -{ - if (avx_usable()) - return crypto_register_shash(&sha1_avx_alg); - return 0; -} - -static void unregister_sha1_avx(void) -{ - if (avx_usable()) - crypto_unregister_shash(&sha1_avx_alg); -} - -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks); - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) - && boot_cpu_has(X86_FEATURE_BMI1) - && boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static inline void sha1_apply_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks) -{ - /* Select the optimal transform based on data block size */ - if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(state, data, blocks); - else - sha1_transform_avx(state, data, blocks); -} - -static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_apply_transform_avx2); -} - -static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); -} - -static struct shash_alg sha1_avx2_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_avx2_update, - .finup = sha1_avx2_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-avx2", - .cra_priority = 170, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shash(&sha1_avx2_alg); - return 0; -} - -static void unregister_sha1_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shash(&sha1_avx2_alg); -} - -asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, - int rounds); - -static int sha1_ni_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_update_x86(desc, data, len, sha1_ni_transform); -} - -static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha1_finup(desc, data, len, out, sha1_ni_transform); -} - -static struct shash_alg sha1_ni_alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_ni_update, - .finup = sha1_ni_finup, - .descsize = SHA1_STATE_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ni", - .cra_priority = 250, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int register_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - return crypto_register_shash(&sha1_ni_alg); - return 0; -} - -static void unregister_sha1_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - crypto_unregister_shash(&sha1_ni_alg); -} - -static int __init sha1_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha1_ssse3()) - goto fail; - - if (register_sha1_avx()) { - unregister_sha1_ssse3(); - goto fail; - } - - if (register_sha1_avx2()) { - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - if (register_sha1_ni()) { - unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha1_ssse3_mod_fini(void) -{ - unregister_sha1_ni(); - unregister_sha1_avx2(); - unregister_sha1_avx(); - unregister_sha1_ssse3(); -} - -module_init(sha1_ssse3_mod_init); -module_exit(sha1_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_ALIAS_CRYPTO("sha1-ssse3"); -MODULE_ALIAS_CRYPTO("sha1-avx"); -MODULE_ALIAS_CRYPTO("sha1-avx2"); -MODULE_ALIAS_CRYPTO("sha1-ni"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 019034f2bb53..dc34441a80bd 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -152,6 +152,7 @@ config CRYPTO_LIB_SHA1_ARCH default y if PPC default y if S390 default y if SPARC64 + default y if X86_64 config CRYPTO_LIB_SHA256 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 7897da2de623..b09fdb7f4b4c 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -82,6 +82,9 @@ libsha1-y += powerpc/sha1-powerpc-asm.o libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o endif libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o +libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \ + x86/sha1-avx2-asm.o \ + x86/sha1-ni-asm.o endif # CONFIG_CRYPTO_LIB_SHA1_ARCH ################################################################################ diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S new file mode 100644 index 000000000000..91b2dc6db179 --- /dev/null +++ b/lib/crypto/x86/sha1-avx2-asm.S @@ -0,0 +1,697 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht + * Maxim Locktyukhin + * Ronen Zohar + * Chandramouli Narayanan + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * void sha1_transform_avx2(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ + +#include + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %r11d +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define HASH_PTR %r9 +#define BLOCKS_CTR %r8 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu (i * 2)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? %4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +.L_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + test BLOCKS_CTR, BLOCKS_CTR + jnz .L_begin + .align 32 + jmp .L_end + .align 32 +.L_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + test BLOCKS_CTR, BLOCKS_CTR + jz .L_loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp .L_loop + + .align 32 +.L_end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + push %rbp + mov %rsp, %rbp + and $~(0x20-1), %rsp + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + /* Setup initial values */ + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + mov %rbp, %rsp + pop %rbp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET + + SYM_FUNC_END(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S new file mode 100644 index 000000000000..3989b0642ff5 --- /dev/null +++ b/lib/crypto/x86/sha1-ni-asm.S @@ -0,0 +1,299 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +/* gcc conversion */ +#define FRAME_SIZE 32 /* space for 2x16 bytes */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 + + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 block function + * + * This function takes a pointer to the current SHA-1 state, a pointer to the + * input data, and the number of 64-byte blocks to process. Once all blocks + * have been processed, the state is updated with the new state. This function + * only processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization are expected to be handled elsewhere. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha1_ni_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks) + */ +.text +SYM_FUNC_START(sha1_ni_transform) + push %rbp + mov %rsp, %rbp + sub $FRAME_SIZE, %rsp + and $~0xF, %rsp + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* load initial hash values */ + pinsrd $3, 1*16(DIGEST_PTR), E0 + movdqu 0*16(DIGEST_PTR), ABCD + pand UPPER_WORD_MASK(%rip), E0 + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa E0, (0*16)(%rsp) + movdqa ABCD, (1*16)(%rsp) + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG0 + pshufb SHUF_MASK, MSG0 + paddd MSG0, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG1 + pshufb SHUF_MASK, MSG1 + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG2 + pshufb SHUF_MASK, MSG2 + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG3 + pshufb SHUF_MASK, MSG3 + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + pxor MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte (0*16)(%rsp), E0 + paddd (1*16)(%rsp), ABCD + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, 0*16(DIGEST_PTR) + pextrd $3, E0, 1*16(DIGEST_PTR) + +.Ldone_hash: + mov %rbp, %rsp + pop %rbp + + RET +SYM_FUNC_END(sha1_ni_transform) + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16 +.align 16 +UPPER_WORD_MASK: + .octa 0xFFFFFFFF000000000000000000000000 diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S new file mode 100644 index 000000000000..78b48cb09c5b --- /dev/null +++ b/lib/crypto/x86/sha1-ssse3-and-avx.S @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental + * SSE3 instruction set extensions introduced in Intel Core Microarchitecture + * processors. CPUs supporting Intel(R) AVX extensions will get an additional + * boost. + * + * This work was inspired by the vectorized implementation of Dean Gaudet. + * Additional information on it can be found at: + * http://www.arctic.org/~dean/crypto/sha1.html + * + * It was improved upon with more efficient vectorization of the message + * scheduling. This implementation has also been optimized for all current and + * several future generations of Intel CPUs. + * + * See this article for more information about the implementation details: + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * Copyright (C) 2010, Intel Corp. + * Authors: Maxim Locktyukhin + * Ronen Zohar + * + * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: + * Author: Mathias Krause + */ + +#include + +#define CTX %rdi // arg1 +#define BUF %rsi // arg2 +#define CNT %rdx // arg3 + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %r12d +#define REG_E %edx + +#define REG_T1 %eax +#define REG_T2 %ebx + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_END %r11 + +#define W_TMP1 %xmm0 +#define W_TMP2 %xmm9 + +#define W0 %xmm1 +#define W4 %xmm2 +#define W8 %xmm3 +#define W12 %xmm4 +#define W16 %xmm5 +#define W20 %xmm6 +#define W24 %xmm7 +#define W28 %xmm8 + +#define XMM_SHUFB_BSWAP %xmm10 + +/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ +#define WK(t) (((t) & 15) * 4)(%rsp) +#define W_PRECALC_AHEAD 16 + +/* + * This macro implements the SHA-1 function's body for single 64-byte block + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %rbp + mov %rsp, %rbp + + sub $64, %rsp # allocate workspace + and $~15, %rsp # align stack + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + shl $6, CNT # multiply by 64 + add BUF, CNT + mov CNT, BUFFER_END + + lea K_XMM_AR(%rip), K_BASE + xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + # cleanup workspace + mov $8, %ecx + mov %rsp, %rdi + xor %eax, %eax + rep stosq + + mov %rbp, %rsp # deallocate workspace + pop %rbp + pop %r12 + pop %rbx + RET + + SYM_FUNC_END(\name) +.endm + +/* + * This macro implements 80 rounds of SHA-1 for one 64-byte block + */ +.macro SHA1_PIPELINED_MAIN_BODY + INIT_REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + .set i, 0 + .rept W_PRECALC_AHEAD + W_PRECALC i + .set i, (i+1) + .endr + +.align 4 +1: + RR F1,A,B,C,D,E,0 + RR F1,D,E,A,B,C,2 + RR F1,B,C,D,E,A,4 + RR F1,E,A,B,C,D,6 + RR F1,C,D,E,A,B,8 + + RR F1,A,B,C,D,E,10 + RR F1,D,E,A,B,C,12 + RR F1,B,C,D,E,A,14 + RR F1,E,A,B,C,D,16 + RR F1,C,D,E,A,B,18 + + RR F2,A,B,C,D,E,20 + RR F2,D,E,A,B,C,22 + RR F2,B,C,D,E,A,24 + RR F2,E,A,B,C,D,26 + RR F2,C,D,E,A,B,28 + + RR F2,A,B,C,D,E,30 + RR F2,D,E,A,B,C,32 + RR F2,B,C,D,E,A,34 + RR F2,E,A,B,C,D,36 + RR F2,C,D,E,A,B,38 + + RR F3,A,B,C,D,E,40 + RR F3,D,E,A,B,C,42 + RR F3,B,C,D,E,A,44 + RR F3,E,A,B,C,D,46 + RR F3,C,D,E,A,B,48 + + RR F3,A,B,C,D,E,50 + RR F3,D,E,A,B,C,52 + RR F3,B,C,D,E,A,54 + RR F3,E,A,B,C,D,56 + RR F3,C,D,E,A,B,58 + + add $64, BUFFER_PTR # move to the next 64-byte block + cmp BUFFER_END, BUFFER_PTR # if the current is the last one use + cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun + + RR F4,A,B,C,D,E,60 + RR F4,D,E,A,B,C,62 + RR F4,B,C,D,E,A,64 + RR F4,E,A,B,C,D,66 + RR F4,C,D,E,A,B,68 + + RR F4,A,B,C,D,E,70 + RR F4,D,E,A,B,C,72 + RR F4,B,C,D,E,A,74 + RR F4,E,A,B,C,D,76 + RR F4,C,D,E,A,B,78 + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + RESTORE_RENAMED_REGS + cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end + jne 1b +.endm + +.macro INIT_REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set T1, REG_T1 + .set T2, REG_T2 +.endm + +.macro RESTORE_RENAMED_REGS + # order is important (REG_C is where it should be) + mov B, REG_B + mov D, REG_D + mov A, REG_A + mov E, REG_E +.endm + +.macro SWAP_REG_NAMES a, b + .set _T, \a + .set \a, \b + .set \b, _T +.endm + +.macro F1 b, c, d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + xor \d, T1 + and \b, T1 + xor \d, T1 +.endm + +.macro F2 b, c, d + mov \d, T1 + SWAP_REG_NAMES \d, T1 + xor \c, T1 + xor \b, T1 +.endm + +.macro F3 b, c ,d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + mov \b, T2 + or \b, T1 + and \c, T2 + and \d, T1 + or T2, T1 +.endm + +.macro F4 b, c, d + F2 \b, \c, \d +.endm + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +/* + * RR does two rounds of SHA-1 back to back with W[] pre-calc + * t1 = F(b, c, d); e += w(i) + * e += t1; b <<= 30; d += w(i+1); + * t1 = F(a, b, c); + * d += t1; a <<= 5; + * e += a; + * t1 = e; a >>= 7; + * t1 <<= 5; + * d += t1; + */ +.macro RR F, a, b, c, d, e, round + add WK(\round), \e + \F \b, \c, \d # t1 = F(b, c, d); + W_PRECALC (\round + W_PRECALC_AHEAD) + rol $30, \b + add T1, \e + add WK(\round + 1), \d + + \F \a, \b, \c + W_PRECALC (\round + W_PRECALC_AHEAD + 1) + rol $5, \a + add \a, \e + add T1, \d + ror $7, \a # (a <>r 7) => a <= 80) && (i < (80 + W_PRECALC_AHEAD)))) + .set i, ((\r) % 80) # pre-compute for the next iteration + .if (i == 0) + W_PRECALC_RESET + .endif + W_PRECALC_00_15 + .elseif (i<32) + W_PRECALC_16_31 + .elseif (i < 80) // rounds 32-79 + W_PRECALC_32_79 + .endif +.endm + +.macro W_PRECALC_RESET + .set W, W0 + .set W_minus_04, W4 + .set W_minus_08, W8 + .set W_minus_12, W12 + .set W_minus_16, W16 + .set W_minus_20, W20 + .set W_minus_24, W24 + .set W_minus_28, W28 + .set W_minus_32, W +.endm + +.macro W_PRECALC_ROTATE + .set W_minus_32, W_minus_28 + .set W_minus_28, W_minus_24 + .set W_minus_24, W_minus_20 + .set W_minus_20, W_minus_16 + .set W_minus_16, W_minus_12 + .set W_minus_12, W_minus_08 + .set W_minus_08, W_minus_04 + .set W_minus_04, W + .set W, W_minus_32 +.endm + +.macro W_PRECALC_SSSE3 + +.macro W_PRECALC_00_15 + W_PRECALC_00_15_SSSE3 +.endm +.macro W_PRECALC_16_31 + W_PRECALC_16_31_SSSE3 +.endm +.macro W_PRECALC_32_79 + W_PRECALC_32_79_SSSE3 +.endm + +/* message scheduling pre-compute for rounds 0-15 */ +.macro W_PRECALC_00_15_SSSE3 + .if ((i & 3) == 0) + movdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + pshufb XMM_SHUFB_BSWAP, W_TMP1 + movdqa W_TMP1, W + .elseif ((i & 3) == 2) + paddd (K_BASE), W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 16-31 + * + * - calculating last 32 w[i] values in 8 XMM registers + * - pre-calculate K+w[i] values and store to mem, for later load by ALU add + * instruction + * + * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] + * dependency, but improves for 32-79 + */ +.macro W_PRECALC_16_31_SSSE3 + # blended scheduling of vector and scalar instruction streams, one 4-wide + # vector iteration / 4 scalar rounds + .if ((i & 3) == 0) + movdqa W_minus_12, W + palignr $8, W_minus_16, W # w[i-14] + movdqa W_minus_04, W_TMP1 + psrldq $4, W_TMP1 # w[i-3] + pxor W_minus_08, W + .elseif ((i & 3) == 1) + pxor W_minus_16, W_TMP1 + pxor W_TMP1, W + movdqa W, W_TMP2 + movdqa W, W_TMP1 + pslldq $12, W_TMP2 + .elseif ((i & 3) == 2) + psrld $31, W + pslld $1, W_TMP1 + por W, W_TMP1 + movdqa W_TMP2, W + psrld $30, W_TMP2 + pslld $2, W + .elseif ((i & 3) == 3) + pxor W, W_TMP1 + pxor W_TMP2, W_TMP1 + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 32-79 + * + * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + */ +.macro W_PRECALC_32_79_SSSE3 + .if ((i & 3) == 0) + movdqa W_minus_04, W_TMP1 + pxor W_minus_28, W # W is W_minus_32 before xor + palignr $8, W_minus_08, W_TMP1 + .elseif ((i & 3) == 1) + pxor W_minus_16, W + pxor W_TMP1, W + movdqa W, W_TMP1 + .elseif ((i & 3) == 2) + psrld $30, W + pslld $2, W_TMP1 + por W, W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_SSSE3 + + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.section .rodata +.align 16 + +K_XMM_AR: + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + +.section .text + +W_PRECALC_SSSE3 +.macro xmm_mov a, b + movdqu \a,\b +.endm + +/* + * SSSE3 optimized implementation: + * + * void sha1_transform_ssse3(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_ssse3 + +.macro W_PRECALC_AVX + +.purgem W_PRECALC_00_15 +.macro W_PRECALC_00_15 + W_PRECALC_00_15_AVX +.endm +.purgem W_PRECALC_16_31 +.macro W_PRECALC_16_31 + W_PRECALC_16_31_AVX +.endm +.purgem W_PRECALC_32_79 +.macro W_PRECALC_32_79 + W_PRECALC_32_79_AVX +.endm + +.macro W_PRECALC_00_15_AVX + .if ((i & 3) == 0) + vmovdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + vpshufb XMM_SHUFB_BSWAP, W_TMP1, W + .elseif ((i & 3) == 2) + vpaddd (K_BASE), W, W_TMP1 + .elseif ((i & 3) == 3) + vmovdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_16_31_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] + vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] + vpxor W_minus_08, W, W + vpxor W_minus_16, W_TMP1, W_TMP1 + .elseif ((i & 3) == 1) + vpxor W_TMP1, W, W + vpslldq $12, W, W_TMP2 + vpslld $1, W, W_TMP1 + .elseif ((i & 3) == 2) + vpsrld $31, W, W + vpor W, W_TMP1, W_TMP1 + vpslld $2, W_TMP2, W + vpsrld $30, W_TMP2, W_TMP2 + .elseif ((i & 3) == 3) + vpxor W, W_TMP1, W_TMP1 + vpxor W_TMP2, W_TMP1, W + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_32_79_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_08, W_minus_04, W_TMP1 + vpxor W_minus_28, W, W # W is W_minus_32 before xor + .elseif ((i & 3) == 1) + vpxor W_minus_16, W_TMP1, W_TMP1 + vpxor W_TMP1, W, W + .elseif ((i & 3) == 2) + vpslld $2, W, W_TMP1 + vpsrld $30, W, W + vpor W, W_TMP1, W + .elseif ((i & 3) == 3) + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_AVX + +W_PRECALC_AVX +.purgem xmm_mov +.macro xmm_mov a, b + vmovdqu \a,\b +.endm + + +/* AVX optimized implementation: + * void sha1_transform_avx(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_avx diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h new file mode 100644 index 000000000000..e308379d89bc --- /dev/null +++ b/lib/crypto/x86/sha1.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include +#include + +DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic); + +#define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha1_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha1_block_state *state, \ + const u8 *data, size_t nblocks) \ + { \ + if (likely(irq_fpu_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha1_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3); +DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx); +DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform); + +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +static void sha1_blocks_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (likely(irq_fpu_usable())) { + kernel_fpu_begin(); + /* Select the optimal transform based on the number of blocks */ + if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(state, data, nblocks); + else + sha1_transform_avx(state, data, nblocks); + kernel_fpu_end(); + } else { + sha1_blocks_generic(state, data, nblocks); + } +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha1_blocks_x86)(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ni); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha1_blocks_x86, sha1_blocks_avx2); + else + static_call_update(sha1_blocks_x86, sha1_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ssse3); + } +}