From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 12 Jul 2025 23:23:04 +0000 (-0700)
Subject: lib/crypto: x86/sha1: Migrate optimized code into library
X-Git-Tag: ceph-for-6.17-rc6~328^2~4
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f3d6cb3dc0394b866bc0d1e15157ce45844cf3d3;p=ceph-client.git

lib/crypto: x86/sha1: Migrate optimized code into library

Instead of exposing the x86-optimized SHA-1 code via x86-specific
crypto_shash algorithms, instead just implement the sha1_blocks()
library function.  This is much simpler, it makes the SHA-1 library
functions be x86-optimized, and it fixes the longstanding issue where
the x86-optimized SHA-1 code was disabled by default.  SHA-1 still
remains available through crypto_shash, but individual architectures no
longer need to handle it.

To match sha1_blocks(), change the type of the nblocks parameter of the
assembly functions from int to size_t.  The assembly functions actually
already treated it as size_t.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-14-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---

diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index eb641a300154..94016c60561e 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -376,20 +376,6 @@ config CRYPTO_POLYVAL_CLMUL_NI
 	  Architecture: x86_64 using:
 	  - CLMUL-NI (carry-less multiplication new instructions)
 
-config CRYPTO_SHA1_SSSE3
-	tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
-	depends on 64BIT
-	select CRYPTO_SHA1
-	select CRYPTO_HASH
-	help
-	  SHA-1 secure hash algorithm (FIPS 180)
-
-	  Architecture: x86_64 using:
-	  - SSSE3 (Supplemental SSE3)
-	  - AVX (Advanced Vector Extensions)
-	  - AVX2 (Advanced Vector Extensions 2)
-	  - SHA-NI (SHA Extensions New Instructions)
-
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
 	depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index d31348be8370..d402963d6b57 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -51,9 +51,6 @@ ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
 aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
 endif
 
-obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
-
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
deleted file mode 100644
index 4b49bdc95265..000000000000
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Ilya Albrekht <ilya.albrekht@intel.com>
- * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- * Ronen Zohar <ronen.zohar@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
- *
- *This implementation is based on the previous SSSE3 release:
- *Visit http://software.intel.com/en-us/articles/
- *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- *Updates 20-byte SHA-1 record at start of 'state', from 'input', for
- *even number of 'blocks' consecutive 64-byte blocks.
- *
- *extern "C" void sha1_transform_avx2(
- *	struct sha1_state *state, const u8* input, int blocks );
- */
-
-#include <linux/linkage.h>
-
-#define	CTX	%rdi	/* arg1 */
-#define BUF	%rsi	/* arg2 */
-#define CNT	%rdx	/* arg3 */
-
-#define	REG_A	%ecx
-#define	REG_B	%esi
-#define	REG_C	%edi
-#define	REG_D	%eax
-#define	REG_E	%edx
-#define	REG_TB	%ebx
-#define	REG_TA	%r12d
-#define	REG_RA	%rcx
-#define	REG_RB	%rsi
-#define	REG_RC	%rdi
-#define	REG_RD	%rax
-#define	REG_RE	%rdx
-#define	REG_RTA	%r12
-#define	REG_RTB	%rbx
-#define	REG_T1	%r11d
-#define	xmm_mov	vmovups
-#define	avx2_zeroupper	vzeroupper
-#define	RND_F1	1
-#define	RND_F2	2
-#define	RND_F3	3
-
-.macro REGALLOC
-	.set A, REG_A
-	.set B, REG_B
-	.set C, REG_C
-	.set D, REG_D
-	.set E, REG_E
-	.set TB, REG_TB
-	.set TA, REG_TA
-
-	.set RA, REG_RA
-	.set RB, REG_RB
-	.set RC, REG_RC
-	.set RD, REG_RD
-	.set RE, REG_RE
-
-	.set RTA, REG_RTA
-	.set RTB, REG_RTB
-
-	.set T1, REG_T1
-.endm
-
-#define HASH_PTR	%r9
-#define BLOCKS_CTR	%r8
-#define BUFFER_PTR	%r10
-#define BUFFER_PTR2	%r13
-
-#define PRECALC_BUF	%r14
-#define WK_BUF		%r15
-
-#define W_TMP		%xmm0
-#define WY_TMP		%ymm0
-#define WY_TMP2		%ymm9
-
-# AVX2 variables
-#define WY0		%ymm3
-#define WY4		%ymm5
-#define WY08		%ymm7
-#define WY12		%ymm8
-#define WY16		%ymm12
-#define WY20		%ymm13
-#define WY24		%ymm14
-#define WY28		%ymm15
-
-#define YMM_SHUFB_BSWAP	%ymm10
-
-/*
- * Keep 2 iterations precalculated at a time:
- *    - 80 DWORDs per iteration * 2
- */
-#define W_SIZE		(80*2*2 +16)
-
-#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
-#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
-
-
-.macro UPDATE_HASH  hash, val
-	add	\hash, \val
-	mov	\val, \hash
-.endm
-
-.macro PRECALC_RESET_WY
-	.set WY_00, WY0
-	.set WY_04, WY4
-	.set WY_08, WY08
-	.set WY_12, WY12
-	.set WY_16, WY16
-	.set WY_20, WY20
-	.set WY_24, WY24
-	.set WY_28, WY28
-	.set WY_32, WY_00
-.endm
-
-.macro PRECALC_ROTATE_WY
-	/* Rotate macros */
-	.set WY_32, WY_28
-	.set WY_28, WY_24
-	.set WY_24, WY_20
-	.set WY_20, WY_16
-	.set WY_16, WY_12
-	.set WY_12, WY_08
-	.set WY_08, WY_04
-	.set WY_04, WY_00
-	.set WY_00, WY_32
-
-	/* Define register aliases */
-	.set WY, WY_00
-	.set WY_minus_04, WY_04
-	.set WY_minus_08, WY_08
-	.set WY_minus_12, WY_12
-	.set WY_minus_16, WY_16
-	.set WY_minus_20, WY_20
-	.set WY_minus_24, WY_24
-	.set WY_minus_28, WY_28
-	.set WY_minus_32, WY
-.endm
-
-.macro PRECALC_00_15
-	.if (i == 0) # Initialize and rotate registers
-		PRECALC_RESET_WY
-		PRECALC_ROTATE_WY
-	.endif
-
-	/* message scheduling pre-compute for rounds 0-15 */
-	.if   ((i & 7) == 0)
-		/*
-		 * blended AVX2 and ALU instruction scheduling
-		 * 1 vector iteration per 8 rounds
-		 */
-		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
-	.elseif ((i & 7) == 1)
-		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
-			 WY_TMP, WY_TMP
-	.elseif ((i & 7) == 2)
-		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
-	.elseif ((i & 7) == 4)
-		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
-	.elseif ((i & 7) == 7)
-		vmovdqu  WY_TMP, PRECALC_WK(i&~7)
-
-		PRECALC_ROTATE_WY
-	.endif
-.endm
-
-.macro PRECALC_16_31
-	/*
-	 * message scheduling pre-compute for rounds 16-31
-	 * calculating last 32 w[i] values in 8 XMM registers
-	 * pre-calculate K+w[i] values and store to mem
-	 * for later load by ALU add instruction
-	 *
-	 * "brute force" vectorization for rounds 16-31 only
-	 * due to w[i]->w[i-3] dependency
-	 */
-	.if   ((i & 7) == 0)
-		/*
-		 * blended AVX2 and ALU instruction scheduling
-		 * 1 vector iteration per 8 rounds
-		 */
-		/* w[i-14] */
-		vpalignr	$8, WY_minus_16, WY_minus_12, WY
-		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
-	.elseif ((i & 7) == 1)
-		vpxor	WY_minus_08, WY, WY
-		vpxor	WY_minus_16, WY_TMP, WY_TMP
-	.elseif ((i & 7) == 2)
-		vpxor	WY_TMP, WY, WY
-		vpslldq	$12, WY, WY_TMP2
-	.elseif ((i & 7) == 3)
-		vpslld	$1, WY, WY_TMP
-		vpsrld	$31, WY, WY
-	.elseif ((i & 7) == 4)
-		vpor	WY, WY_TMP, WY_TMP
-		vpslld	$2, WY_TMP2, WY
-	.elseif ((i & 7) == 5)
-		vpsrld	$30, WY_TMP2, WY_TMP2
-		vpxor	WY, WY_TMP, WY_TMP
-	.elseif ((i & 7) == 7)
-		vpxor	WY_TMP2, WY_TMP, WY
-		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
-		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
-
-		PRECALC_ROTATE_WY
-	.endif
-.endm
-
-.macro PRECALC_32_79
-	/*
-	 * in SHA-1 specification:
-	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
-	 * instead we do equal:
-	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
-	 * allows more efficient vectorization
-	 * since w[i]=>w[i-3] dependency is broken
-	 */
-
-	.if   ((i & 7) == 0)
-	/*
-	 * blended AVX2 and ALU instruction scheduling
-	 * 1 vector iteration per 8 rounds
-	 */
-		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
-	.elseif ((i & 7) == 1)
-		/* W is W_minus_32 before xor */
-		vpxor	WY_minus_28, WY, WY
-	.elseif ((i & 7) == 2)
-		vpxor	WY_minus_16, WY_TMP, WY_TMP
-	.elseif ((i & 7) == 3)
-		vpxor	WY_TMP, WY, WY
-	.elseif ((i & 7) == 4)
-		vpslld	$2, WY, WY_TMP
-	.elseif ((i & 7) == 5)
-		vpsrld	$30, WY, WY
-		vpor	WY, WY_TMP, WY
-	.elseif ((i & 7) == 7)
-		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
-		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
-
-		PRECALC_ROTATE_WY
-	.endif
-.endm
-
-.macro PRECALC r, s
-	.set i, \r
-
-	.if (i < 40)
-		.set K_XMM, 32*0
-	.elseif (i < 80)
-		.set K_XMM, 32*1
-	.elseif (i < 120)
-		.set K_XMM, 32*2
-	.else
-		.set K_XMM, 32*3
-	.endif
-
-	.if (i<32)
-		PRECALC_00_15	\s
-	.elseif (i<64)
-		PRECALC_16_31	\s
-	.elseif (i < 160)
-		PRECALC_32_79	\s
-	.endif
-.endm
-
-.macro ROTATE_STATE
-	.set T_REG, E
-	.set E, D
-	.set D, C
-	.set C, B
-	.set B, TB
-	.set TB, A
-	.set A, T_REG
-
-	.set T_REG, RE
-	.set RE, RD
-	.set RD, RC
-	.set RC, RB
-	.set RB, RTB
-	.set RTB, RA
-	.set RA, T_REG
-.endm
-
-/* Macro relies on saved ROUND_Fx */
-
-.macro RND_FUN f, r
-	.if (\f == RND_F1)
-		ROUND_F1	\r
-	.elseif (\f == RND_F2)
-		ROUND_F2	\r
-	.elseif (\f == RND_F3)
-		ROUND_F3	\r
-	.endif
-.endm
-
-.macro RR r
-	.set round_id, (\r % 80)
-
-	.if (round_id == 0)        /* Precalculate F for first round */
-		.set ROUND_FUNC, RND_F1
-		mov	B, TB
-
-		rorx	$(32-30), B, B    /* b>>>2 */
-		andn	D, TB, T1
-		and	C, TB
-		xor	T1, TB
-	.endif
-
-	RND_FUN ROUND_FUNC, \r
-	ROTATE_STATE
-
-	.if   (round_id == 18)
-		.set ROUND_FUNC, RND_F2
-	.elseif (round_id == 38)
-		.set ROUND_FUNC, RND_F3
-	.elseif (round_id == 58)
-		.set ROUND_FUNC, RND_F2
-	.endif
-
-	.set round_id, ( (\r+1) % 80)
-
-	RND_FUN ROUND_FUNC, (\r+1)
-	ROTATE_STATE
-.endm
-
-.macro ROUND_F1 r
-	add	WK(\r), E
-
-	andn	C, A, T1			/* ~b&d */
-	lea	(RE,RTB), E		/* Add F from the previous round */
-
-	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
-	rorx	$(32-30),A, TB		/* b>>>2 for next round */
-
-	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
-
-	/*
-	 * Calculate F for the next round
-	 * (b & c) ^ andn[b, d]
-	 */
-	and	B, A			/* b&c */
-	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */
-
-	lea	(RE,RTA), E		/* E += A >>> 5 */
-.endm
-
-.macro ROUND_F2 r
-	add	WK(\r), E
-	lea	(RE,RTB), E		/* Add F from the previous round */
-
-	/* Calculate F for the next round */
-	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
-	.if ((round_id) < 79)
-		rorx	$(32-30), A, TB	/* b>>>2 for next round */
-	.endif
-	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
-
-	.if ((round_id) < 79)
-		xor	B, A
-	.endif
-
-	add	TA, E			/* E += A >>> 5 */
-
-	.if ((round_id) < 79)
-		xor	C, A
-	.endif
-.endm
-
-.macro ROUND_F3 r
-	add	WK(\r), E
-	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
-
-	lea	(RE,RTB), E		/* Add F from the previous round */
-
-	mov	B, T1
-	or	A, T1
-
-	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
-	rorx	$(32-30), A, TB		/* b>>>2 for next round */
-
-	/* Calculate F for the next round
-	 * (b and c) or (d and (b or c))
-	 */
-	and	C, T1
-	and	B, A
-	or	T1, A
-
-	add	TA, E			/* E += A >>> 5 */
-
-.endm
-
-/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
- * %1 + %2 >= %3 ? %4 : 0
- */
-.macro ADD_IF_GE a, b, c, d
-	mov     \a, RTA
-	add     $\d, RTA
-	cmp     $\c, \b
-	cmovge  RTA, \a
-.endm
-
-/*
- * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
- */
-.macro SHA1_PIPELINED_MAIN_BODY
-
-	REGALLOC
-
-	mov	(HASH_PTR), A
-	mov	4(HASH_PTR), B
-	mov	8(HASH_PTR), C
-	mov	12(HASH_PTR), D
-	mov	16(HASH_PTR), E
-
-	mov	%rsp, PRECALC_BUF
-	lea	(2*4*80+32)(%rsp), WK_BUF
-
-	# Precalc WK for first 2 blocks
-	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
-	.set i, 0
-	.rept    160
-		PRECALC i
-		.set i, i + 1
-	.endr
-
-	/* Go to next block if needed */
-	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
-	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
-	xchg	WK_BUF, PRECALC_BUF
-
-	.align 32
-.L_loop:
-	/*
-	 * code loops through more than one block
-	 * we use K_BASE value as a signal of a last block,
-	 * it is set below by: cmovae BUFFER_PTR, K_BASE
-	 */
-	test BLOCKS_CTR, BLOCKS_CTR
-	jnz .L_begin
-	.align 32
-	jmp	.L_end
-	.align 32
-.L_begin:
-
-	/*
-	 * Do first block
-	 * rounds: 0,2,4,6,8
-	 */
-	.set j, 0
-	.rept 5
-		RR	j
-		.set j, j+2
-	.endr
-
-	/*
-	 * rounds:
-	 * 10,12,14,16,18
-	 * 20,22,24,26,28
-	 * 30,32,34,36,38
-	 * 40,42,44,46,48
-	 * 50,52,54,56,58
-	 */
-	.rept 25
-		RR	j
-		.set j, j+2
-	.endr
-
-	/* Update Counter */
-	sub $1, BLOCKS_CTR
-	/* Move to the next block only if needed*/
-	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
-	/*
-	 * rounds
-	 * 60,62,64,66,68
-	 * 70,72,74,76,78
-	 */
-	.rept 10
-		RR	j
-		.set j, j+2
-	.endr
-
-	UPDATE_HASH	(HASH_PTR), A
-	UPDATE_HASH	4(HASH_PTR), TB
-	UPDATE_HASH	8(HASH_PTR), C
-	UPDATE_HASH	12(HASH_PTR), D
-	UPDATE_HASH	16(HASH_PTR), E
-
-	test	BLOCKS_CTR, BLOCKS_CTR
-	jz	.L_loop
-
-	mov	TB, B
-
-	/* Process second block */
-	/*
-	 * rounds
-	 *  0+80, 2+80, 4+80, 6+80, 8+80
-	 * 10+80,12+80,14+80,16+80,18+80
-	 */
-
-	.set j, 0
-	.rept 10
-		RR	j+80
-		.set j, j+2
-	.endr
-
-	/*
-	 * rounds
-	 * 20+80,22+80,24+80,26+80,28+80
-	 * 30+80,32+80,34+80,36+80,38+80
-	 */
-	.rept 10
-		RR	j+80
-		.set j, j+2
-	.endr
-
-	/*
-	 * rounds
-	 * 40+80,42+80,44+80,46+80,48+80
-	 * 50+80,52+80,54+80,56+80,58+80
-	 */
-	.rept 10
-		RR	j+80
-		.set j, j+2
-	.endr
-
-	/* update counter */
-	sub     $1, BLOCKS_CTR
-	/* Move to the next block only if needed*/
-	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
-
-	/*
-	 * rounds
-	 * 60+80,62+80,64+80,66+80,68+80
-	 * 70+80,72+80,74+80,76+80,78+80
-	 */
-	.rept 10
-		RR	j+80
-		.set j, j+2
-	.endr
-
-	UPDATE_HASH	(HASH_PTR), A
-	UPDATE_HASH	4(HASH_PTR), TB
-	UPDATE_HASH	8(HASH_PTR), C
-	UPDATE_HASH	12(HASH_PTR), D
-	UPDATE_HASH	16(HASH_PTR), E
-
-	/* Reset state for AVX2 reg permutation */
-	mov	A, TA
-	mov	TB, A
-	mov	C, TB
-	mov	E, C
-	mov	D, B
-	mov	TA, D
-
-	REGALLOC
-
-	xchg	WK_BUF, PRECALC_BUF
-
-	jmp	.L_loop
-
-	.align 32
-.L_end:
-
-.endm
-/*
- * macro implements SHA-1 function's body for several 64-byte blocks
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM  name
-	SYM_FUNC_START(\name)
-
-	push	%rbx
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-
-	RESERVE_STACK  = (W_SIZE*4 + 8+24)
-
-	/* Align stack */
-	push	%rbp
-	mov	%rsp, %rbp
-	and	$~(0x20-1), %rsp
-	sub	$RESERVE_STACK, %rsp
-
-	avx2_zeroupper
-
-	/* Setup initial values */
-	mov	CTX, HASH_PTR
-	mov	BUF, BUFFER_PTR
-
-	mov	BUF, BUFFER_PTR2
-	mov	CNT, BLOCKS_CTR
-
-	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
-
-	SHA1_PIPELINED_MAIN_BODY
-
-	avx2_zeroupper
-
-	mov	%rbp, %rsp
-	pop	%rbp
-
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbx
-
-	RET
-
-	SYM_FUNC_END(\name)
-.endm
-
-.section .rodata
-
-#define K1 0x5a827999
-#define K2 0x6ed9eba1
-#define K3 0x8f1bbcdc
-#define K4 0xca62c1d6
-
-.align 128
-K_XMM_AR:
-	.long K1, K1, K1, K1
-	.long K1, K1, K1, K1
-	.long K2, K2, K2, K2
-	.long K2, K2, K2, K2
-	.long K3, K3, K3, K3
-	.long K3, K3, K3, K3
-	.long K4, K4, K4, K4
-	.long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
-	.long 0x00010203
-	.long 0x04050607
-	.long 0x08090a0b
-	.long 0x0c0d0e0f
-	.long 0x00010203
-	.long 0x04050607
-	.long 0x08090a0b
-	.long 0x0c0d0e0f
-.text
-
-SHA1_VECTOR_ASM     sha1_transform_avx2
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
deleted file mode 100644
index cade913d4882..000000000000
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * 	Sean Gulley <sean.m.gulley@intel.com>
- * 	Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 	* Redistributions of source code must retain the above copyright
- * 	  notice, this list of conditions and the following disclaimer.
- * 	* Redistributions in binary form must reproduce the above copyright
- * 	  notice, this list of conditions and the following disclaimer in
- * 	  the documentation and/or other materials provided with the
- * 	  distribution.
- * 	* Neither the name of Intel Corporation nor the names of its
- * 	  contributors may be used to endorse or promote products derived
- * 	  from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define DIGEST_PTR	%rdi	/* 1st arg */
-#define DATA_PTR	%rsi	/* 2nd arg */
-#define NUM_BLKS	%rdx	/* 3rd arg */
-
-/* gcc conversion */
-#define FRAME_SIZE	32	/* space for 2x16 bytes */
-
-#define ABCD		%xmm0
-#define E0		%xmm1	/* Need two E's b/c they ping pong */
-#define E1		%xmm2
-#define MSG0		%xmm3
-#define MSG1		%xmm4
-#define MSG2		%xmm5
-#define MSG3		%xmm6
-#define SHUF_MASK	%xmm7
-
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-1 update function
- *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64 byte blocks to process.  Once all blocks have
- * been processed, the digest pointer is  updated with the resulting hash value.
- * The function only processes complete blocks, there is no functionality to
- * store partial blocks. All message padding and hash value initialization must
- * be done outside the update function.
- *
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
- * void sha1_ni_transform(uint32_t *digest, const void *data,
-		uint32_t numBlocks)
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
- */
-.text
-SYM_TYPED_FUNC_START(sha1_ni_transform)
-	push		%rbp
-	mov		%rsp, %rbp
-	sub		$FRAME_SIZE, %rsp
-	and		$~0xF, %rsp
-
-	shl		$6, NUM_BLKS		/* convert to bytes */
-	jz		.Ldone_hash
-	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
-
-	/* load initial hash values */
-	pinsrd		$3, 1*16(DIGEST_PTR), E0
-	movdqu		0*16(DIGEST_PTR), ABCD
-	pand		UPPER_WORD_MASK(%rip), E0
-	pshufd		$0x1B, ABCD, ABCD
-
-	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-
-.Lloop0:
-	/* Save hash values for addition after rounds */
-	movdqa		E0, (0*16)(%rsp)
-	movdqa		ABCD, (1*16)(%rsp)
-
-	/* Rounds 0-3 */
-	movdqu		0*16(DATA_PTR), MSG0
-	pshufb		SHUF_MASK, MSG0
-		paddd		MSG0, E0
-		movdqa		ABCD, E1
-		sha1rnds4	$0, E0, ABCD
-
-	/* Rounds 4-7 */
-	movdqu		1*16(DATA_PTR), MSG1
-	pshufb		SHUF_MASK, MSG1
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-		sha1rnds4	$0, E1, ABCD
-	sha1msg1	MSG1, MSG0
-
-	/* Rounds 8-11 */
-	movdqu		2*16(DATA_PTR), MSG2
-	pshufb		SHUF_MASK, MSG2
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-		sha1rnds4	$0, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 12-15 */
-	movdqu		3*16(DATA_PTR), MSG3
-	pshufb		SHUF_MASK, MSG3
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$0, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 16-19 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$0, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 20-23 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$1, E1, ABCD
-	sha1msg1	MSG1, MSG0
-	pxor		MSG1, MSG3
-
-	/* Rounds 24-27 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$1, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 28-31 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$1, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 32-35 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$1, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 36-39 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$1, E1, ABCD
-	sha1msg1	MSG1, MSG0
-	pxor		MSG1, MSG3
-
-	/* Rounds 40-43 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$2, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 44-47 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$2, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 48-51 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$2, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 52-55 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$2, E1, ABCD
-	sha1msg1	MSG1, MSG0
-	pxor		MSG1, MSG3
-
-	/* Rounds 56-59 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$2, E0, ABCD
-	sha1msg1	MSG2, MSG1
-	pxor		MSG2, MSG0
-
-	/* Rounds 60-63 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG3, MSG0
-		sha1rnds4	$3, E1, ABCD
-	sha1msg1	MSG3, MSG2
-	pxor		MSG3, MSG1
-
-	/* Rounds 64-67 */
-		sha1nexte	MSG0, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG0, MSG1
-		sha1rnds4	$3, E0, ABCD
-	sha1msg1	MSG0, MSG3
-	pxor		MSG0, MSG2
-
-	/* Rounds 68-71 */
-		sha1nexte	MSG1, E1
-		movdqa		ABCD, E0
-	sha1msg2	MSG1, MSG2
-		sha1rnds4	$3, E1, ABCD
-	pxor		MSG1, MSG3
-
-	/* Rounds 72-75 */
-		sha1nexte	MSG2, E0
-		movdqa		ABCD, E1
-	sha1msg2	MSG2, MSG3
-		sha1rnds4	$3, E0, ABCD
-
-	/* Rounds 76-79 */
-		sha1nexte	MSG3, E1
-		movdqa		ABCD, E0
-		sha1rnds4	$3, E1, ABCD
-
-	/* Add current hash values with previously saved */
-	sha1nexte	(0*16)(%rsp), E0
-	paddd		(1*16)(%rsp), ABCD
-
-	/* Increment data pointer and loop if more to process */
-	add		$64, DATA_PTR
-	cmp		NUM_BLKS, DATA_PTR
-	jne		.Lloop0
-
-	/* Write hash values back in the correct order */
-	pshufd		$0x1B, ABCD, ABCD
-	movdqu		ABCD, 0*16(DIGEST_PTR)
-	pextrd		$3, E0, 1*16(DIGEST_PTR)
-
-.Ldone_hash:
-	mov		%rbp, %rsp
-	pop		%rbp
-
-	RET
-SYM_FUNC_END(sha1_ni_transform)
-
-.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
-	.octa 0x000102030405060708090a0b0c0d0e0f
-
-.section	.rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
-.align 16
-UPPER_WORD_MASK:
-	.octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
deleted file mode 100644
index f54988c80eb4..000000000000
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ /dev/null
@@ -1,554 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
- * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
- * processors. CPUs supporting Intel(R) AVX extensions will get an additional
- * boost.
- *
- * This work was inspired by the vectorized implementation of Dean Gaudet.
- * Additional information on it can be found at:
- *    http://www.arctic.org/~dean/crypto/sha1.html
- *
- * It was improved upon with more efficient vectorization of the message
- * scheduling. This implementation has also been optimized for all current and
- * several future generations of Intel CPUs.
- *
- * See this article for more information about the implementation details:
- *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
- *
- * Copyright (C) 2010, Intel Corp.
- *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
- *            Ronen Zohar <ronen.zohar@intel.com>
- *
- * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
- *   Author: Mathias Krause <minipli@googlemail.com>
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define CTX	%rdi	// arg1
-#define BUF	%rsi	// arg2
-#define CNT	%rdx	// arg3
-
-#define REG_A	%ecx
-#define REG_B	%esi
-#define REG_C	%edi
-#define REG_D	%r12d
-#define REG_E	%edx
-
-#define REG_T1	%eax
-#define REG_T2	%ebx
-
-#define K_BASE		%r8
-#define HASH_PTR	%r9
-#define BUFFER_PTR	%r10
-#define BUFFER_END	%r11
-
-#define W_TMP1	%xmm0
-#define W_TMP2	%xmm9
-
-#define W0	%xmm1
-#define W4	%xmm2
-#define W8	%xmm3
-#define W12	%xmm4
-#define W16	%xmm5
-#define W20	%xmm6
-#define W24	%xmm7
-#define W28	%xmm8
-
-#define XMM_SHUFB_BSWAP	%xmm10
-
-/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
-#define WK(t)	(((t) & 15) * 4)(%rsp)
-#define W_PRECALC_AHEAD	16
-
-/*
- * This macro implements the SHA-1 function's body for single 64-byte block
- * param: function's name
- */
-.macro SHA1_VECTOR_ASM  name
-	SYM_TYPED_FUNC_START(\name)
-
-	push	%rbx
-	push	%r12
-	push	%rbp
-	mov	%rsp, %rbp
-
-	sub	$64, %rsp		# allocate workspace
-	and	$~15, %rsp		# align stack
-
-	mov	CTX, HASH_PTR
-	mov	BUF, BUFFER_PTR
-
-	shl	$6, CNT			# multiply by 64
-	add	BUF, CNT
-	mov	CNT, BUFFER_END
-
-	lea	K_XMM_AR(%rip), K_BASE
-	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
-
-	SHA1_PIPELINED_MAIN_BODY
-
-	# cleanup workspace
-	mov	$8, %ecx
-	mov	%rsp, %rdi
-	xor	%eax, %eax
-	rep stosq
-
-	mov	%rbp, %rsp		# deallocate workspace
-	pop	%rbp
-	pop	%r12
-	pop	%rbx
-	RET
-
-	SYM_FUNC_END(\name)
-.endm
-
-/*
- * This macro implements 80 rounds of SHA-1 for one 64-byte block
- */
-.macro SHA1_PIPELINED_MAIN_BODY
-	INIT_REGALLOC
-
-	mov	  (HASH_PTR), A
-	mov	 4(HASH_PTR), B
-	mov	 8(HASH_PTR), C
-	mov	12(HASH_PTR), D
-	mov	16(HASH_PTR), E
-
-  .set i, 0
-  .rept W_PRECALC_AHEAD
-	W_PRECALC i
-    .set i, (i+1)
-  .endr
-
-.align 4
-1:
-	RR F1,A,B,C,D,E,0
-	RR F1,D,E,A,B,C,2
-	RR F1,B,C,D,E,A,4
-	RR F1,E,A,B,C,D,6
-	RR F1,C,D,E,A,B,8
-
-	RR F1,A,B,C,D,E,10
-	RR F1,D,E,A,B,C,12
-	RR F1,B,C,D,E,A,14
-	RR F1,E,A,B,C,D,16
-	RR F1,C,D,E,A,B,18
-
-	RR F2,A,B,C,D,E,20
-	RR F2,D,E,A,B,C,22
-	RR F2,B,C,D,E,A,24
-	RR F2,E,A,B,C,D,26
-	RR F2,C,D,E,A,B,28
-
-	RR F2,A,B,C,D,E,30
-	RR F2,D,E,A,B,C,32
-	RR F2,B,C,D,E,A,34
-	RR F2,E,A,B,C,D,36
-	RR F2,C,D,E,A,B,38
-
-	RR F3,A,B,C,D,E,40
-	RR F3,D,E,A,B,C,42
-	RR F3,B,C,D,E,A,44
-	RR F3,E,A,B,C,D,46
-	RR F3,C,D,E,A,B,48
-
-	RR F3,A,B,C,D,E,50
-	RR F3,D,E,A,B,C,52
-	RR F3,B,C,D,E,A,54
-	RR F3,E,A,B,C,D,56
-	RR F3,C,D,E,A,B,58
-
-	add	$64, BUFFER_PTR		# move to the next 64-byte block
-	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
-	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
-
-	RR F4,A,B,C,D,E,60
-	RR F4,D,E,A,B,C,62
-	RR F4,B,C,D,E,A,64
-	RR F4,E,A,B,C,D,66
-	RR F4,C,D,E,A,B,68
-
-	RR F4,A,B,C,D,E,70
-	RR F4,D,E,A,B,C,72
-	RR F4,B,C,D,E,A,74
-	RR F4,E,A,B,C,D,76
-	RR F4,C,D,E,A,B,78
-
-	UPDATE_HASH   (HASH_PTR), A
-	UPDATE_HASH  4(HASH_PTR), B
-	UPDATE_HASH  8(HASH_PTR), C
-	UPDATE_HASH 12(HASH_PTR), D
-	UPDATE_HASH 16(HASH_PTR), E
-
-	RESTORE_RENAMED_REGS
-	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
-	jne	1b
-.endm
-
-.macro INIT_REGALLOC
-  .set A, REG_A
-  .set B, REG_B
-  .set C, REG_C
-  .set D, REG_D
-  .set E, REG_E
-  .set T1, REG_T1
-  .set T2, REG_T2
-.endm
-
-.macro RESTORE_RENAMED_REGS
-	# order is important (REG_C is where it should be)
-	mov	B, REG_B
-	mov	D, REG_D
-	mov	A, REG_A
-	mov	E, REG_E
-.endm
-
-.macro SWAP_REG_NAMES  a, b
-  .set _T, \a
-  .set \a, \b
-  .set \b, _T
-.endm
-
-.macro F1  b, c, d
-	mov	\c, T1
-	SWAP_REG_NAMES \c, T1
-	xor	\d, T1
-	and	\b, T1
-	xor	\d, T1
-.endm
-
-.macro F2  b, c, d
-	mov	\d, T1
-	SWAP_REG_NAMES \d, T1
-	xor	\c, T1
-	xor	\b, T1
-.endm
-
-.macro F3  b, c ,d
-	mov	\c, T1
-	SWAP_REG_NAMES \c, T1
-	mov	\b, T2
-	or	\b, T1
-	and	\c, T2
-	and	\d, T1
-	or	T2, T1
-.endm
-
-.macro F4  b, c, d
-	F2 \b, \c, \d
-.endm
-
-.macro UPDATE_HASH  hash, val
-	add	\hash, \val
-	mov	\val, \hash
-.endm
-
-/*
- * RR does two rounds of SHA-1 back to back with W[] pre-calc
- *   t1 = F(b, c, d);   e += w(i)
- *   e += t1;           b <<= 30;   d  += w(i+1);
- *   t1 = F(a, b, c);
- *   d += t1;           a <<= 5;
- *   e += a;
- *   t1 = e;            a >>= 7;
- *   t1 <<= 5;
- *   d += t1;
- */
-.macro RR  F, a, b, c, d, e, round
-	add	WK(\round), \e
-	\F   \b, \c, \d		# t1 = F(b, c, d);
-	W_PRECALC (\round + W_PRECALC_AHEAD)
-	rol	$30, \b
-	add	T1, \e
-	add	WK(\round + 1), \d
-
-	\F   \a, \b, \c
-	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
-	rol	$5, \a
-	add	\a, \e
-	add	T1, \d
-	ror	$7, \a		# (a <<r 5) >>r 7) => a <<r 30)
-
-	mov	\e, T1
-	SWAP_REG_NAMES \e, T1
-
-	rol	$5, T1
-	add	T1, \d
-
-	# write:  \a, \b
-	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
-.endm
-
-.macro W_PRECALC  r
-  .set i, \r
-
-  .if (i < 20)
-    .set K_XMM, 0
-  .elseif (i < 40)
-    .set K_XMM, 16
-  .elseif (i < 60)
-    .set K_XMM, 32
-  .elseif (i < 80)
-    .set K_XMM, 48
-  .endif
-
-  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
-    .set i, ((\r) % 80)	    # pre-compute for the next iteration
-    .if (i == 0)
-	W_PRECALC_RESET
-    .endif
-	W_PRECALC_00_15
-  .elseif (i<32)
-	W_PRECALC_16_31
-  .elseif (i < 80)   // rounds 32-79
-	W_PRECALC_32_79
-  .endif
-.endm
-
-.macro W_PRECALC_RESET
-  .set W,          W0
-  .set W_minus_04, W4
-  .set W_minus_08, W8
-  .set W_minus_12, W12
-  .set W_minus_16, W16
-  .set W_minus_20, W20
-  .set W_minus_24, W24
-  .set W_minus_28, W28
-  .set W_minus_32, W
-.endm
-
-.macro W_PRECALC_ROTATE
-  .set W_minus_32, W_minus_28
-  .set W_minus_28, W_minus_24
-  .set W_minus_24, W_minus_20
-  .set W_minus_20, W_minus_16
-  .set W_minus_16, W_minus_12
-  .set W_minus_12, W_minus_08
-  .set W_minus_08, W_minus_04
-  .set W_minus_04, W
-  .set W,          W_minus_32
-.endm
-
-.macro W_PRECALC_SSSE3
-
-.macro W_PRECALC_00_15
-	W_PRECALC_00_15_SSSE3
-.endm
-.macro W_PRECALC_16_31
-	W_PRECALC_16_31_SSSE3
-.endm
-.macro W_PRECALC_32_79
-	W_PRECALC_32_79_SSSE3
-.endm
-
-/* message scheduling pre-compute for rounds 0-15 */
-.macro W_PRECALC_00_15_SSSE3
-  .if ((i & 3) == 0)
-	movdqu	(i*4)(BUFFER_PTR), W_TMP1
-  .elseif ((i & 3) == 1)
-	pshufb	XMM_SHUFB_BSWAP, W_TMP1
-	movdqa	W_TMP1, W
-  .elseif ((i & 3) == 2)
-	paddd	(K_BASE), W_TMP1
-  .elseif ((i & 3) == 3)
-	movdqa  W_TMP1, WK(i&~3)
-	W_PRECALC_ROTATE
-  .endif
-.endm
-
-/* message scheduling pre-compute for rounds 16-31
- *
- * - calculating last 32 w[i] values in 8 XMM registers
- * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
- *   instruction
- *
- * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
- * dependency, but improves for 32-79
- */
-.macro W_PRECALC_16_31_SSSE3
-  # blended scheduling of vector and scalar instruction streams, one 4-wide
-  # vector iteration / 4 scalar rounds
-  .if ((i & 3) == 0)
-	movdqa	W_minus_12, W
-	palignr	$8, W_minus_16, W	# w[i-14]
-	movdqa	W_minus_04, W_TMP1
-	psrldq	$4, W_TMP1		# w[i-3]
-	pxor	W_minus_08, W
-  .elseif ((i & 3) == 1)
-	pxor	W_minus_16, W_TMP1
-	pxor	W_TMP1, W
-	movdqa	W, W_TMP2
-	movdqa	W, W_TMP1
-	pslldq	$12, W_TMP2
-  .elseif ((i & 3) == 2)
-	psrld	$31, W
-	pslld	$1, W_TMP1
-	por	W, W_TMP1
-	movdqa	W_TMP2, W
-	psrld	$30, W_TMP2
-	pslld	$2, W
-  .elseif ((i & 3) == 3)
-	pxor	W, W_TMP1
-	pxor	W_TMP2, W_TMP1
-	movdqa	W_TMP1, W
-	paddd	K_XMM(K_BASE), W_TMP1
-	movdqa	W_TMP1, WK(i&~3)
-	W_PRECALC_ROTATE
-  .endif
-.endm
-
-/* message scheduling pre-compute for rounds 32-79
- *
- * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
- * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
- * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
- */
-.macro W_PRECALC_32_79_SSSE3
-  .if ((i & 3) == 0)
-	movdqa	W_minus_04, W_TMP1
-	pxor	W_minus_28, W		# W is W_minus_32 before xor
-	palignr	$8, W_minus_08, W_TMP1
-  .elseif ((i & 3) == 1)
-	pxor	W_minus_16, W
-	pxor	W_TMP1, W
-	movdqa	W, W_TMP1
-  .elseif ((i & 3) == 2)
-	psrld	$30, W
-	pslld	$2, W_TMP1
-	por	W, W_TMP1
-  .elseif ((i & 3) == 3)
-	movdqa	W_TMP1, W
-	paddd	K_XMM(K_BASE), W_TMP1
-	movdqa	W_TMP1, WK(i&~3)
-	W_PRECALC_ROTATE
-  .endif
-.endm
-
-.endm		// W_PRECALC_SSSE3
-
-
-#define K1	0x5a827999
-#define K2	0x6ed9eba1
-#define K3	0x8f1bbcdc
-#define K4	0xca62c1d6
-
-.section .rodata
-.align 16
-
-K_XMM_AR:
-	.long K1, K1, K1, K1
-	.long K2, K2, K2, K2
-	.long K3, K3, K3, K3
-	.long K4, K4, K4, K4
-
-BSWAP_SHUFB_CTL:
-	.long 0x00010203
-	.long 0x04050607
-	.long 0x08090a0b
-	.long 0x0c0d0e0f
-
-
-.section .text
-
-W_PRECALC_SSSE3
-.macro xmm_mov a, b
-	movdqu	\a,\b
-.endm
-
-/*
- * SSSE3 optimized implementation:
- *
- * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
- *					const u8 *data, int blocks);
- *
- * Note that struct sha1_state is assumed to begin with u32 state[5].
- */
-SHA1_VECTOR_ASM     sha1_transform_ssse3
-
-.macro W_PRECALC_AVX
-
-.purgem W_PRECALC_00_15
-.macro  W_PRECALC_00_15
-    W_PRECALC_00_15_AVX
-.endm
-.purgem W_PRECALC_16_31
-.macro  W_PRECALC_16_31
-    W_PRECALC_16_31_AVX
-.endm
-.purgem W_PRECALC_32_79
-.macro  W_PRECALC_32_79
-    W_PRECALC_32_79_AVX
-.endm
-
-.macro W_PRECALC_00_15_AVX
-  .if ((i & 3) == 0)
-	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
-  .elseif ((i & 3) == 1)
-	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
-  .elseif ((i & 3) == 2)
-	vpaddd	(K_BASE), W, W_TMP1
-  .elseif ((i & 3) == 3)
-	vmovdqa	W_TMP1, WK(i&~3)
-	W_PRECALC_ROTATE
-  .endif
-.endm
-
-.macro W_PRECALC_16_31_AVX
-  .if ((i & 3) == 0)
-	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
-	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
-	vpxor	W_minus_08, W, W
-	vpxor	W_minus_16, W_TMP1, W_TMP1
-  .elseif ((i & 3) == 1)
-	vpxor	W_TMP1, W, W
-	vpslldq	$12, W, W_TMP2
-	vpslld	$1, W, W_TMP1
-  .elseif ((i & 3) == 2)
-	vpsrld	$31, W, W
-	vpor	W, W_TMP1, W_TMP1
-	vpslld	$2, W_TMP2, W
-	vpsrld	$30, W_TMP2, W_TMP2
-  .elseif ((i & 3) == 3)
-	vpxor	W, W_TMP1, W_TMP1
-	vpxor	W_TMP2, W_TMP1, W
-	vpaddd	K_XMM(K_BASE), W, W_TMP1
-	vmovdqu	W_TMP1, WK(i&~3)
-	W_PRECALC_ROTATE
-  .endif
-.endm
-
-.macro W_PRECALC_32_79_AVX
-  .if ((i & 3) == 0)
-	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
-	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
-  .elseif ((i & 3) == 1)
-	vpxor	W_minus_16, W_TMP1, W_TMP1
-	vpxor	W_TMP1, W, W
-  .elseif ((i & 3) == 2)
-	vpslld	$2, W, W_TMP1
-	vpsrld	$30, W, W
-	vpor	W, W_TMP1, W
-  .elseif ((i & 3) == 3)
-	vpaddd	K_XMM(K_BASE), W, W_TMP1
-	vmovdqu	W_TMP1, WK(i&~3)
-	W_PRECALC_ROTATE
-  .endif
-.endm
-
-.endm    // W_PRECALC_AVX
-
-W_PRECALC_AVX
-.purgem xmm_mov
-.macro xmm_mov a, b
-	vmovdqu	\a,\b
-.endm
-
-
-/* AVX optimized implementation:
- *  extern "C" void sha1_transform_avx(struct sha1_state *state,
- *				       const u8 *data, int blocks);
- */
-SHA1_VECTOR_ASM     sha1_transform_avx
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
deleted file mode 100644
index 826579a7473c..000000000000
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ /dev/null
@@ -1,324 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementations
- * using SSSE3, AVX, AVX2, and SHA-NI instructions.
- *
- * This file is based on sha1_generic.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
-
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static const struct x86_cpu_id module_cpu_ids[] = {
-	X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
-	X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
-	X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
-	X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static inline int sha1_update_x86(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, sha1_block_fn *sha1_xform)
-{
-	int remain;
-
-	/*
-	 * Make sure struct sha1_state begins directly with the SHA1
-	 * 160-bit internal state, as this is what the asm functions expect.
-	 */
-	BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
-	kernel_fpu_begin();
-	remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform);
-	kernel_fpu_end();
-
-	return remain;
-}
-
-static inline int sha1_finup(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, u8 *out,
-			     sha1_block_fn *sha1_xform)
-{
-	kernel_fpu_begin();
-	sha1_base_do_finup(desc, data, len, sha1_xform);
-	kernel_fpu_end();
-
-	return sha1_base_finish(desc, out);
-}
-
-asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
-				     const u8 *data, int blocks);
-
-static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	return sha1_update_x86(desc, data, len, sha1_transform_ssse3);
-}
-
-static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, u8 *out)
-{
-	return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
-}
-
-static struct shash_alg sha1_ssse3_alg = {
-	.digestsize	=	SHA1_DIGEST_SIZE,
-	.init		=	sha1_base_init,
-	.update		=	sha1_ssse3_update,
-	.finup		=	sha1_ssse3_finup,
-	.descsize	=	SHA1_STATE_SIZE,
-	.base		=	{
-		.cra_name	=	"sha1",
-		.cra_driver_name =	"sha1-ssse3",
-		.cra_priority	=	150,
-		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize	=	SHA1_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static int register_sha1_ssse3(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SSSE3))
-		return crypto_register_shash(&sha1_ssse3_alg);
-	return 0;
-}
-
-static void unregister_sha1_ssse3(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SSSE3))
-		crypto_unregister_shash(&sha1_ssse3_alg);
-}
-
-asmlinkage void sha1_transform_avx(struct sha1_state *state,
-				   const u8 *data, int blocks);
-
-static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	return sha1_update_x86(desc, data, len, sha1_transform_avx);
-}
-
-static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, u8 *out)
-{
-	return sha1_finup(desc, data, len, out, sha1_transform_avx);
-}
-
-static struct shash_alg sha1_avx_alg = {
-	.digestsize	=	SHA1_DIGEST_SIZE,
-	.init		=	sha1_base_init,
-	.update		=	sha1_avx_update,
-	.finup		=	sha1_avx_finup,
-	.descsize	=	SHA1_STATE_SIZE,
-	.base		=	{
-		.cra_name	=	"sha1",
-		.cra_driver_name =	"sha1-avx",
-		.cra_priority	=	160,
-		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize	=	SHA1_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static bool avx_usable(void)
-{
-	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
-		if (boot_cpu_has(X86_FEATURE_AVX))
-			pr_info("AVX detected but unusable.\n");
-		return false;
-	}
-
-	return true;
-}
-
-static int register_sha1_avx(void)
-{
-	if (avx_usable())
-		return crypto_register_shash(&sha1_avx_alg);
-	return 0;
-}
-
-static void unregister_sha1_avx(void)
-{
-	if (avx_usable())
-		crypto_unregister_shash(&sha1_avx_alg);
-}
-
-#define SHA1_AVX2_BLOCK_OPTSIZE	4	/* optimal 4*64 bytes of SHA1 blocks */
-
-asmlinkage void sha1_transform_avx2(struct sha1_state *state,
-				    const u8 *data, int blocks);
-
-static bool avx2_usable(void)
-{
-	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
-		&& boot_cpu_has(X86_FEATURE_BMI1)
-		&& boot_cpu_has(X86_FEATURE_BMI2))
-		return true;
-
-	return false;
-}
-
-static inline void sha1_apply_transform_avx2(struct sha1_state *state,
-					     const u8 *data, int blocks)
-{
-	/* Select the optimal transform based on data block size */
-	if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
-		sha1_transform_avx2(state, data, blocks);
-	else
-		sha1_transform_avx(state, data, blocks);
-}
-
-static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	return sha1_update_x86(desc, data, len, sha1_apply_transform_avx2);
-}
-
-static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, u8 *out)
-{
-	return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
-}
-
-static struct shash_alg sha1_avx2_alg = {
-	.digestsize	=	SHA1_DIGEST_SIZE,
-	.init		=	sha1_base_init,
-	.update		=	sha1_avx2_update,
-	.finup		=	sha1_avx2_finup,
-	.descsize	=	SHA1_STATE_SIZE,
-	.base		=	{
-		.cra_name	=	"sha1",
-		.cra_driver_name =	"sha1-avx2",
-		.cra_priority	=	170,
-		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize	=	SHA1_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static int register_sha1_avx2(void)
-{
-	if (avx2_usable())
-		return crypto_register_shash(&sha1_avx2_alg);
-	return 0;
-}
-
-static void unregister_sha1_avx2(void)
-{
-	if (avx2_usable())
-		crypto_unregister_shash(&sha1_avx2_alg);
-}
-
-asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
-				  int rounds);
-
-static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	return sha1_update_x86(desc, data, len, sha1_ni_transform);
-}
-
-static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, u8 *out)
-{
-	return sha1_finup(desc, data, len, out, sha1_ni_transform);
-}
-
-static struct shash_alg sha1_ni_alg = {
-	.digestsize	=	SHA1_DIGEST_SIZE,
-	.init		=	sha1_base_init,
-	.update		=	sha1_ni_update,
-	.finup		=	sha1_ni_finup,
-	.descsize	=	SHA1_STATE_SIZE,
-	.base		=	{
-		.cra_name	=	"sha1",
-		.cra_driver_name =	"sha1-ni",
-		.cra_priority	=	250,
-		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize	=	SHA1_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-};
-
-static int register_sha1_ni(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SHA_NI))
-		return crypto_register_shash(&sha1_ni_alg);
-	return 0;
-}
-
-static void unregister_sha1_ni(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SHA_NI))
-		crypto_unregister_shash(&sha1_ni_alg);
-}
-
-static int __init sha1_ssse3_mod_init(void)
-{
-	if (!x86_match_cpu(module_cpu_ids))
-		return -ENODEV;
-
-	if (register_sha1_ssse3())
-		goto fail;
-
-	if (register_sha1_avx()) {
-		unregister_sha1_ssse3();
-		goto fail;
-	}
-
-	if (register_sha1_avx2()) {
-		unregister_sha1_avx();
-		unregister_sha1_ssse3();
-		goto fail;
-	}
-
-	if (register_sha1_ni()) {
-		unregister_sha1_avx2();
-		unregister_sha1_avx();
-		unregister_sha1_ssse3();
-		goto fail;
-	}
-
-	return 0;
-fail:
-	return -ENODEV;
-}
-
-static void __exit sha1_ssse3_mod_fini(void)
-{
-	unregister_sha1_ni();
-	unregister_sha1_avx2();
-	unregister_sha1_avx();
-	unregister_sha1_ssse3();
-}
-
-module_init(sha1_ssse3_mod_init);
-module_exit(sha1_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_ALIAS_CRYPTO("sha1-ssse3");
-MODULE_ALIAS_CRYPTO("sha1-avx");
-MODULE_ALIAS_CRYPTO("sha1-avx2");
-MODULE_ALIAS_CRYPTO("sha1-ni");
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 019034f2bb53..dc34441a80bd 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -152,6 +152,7 @@ config CRYPTO_LIB_SHA1_ARCH
 	default y if PPC
 	default y if S390
 	default y if SPARC64
+	default y if X86_64
 
 config CRYPTO_LIB_SHA256
 	tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 7897da2de623..b09fdb7f4b4c 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -82,6 +82,9 @@ libsha1-y += powerpc/sha1-powerpc-asm.o
 libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o
 endif
 libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o
+libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \
+			 x86/sha1-avx2-asm.o \
+			 x86/sha1-ni-asm.o
 endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
 
 ################################################################################
diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S
new file mode 100644
index 000000000000..91b2dc6db179
--- /dev/null
+++ b/lib/crypto/x86/sha1-avx2-asm.S
@@ -0,0 +1,697 @@
+/*
+ *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ *This implementation is based on the previous SSSE3 release:
+ *Visit http://software.intel.com/en-us/articles/
+ *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * void sha1_transform_avx2(struct sha1_block_state *state,
+ *			    const u8 *data, size_t nblocks);
+ */
+
+#include <linux/linkage.h>
+
+#define	CTX	%rdi	/* arg1 */
+#define BUF	%rsi	/* arg2 */
+#define CNT	%rdx	/* arg3 */
+
+#define	REG_A	%ecx
+#define	REG_B	%esi
+#define	REG_C	%edi
+#define	REG_D	%eax
+#define	REG_E	%edx
+#define	REG_TB	%ebx
+#define	REG_TA	%r12d
+#define	REG_RA	%rcx
+#define	REG_RB	%rsi
+#define	REG_RC	%rdi
+#define	REG_RD	%rax
+#define	REG_RE	%rdx
+#define	REG_RTA	%r12
+#define	REG_RTB	%rbx
+#define	REG_T1	%r11d
+#define	xmm_mov	vmovups
+#define	avx2_zeroupper	vzeroupper
+#define	RND_F1	1
+#define	RND_F2	2
+#define	RND_F3	3
+
+.macro REGALLOC
+	.set A, REG_A
+	.set B, REG_B
+	.set C, REG_C
+	.set D, REG_D
+	.set E, REG_E
+	.set TB, REG_TB
+	.set TA, REG_TA
+
+	.set RA, REG_RA
+	.set RB, REG_RB
+	.set RC, REG_RC
+	.set RD, REG_RD
+	.set RE, REG_RE
+
+	.set RTA, REG_RTA
+	.set RTB, REG_RTB
+
+	.set T1, REG_T1
+.endm
+
+#define HASH_PTR	%r9
+#define BLOCKS_CTR	%r8
+#define BUFFER_PTR	%r10
+#define BUFFER_PTR2	%r13
+
+#define PRECALC_BUF	%r14
+#define WK_BUF		%r15
+
+#define W_TMP		%xmm0
+#define WY_TMP		%ymm0
+#define WY_TMP2		%ymm9
+
+# AVX2 variables
+#define WY0		%ymm3
+#define WY4		%ymm5
+#define WY08		%ymm7
+#define WY12		%ymm8
+#define WY16		%ymm12
+#define WY20		%ymm13
+#define WY24		%ymm14
+#define WY28		%ymm15
+
+#define YMM_SHUFB_BSWAP	%ymm10
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ *    - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE		(80*2*2 +16)
+
+#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
+
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+.macro PRECALC_RESET_WY
+	.set WY_00, WY0
+	.set WY_04, WY4
+	.set WY_08, WY08
+	.set WY_12, WY12
+	.set WY_16, WY16
+	.set WY_20, WY20
+	.set WY_24, WY24
+	.set WY_28, WY28
+	.set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+	/* Rotate macros */
+	.set WY_32, WY_28
+	.set WY_28, WY_24
+	.set WY_24, WY_20
+	.set WY_20, WY_16
+	.set WY_16, WY_12
+	.set WY_12, WY_08
+	.set WY_08, WY_04
+	.set WY_04, WY_00
+	.set WY_00, WY_32
+
+	/* Define register aliases */
+	.set WY, WY_00
+	.set WY_minus_04, WY_04
+	.set WY_minus_08, WY_08
+	.set WY_minus_12, WY_12
+	.set WY_minus_16, WY_16
+	.set WY_minus_20, WY_20
+	.set WY_minus_24, WY_24
+	.set WY_minus_28, WY_28
+	.set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+	.if (i == 0) # Initialize and rotate registers
+		PRECALC_RESET_WY
+		PRECALC_ROTATE_WY
+	.endif
+
+	/* message scheduling pre-compute for rounds 0-15 */
+	.if   ((i & 7) == 0)
+		/*
+		 * blended AVX2 and ALU instruction scheduling
+		 * 1 vector iteration per 8 rounds
+		 */
+		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
+	.elseif ((i & 7) == 1)
+		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
+			 WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+	.elseif ((i & 7) == 4)
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+	.elseif ((i & 7) == 7)
+		vmovdqu  WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_16_31
+	/*
+	 * message scheduling pre-compute for rounds 16-31
+	 * calculating last 32 w[i] values in 8 XMM registers
+	 * pre-calculate K+w[i] values and store to mem
+	 * for later load by ALU add instruction
+	 *
+	 * "brute force" vectorization for rounds 16-31 only
+	 * due to w[i]->w[i-3] dependency
+	 */
+	.if   ((i & 7) == 0)
+		/*
+		 * blended AVX2 and ALU instruction scheduling
+		 * 1 vector iteration per 8 rounds
+		 */
+		/* w[i-14] */
+		vpalignr	$8, WY_minus_16, WY_minus_12, WY
+		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
+	.elseif ((i & 7) == 1)
+		vpxor	WY_minus_08, WY, WY
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpxor	WY_TMP, WY, WY
+		vpslldq	$12, WY, WY_TMP2
+	.elseif ((i & 7) == 3)
+		vpslld	$1, WY, WY_TMP
+		vpsrld	$31, WY, WY
+	.elseif ((i & 7) == 4)
+		vpor	WY, WY_TMP, WY_TMP
+		vpslld	$2, WY_TMP2, WY
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY_TMP2, WY_TMP2
+		vpxor	WY, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 7)
+		vpxor	WY_TMP2, WY_TMP, WY
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_32_79
+	/*
+	 * in SHA-1 specification:
+	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+	 * instead we do equal:
+	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	 * allows more efficient vectorization
+	 * since w[i]=>w[i-3] dependency is broken
+	 */
+
+	.if   ((i & 7) == 0)
+	/*
+	 * blended AVX2 and ALU instruction scheduling
+	 * 1 vector iteration per 8 rounds
+	 */
+		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
+	.elseif ((i & 7) == 1)
+		/* W is W_minus_32 before xor */
+		vpxor	WY_minus_28, WY, WY
+	.elseif ((i & 7) == 2)
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 3)
+		vpxor	WY_TMP, WY, WY
+	.elseif ((i & 7) == 4)
+		vpslld	$2, WY, WY_TMP
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY, WY
+		vpor	WY, WY_TMP, WY
+	.elseif ((i & 7) == 7)
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC r, s
+	.set i, \r
+
+	.if (i < 40)
+		.set K_XMM, 32*0
+	.elseif (i < 80)
+		.set K_XMM, 32*1
+	.elseif (i < 120)
+		.set K_XMM, 32*2
+	.else
+		.set K_XMM, 32*3
+	.endif
+
+	.if (i<32)
+		PRECALC_00_15	\s
+	.elseif (i<64)
+		PRECALC_16_31	\s
+	.elseif (i < 160)
+		PRECALC_32_79	\s
+	.endif
+.endm
+
+.macro ROTATE_STATE
+	.set T_REG, E
+	.set E, D
+	.set D, C
+	.set C, B
+	.set B, TB
+	.set TB, A
+	.set A, T_REG
+
+	.set T_REG, RE
+	.set RE, RD
+	.set RD, RC
+	.set RC, RB
+	.set RB, RTB
+	.set RTB, RA
+	.set RA, T_REG
+.endm
+
+/* Macro relies on saved ROUND_Fx */
+
+.macro RND_FUN f, r
+	.if (\f == RND_F1)
+		ROUND_F1	\r
+	.elseif (\f == RND_F2)
+		ROUND_F2	\r
+	.elseif (\f == RND_F3)
+		ROUND_F3	\r
+	.endif
+.endm
+
+.macro RR r
+	.set round_id, (\r % 80)
+
+	.if (round_id == 0)        /* Precalculate F for first round */
+		.set ROUND_FUNC, RND_F1
+		mov	B, TB
+
+		rorx	$(32-30), B, B    /* b>>>2 */
+		andn	D, TB, T1
+		and	C, TB
+		xor	T1, TB
+	.endif
+
+	RND_FUN ROUND_FUNC, \r
+	ROTATE_STATE
+
+	.if   (round_id == 18)
+		.set ROUND_FUNC, RND_F2
+	.elseif (round_id == 38)
+		.set ROUND_FUNC, RND_F3
+	.elseif (round_id == 58)
+		.set ROUND_FUNC, RND_F2
+	.endif
+
+	.set round_id, ( (\r+1) % 80)
+
+	RND_FUN ROUND_FUNC, (\r+1)
+	ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+	add	WK(\r), E
+
+	andn	C, A, T1			/* ~b&d */
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	rorx	$(32-30),A, TB		/* b>>>2 for next round */
+
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	/*
+	 * Calculate F for the next round
+	 * (b & c) ^ andn[b, d]
+	 */
+	and	B, A			/* b&c */
+	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */
+
+	lea	(RE,RTA), E		/* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r
+	add	WK(\r), E
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	/* Calculate F for the next round */
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	.if ((round_id) < 79)
+		rorx	$(32-30), A, TB	/* b>>>2 for next round */
+	.endif
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	.if ((round_id) < 79)
+		xor	B, A
+	.endif
+
+	add	TA, E			/* E += A >>> 5 */
+
+	.if ((round_id) < 79)
+		xor	C, A
+	.endif
+.endm
+
+.macro ROUND_F3 r
+	add	WK(\r), E
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	mov	B, T1
+	or	A, T1
+
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	rorx	$(32-30), A, TB		/* b>>>2 for next round */
+
+	/* Calculate F for the next round
+	 * (b and c) or (d and (b or c))
+	 */
+	and	C, T1
+	and	B, A
+	or	T1, A
+
+	add	TA, E			/* E += A >>> 5 */
+
+.endm
+
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov     \a, RTA
+	add     $\d, RTA
+	cmp     $\c, \b
+	cmovge  RTA, \a
+.endm
+
+/*
+ * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+	REGALLOC
+
+	mov	(HASH_PTR), A
+	mov	4(HASH_PTR), B
+	mov	8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+	mov	%rsp, PRECALC_BUF
+	lea	(2*4*80+32)(%rsp), WK_BUF
+
+	# Precalc WK for first 2 blocks
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
+	.set i, 0
+	.rept    160
+		PRECALC i
+		.set i, i + 1
+	.endr
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+	xchg	WK_BUF, PRECALC_BUF
+
+	.align 32
+.L_loop:
+	/*
+	 * code loops through more than one block
+	 * we use K_BASE value as a signal of a last block,
+	 * it is set below by: cmovae BUFFER_PTR, K_BASE
+	 */
+	test BLOCKS_CTR, BLOCKS_CTR
+	jnz .L_begin
+	.align 32
+	jmp	.L_end
+	.align 32
+.L_begin:
+
+	/*
+	 * Do first block
+	 * rounds: 0,2,4,6,8
+	 */
+	.set j, 0
+	.rept 5
+		RR	j
+		.set j, j+2
+	.endr
+
+	/*
+	 * rounds:
+	 * 10,12,14,16,18
+	 * 20,22,24,26,28
+	 * 30,32,34,36,38
+	 * 40,42,44,46,48
+	 * 50,52,54,56,58
+	 */
+	.rept 25
+		RR	j
+		.set j, j+2
+	.endr
+
+	/* Update Counter */
+	sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
+	/*
+	 * rounds
+	 * 60,62,64,66,68
+	 * 70,72,74,76,78
+	 */
+	.rept 10
+		RR	j
+		.set j, j+2
+	.endr
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jz	.L_loop
+
+	mov	TB, B
+
+	/* Process second block */
+	/*
+	 * rounds
+	 *  0+80, 2+80, 4+80, 6+80, 8+80
+	 * 10+80,12+80,14+80,16+80,18+80
+	 */
+
+	.set j, 0
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	/*
+	 * rounds
+	 * 20+80,22+80,24+80,26+80,28+80
+	 * 30+80,32+80,34+80,36+80,38+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	/*
+	 * rounds
+	 * 40+80,42+80,44+80,46+80,48+80
+	 * 50+80,52+80,54+80,56+80,58+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	/* update counter */
+	sub     $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+
+	/*
+	 * rounds
+	 * 60+80,62+80,64+80,66+80,68+80
+	 * 70+80,72+80,74+80,76+80,78+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	/* Reset state for AVX2 reg permutation */
+	mov	A, TA
+	mov	TB, A
+	mov	C, TB
+	mov	E, C
+	mov	D, B
+	mov	TA, D
+
+	REGALLOC
+
+	xchg	WK_BUF, PRECALC_BUF
+
+	jmp	.L_loop
+
+	.align 32
+.L_end:
+
+.endm
+/*
+ * macro implements SHA-1 function's body for several 64-byte blocks
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM  name
+	SYM_FUNC_START(\name)
+
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	RESERVE_STACK  = (W_SIZE*4 + 8+24)
+
+	/* Align stack */
+	push	%rbp
+	mov	%rsp, %rbp
+	and	$~(0x20-1), %rsp
+	sub	$RESERVE_STACK, %rsp
+
+	avx2_zeroupper
+
+	/* Setup initial values */
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+
+	mov	BUF, BUFFER_PTR2
+	mov	CNT, BLOCKS_CTR
+
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	avx2_zeroupper
+
+	mov	%rbp, %rsp
+	pop	%rbp
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+
+	RET
+
+	SYM_FUNC_END(\name)
+.endm
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+.text
+
+SHA1_VECTOR_ASM     sha1_transform_avx2
diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S
new file mode 100644
index 000000000000..3989b0642ff5
--- /dev/null
+++ b/lib/crypto/x86/sha1-ni-asm.S
@@ -0,0 +1,299 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * 	Sean Gulley <sean.m.gulley@intel.com>
+ * 	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 	* Redistributions of source code must retain the above copyright
+ * 	  notice, this list of conditions and the following disclaimer.
+ * 	* Redistributions in binary form must reproduce the above copyright
+ * 	  notice, this list of conditions and the following disclaimer in
+ * 	  the documentation and/or other materials provided with the
+ * 	  distribution.
+ * 	* Neither the name of Intel Corporation nor the names of its
+ * 	  contributors may be used to endorse or promote products derived
+ * 	  from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR	%rdi	/* 1st arg */
+#define DATA_PTR	%rsi	/* 2nd arg */
+#define NUM_BLKS	%rdx	/* 3rd arg */
+
+/* gcc conversion */
+#define FRAME_SIZE	32	/* space for 2x16 bytes */
+
+#define ABCD		%xmm0
+#define E0		%xmm1	/* Need two E's b/c they ping pong */
+#define E1		%xmm2
+#define MSG0		%xmm3
+#define MSG1		%xmm4
+#define MSG2		%xmm5
+#define MSG3		%xmm6
+#define SHUF_MASK	%xmm7
+
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 block function
+ *
+ * This function takes a pointer to the current SHA-1 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process.  Once all blocks
+ * have been processed, the state is updated with the new state.  This function
+ * only processes complete blocks.  State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha1_ni_transform(struct sha1_block_state *state,
+ *			  const u8 *data, size_t nblocks)
+ */
+.text
+SYM_FUNC_START(sha1_ni_transform)
+	push		%rbp
+	mov		%rsp, %rbp
+	sub		$FRAME_SIZE, %rsp
+	and		$~0xF, %rsp
+
+	shl		$6, NUM_BLKS		/* convert to bytes */
+	jz		.Ldone_hash
+	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
+
+	/* load initial hash values */
+	pinsrd		$3, 1*16(DIGEST_PTR), E0
+	movdqu		0*16(DIGEST_PTR), ABCD
+	pand		UPPER_WORD_MASK(%rip), E0
+	pshufd		$0x1B, ABCD, ABCD
+
+	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lloop0:
+	/* Save hash values for addition after rounds */
+	movdqa		E0, (0*16)(%rsp)
+	movdqa		ABCD, (1*16)(%rsp)
+
+	/* Rounds 0-3 */
+	movdqu		0*16(DATA_PTR), MSG0
+	pshufb		SHUF_MASK, MSG0
+		paddd		MSG0, E0
+		movdqa		ABCD, E1
+		sha1rnds4	$0, E0, ABCD
+
+	/* Rounds 4-7 */
+	movdqu		1*16(DATA_PTR), MSG1
+	pshufb		SHUF_MASK, MSG1
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+		sha1rnds4	$0, E1, ABCD
+	sha1msg1	MSG1, MSG0
+
+	/* Rounds 8-11 */
+	movdqu		2*16(DATA_PTR), MSG2
+	pshufb		SHUF_MASK, MSG2
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+		sha1rnds4	$0, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 12-15 */
+	movdqu		3*16(DATA_PTR), MSG3
+	pshufb		SHUF_MASK, MSG3
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$0, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 16-19 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$0, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 20-23 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	pxor		MSG1, MSG3
+
+	/* Rounds 24-27 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$1, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 28-31 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 32-35 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$1, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 36-39 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	pxor		MSG1, MSG3
+
+	/* Rounds 40-43 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 44-47 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$2, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 48-51 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 52-55 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$2, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	pxor		MSG1, MSG3
+
+	/* Rounds 56-59 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 60-63 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$3, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 64-67 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$3, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 68-71 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$3, E1, ABCD
+	pxor		MSG1, MSG3
+
+	/* Rounds 72-75 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$3, E0, ABCD
+
+	/* Rounds 76-79 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+		sha1rnds4	$3, E1, ABCD
+
+	/* Add current hash values with previously saved */
+	sha1nexte	(0*16)(%rsp), E0
+	paddd		(1*16)(%rsp), ABCD
+
+	/* Increment data pointer and loop if more to process */
+	add		$64, DATA_PTR
+	cmp		NUM_BLKS, DATA_PTR
+	jne		.Lloop0
+
+	/* Write hash values back in the correct order */
+	pshufd		$0x1B, ABCD, ABCD
+	movdqu		ABCD, 0*16(DIGEST_PTR)
+	pextrd		$3, E0, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+	mov		%rbp, %rsp
+	pop		%rbp
+
+	RET
+SYM_FUNC_END(sha1_ni_transform)
+
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x000102030405060708090a0b0c0d0e0f
+
+.section	.rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
+.align 16
+UPPER_WORD_MASK:
+	.octa 0xFFFFFFFF000000000000000000000000
diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S
new file mode 100644
index 000000000000..78b48cb09c5b
--- /dev/null
+++ b/lib/crypto/x86/sha1-ssse3-and-avx.S
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ *    http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ *            Ronen Zohar <ronen.zohar@intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ *   Author: Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <linux/linkage.h>
+
+#define CTX	%rdi	// arg1
+#define BUF	%rsi	// arg2
+#define CNT	%rdx	// arg3
+
+#define REG_A	%ecx
+#define REG_B	%esi
+#define REG_C	%edi
+#define REG_D	%r12d
+#define REG_E	%edx
+
+#define REG_T1	%eax
+#define REG_T2	%ebx
+
+#define K_BASE		%r8
+#define HASH_PTR	%r9
+#define BUFFER_PTR	%r10
+#define BUFFER_END	%r11
+
+#define W_TMP1	%xmm0
+#define W_TMP2	%xmm9
+
+#define W0	%xmm1
+#define W4	%xmm2
+#define W8	%xmm3
+#define W12	%xmm4
+#define W16	%xmm5
+#define W20	%xmm6
+#define W24	%xmm7
+#define W28	%xmm8
+
+#define XMM_SHUFB_BSWAP	%xmm10
+
+/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
+#define WK(t)	(((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD	16
+
+/*
+ * This macro implements the SHA-1 function's body for single 64-byte block
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM  name
+	SYM_FUNC_START(\name)
+
+	push	%rbx
+	push	%r12
+	push	%rbp
+	mov	%rsp, %rbp
+
+	sub	$64, %rsp		# allocate workspace
+	and	$~15, %rsp		# align stack
+
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+
+	shl	$6, CNT			# multiply by 64
+	add	BUF, CNT
+	mov	CNT, BUFFER_END
+
+	lea	K_XMM_AR(%rip), K_BASE
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	# cleanup workspace
+	mov	$8, %ecx
+	mov	%rsp, %rdi
+	xor	%eax, %eax
+	rep stosq
+
+	mov	%rbp, %rsp		# deallocate workspace
+	pop	%rbp
+	pop	%r12
+	pop	%rbx
+	RET
+
+	SYM_FUNC_END(\name)
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for one 64-byte block
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+	INIT_REGALLOC
+
+	mov	  (HASH_PTR), A
+	mov	 4(HASH_PTR), B
+	mov	 8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+  .set i, 0
+  .rept W_PRECALC_AHEAD
+	W_PRECALC i
+    .set i, (i+1)
+  .endr
+
+.align 4
+1:
+	RR F1,A,B,C,D,E,0
+	RR F1,D,E,A,B,C,2
+	RR F1,B,C,D,E,A,4
+	RR F1,E,A,B,C,D,6
+	RR F1,C,D,E,A,B,8
+
+	RR F1,A,B,C,D,E,10
+	RR F1,D,E,A,B,C,12
+	RR F1,B,C,D,E,A,14
+	RR F1,E,A,B,C,D,16
+	RR F1,C,D,E,A,B,18
+
+	RR F2,A,B,C,D,E,20
+	RR F2,D,E,A,B,C,22
+	RR F2,B,C,D,E,A,24
+	RR F2,E,A,B,C,D,26
+	RR F2,C,D,E,A,B,28
+
+	RR F2,A,B,C,D,E,30
+	RR F2,D,E,A,B,C,32
+	RR F2,B,C,D,E,A,34
+	RR F2,E,A,B,C,D,36
+	RR F2,C,D,E,A,B,38
+
+	RR F3,A,B,C,D,E,40
+	RR F3,D,E,A,B,C,42
+	RR F3,B,C,D,E,A,44
+	RR F3,E,A,B,C,D,46
+	RR F3,C,D,E,A,B,48
+
+	RR F3,A,B,C,D,E,50
+	RR F3,D,E,A,B,C,52
+	RR F3,B,C,D,E,A,54
+	RR F3,E,A,B,C,D,56
+	RR F3,C,D,E,A,B,58
+
+	add	$64, BUFFER_PTR		# move to the next 64-byte block
+	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
+	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
+
+	RR F4,A,B,C,D,E,60
+	RR F4,D,E,A,B,C,62
+	RR F4,B,C,D,E,A,64
+	RR F4,E,A,B,C,D,66
+	RR F4,C,D,E,A,B,68
+
+	RR F4,A,B,C,D,E,70
+	RR F4,D,E,A,B,C,72
+	RR F4,B,C,D,E,A,74
+	RR F4,E,A,B,C,D,76
+	RR F4,C,D,E,A,B,78
+
+	UPDATE_HASH   (HASH_PTR), A
+	UPDATE_HASH  4(HASH_PTR), B
+	UPDATE_HASH  8(HASH_PTR), C
+	UPDATE_HASH 12(HASH_PTR), D
+	UPDATE_HASH 16(HASH_PTR), E
+
+	RESTORE_RENAMED_REGS
+	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
+	jne	1b
+.endm
+
+.macro INIT_REGALLOC
+  .set A, REG_A
+  .set B, REG_B
+  .set C, REG_C
+  .set D, REG_D
+  .set E, REG_E
+  .set T1, REG_T1
+  .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+	# order is important (REG_C is where it should be)
+	mov	B, REG_B
+	mov	D, REG_D
+	mov	A, REG_A
+	mov	E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES  a, b
+  .set _T, \a
+  .set \a, \b
+  .set \b, _T
+.endm
+
+.macro F1  b, c, d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	xor	\d, T1
+	and	\b, T1
+	xor	\d, T1
+.endm
+
+.macro F2  b, c, d
+	mov	\d, T1
+	SWAP_REG_NAMES \d, T1
+	xor	\c, T1
+	xor	\b, T1
+.endm
+
+.macro F3  b, c ,d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	mov	\b, T2
+	or	\b, T1
+	and	\c, T2
+	and	\d, T1
+	or	T2, T1
+.endm
+
+.macro F4  b, c, d
+	F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ *   t1 = F(b, c, d);   e += w(i)
+ *   e += t1;           b <<= 30;   d  += w(i+1);
+ *   t1 = F(a, b, c);
+ *   d += t1;           a <<= 5;
+ *   e += a;
+ *   t1 = e;            a >>= 7;
+ *   t1 <<= 5;
+ *   d += t1;
+ */
+.macro RR  F, a, b, c, d, e, round
+	add	WK(\round), \e
+	\F   \b, \c, \d		# t1 = F(b, c, d);
+	W_PRECALC (\round + W_PRECALC_AHEAD)
+	rol	$30, \b
+	add	T1, \e
+	add	WK(\round + 1), \d
+
+	\F   \a, \b, \c
+	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+	rol	$5, \a
+	add	\a, \e
+	add	T1, \d
+	ror	$7, \a		# (a <<r 5) >>r 7) => a <<r 30)
+
+	mov	\e, T1
+	SWAP_REG_NAMES \e, T1
+
+	rol	$5, T1
+	add	T1, \d
+
+	# write:  \a, \b
+	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC  r
+  .set i, \r
+
+  .if (i < 20)
+    .set K_XMM, 0
+  .elseif (i < 40)
+    .set K_XMM, 16
+  .elseif (i < 60)
+    .set K_XMM, 32
+  .elseif (i < 80)
+    .set K_XMM, 48
+  .endif
+
+  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+    .set i, ((\r) % 80)	    # pre-compute for the next iteration
+    .if (i == 0)
+	W_PRECALC_RESET
+    .endif
+	W_PRECALC_00_15
+  .elseif (i<32)
+	W_PRECALC_16_31
+  .elseif (i < 80)   // rounds 32-79
+	W_PRECALC_32_79
+  .endif
+.endm
+
+.macro W_PRECALC_RESET
+  .set W,          W0
+  .set W_minus_04, W4
+  .set W_minus_08, W8
+  .set W_minus_12, W12
+  .set W_minus_16, W16
+  .set W_minus_20, W20
+  .set W_minus_24, W24
+  .set W_minus_28, W28
+  .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+  .set W_minus_32, W_minus_28
+  .set W_minus_28, W_minus_24
+  .set W_minus_24, W_minus_20
+  .set W_minus_20, W_minus_16
+  .set W_minus_16, W_minus_12
+  .set W_minus_12, W_minus_08
+  .set W_minus_08, W_minus_04
+  .set W_minus_04, W
+  .set W,          W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+	W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+	W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+	W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+  .if ((i & 3) == 0)
+	movdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	pshufb	XMM_SHUFB_BSWAP, W_TMP1
+	movdqa	W_TMP1, W
+  .elseif ((i & 3) == 2)
+	paddd	(K_BASE), W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa  W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculating last 32 w[i] values in 8 XMM registers
+ * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
+ *   instruction
+ *
+ * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
+ * dependency, but improves for 32-79
+ */
+.macro W_PRECALC_16_31_SSSE3
+  # blended scheduling of vector and scalar instruction streams, one 4-wide
+  # vector iteration / 4 scalar rounds
+  .if ((i & 3) == 0)
+	movdqa	W_minus_12, W
+	palignr	$8, W_minus_16, W	# w[i-14]
+	movdqa	W_minus_04, W_TMP1
+	psrldq	$4, W_TMP1		# w[i-3]
+	pxor	W_minus_08, W
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W_TMP1
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP2
+	movdqa	W, W_TMP1
+	pslldq	$12, W_TMP2
+  .elseif ((i & 3) == 2)
+	psrld	$31, W
+	pslld	$1, W_TMP1
+	por	W, W_TMP1
+	movdqa	W_TMP2, W
+	psrld	$30, W_TMP2
+	pslld	$2, W
+  .elseif ((i & 3) == 3)
+	pxor	W, W_TMP1
+	pxor	W_TMP2, W_TMP1
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
+ */
+.macro W_PRECALC_32_79_SSSE3
+  .if ((i & 3) == 0)
+	movdqa	W_minus_04, W_TMP1
+	pxor	W_minus_28, W		# W is W_minus_32 before xor
+	palignr	$8, W_minus_08, W_TMP1
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP1
+  .elseif ((i & 3) == 2)
+	psrld	$30, W
+	pslld	$2, W_TMP1
+	por	W, W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm		// W_PRECALC_SSSE3
+
+
+#define K1	0x5a827999
+#define K2	0x6ed9eba1
+#define K3	0x8f1bbcdc
+#define K4	0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+	movdqu	\a,\b
+.endm
+
+/*
+ * SSSE3 optimized implementation:
+ *
+ * void sha1_transform_ssse3(struct sha1_block_state *state,
+ *			     const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM     sha1_transform_ssse3
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro  W_PRECALC_00_15
+    W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro  W_PRECALC_16_31
+    W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro  W_PRECALC_32_79
+    W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+  .if ((i & 3) == 0)
+	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
+  .elseif ((i & 3) == 2)
+	vpaddd	(K_BASE), W, W_TMP1
+  .elseif ((i & 3) == 3)
+	vmovdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
+	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
+	vpxor	W_minus_08, W, W
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+  .elseif ((i & 3) == 1)
+	vpxor	W_TMP1, W, W
+	vpslldq	$12, W, W_TMP2
+	vpslld	$1, W, W_TMP1
+  .elseif ((i & 3) == 2)
+	vpsrld	$31, W, W
+	vpor	W, W_TMP1, W_TMP1
+	vpslld	$2, W_TMP2, W
+	vpsrld	$30, W_TMP2, W_TMP2
+  .elseif ((i & 3) == 3)
+	vpxor	W, W_TMP1, W_TMP1
+	vpxor	W_TMP2, W_TMP1, W
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
+  .elseif ((i & 3) == 1)
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+	vpxor	W_TMP1, W, W
+  .elseif ((i & 3) == 2)
+	vpslld	$2, W, W_TMP1
+	vpsrld	$30, W, W
+	vpor	W, W_TMP1, W
+  .elseif ((i & 3) == 3)
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm    // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+	vmovdqu	\a,\b
+.endm
+
+
+/* AVX optimized implementation:
+ * void sha1_transform_avx(struct sha1_block_state *state,
+ *			   const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM     sha1_transform_avx
diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h
new file mode 100644
index 000000000000..e308379d89bc
--- /dev/null
+++ b/lib/crypto/x86/sha1.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic);
+
+#define DEFINE_X86_SHA1_FN(c_fn, asm_fn)                           \
+	asmlinkage void asm_fn(struct sha1_block_state *state,     \
+			       const u8 *data, size_t nblocks);    \
+	static void c_fn(struct sha1_block_state *state,           \
+			 const u8 *data, size_t nblocks)           \
+	{                                                          \
+		if (likely(irq_fpu_usable())) {                    \
+			kernel_fpu_begin();                        \
+			asm_fn(state, data, nblocks);              \
+			kernel_fpu_end();                          \
+		} else {                                           \
+			sha1_blocks_generic(state, data, nblocks); \
+		}                                                  \
+	}
+
+DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3);
+DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx);
+DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform);
+
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
+
+asmlinkage void sha1_transform_avx2(struct sha1_block_state *state,
+				    const u8 *data, size_t nblocks);
+static void sha1_blocks_avx2(struct sha1_block_state *state,
+			     const u8 *data, size_t nblocks)
+{
+	if (likely(irq_fpu_usable())) {
+		kernel_fpu_begin();
+		/* Select the optimal transform based on the number of blocks */
+		if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE)
+			sha1_transform_avx2(state, data, nblocks);
+		else
+			sha1_transform_avx(state, data, nblocks);
+		kernel_fpu_end();
+	} else {
+		sha1_blocks_generic(state, data, nblocks);
+	}
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	static_call(sha1_blocks_x86)(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static inline void sha1_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+		static_call_update(sha1_blocks_x86, sha1_blocks_ni);
+	} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				     NULL) &&
+		   boot_cpu_has(X86_FEATURE_AVX)) {
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_BMI1) &&
+		    boot_cpu_has(X86_FEATURE_BMI2))
+			static_call_update(sha1_blocks_x86, sha1_blocks_avx2);
+		else
+			static_call_update(sha1_blocks_x86, sha1_blocks_avx);
+	} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+		static_call_update(sha1_blocks_x86, sha1_blocks_ssse3);
+	}
+}