 arch/x86/crypto/sha1_avx2_x86_64_asm.S | 67 ++++++++++++++++++--------------
 arch/x86/crypto/sha1_ssse3_glue.c      |  2 +-
 2 files changed, 37 insertions(+), 32 deletions(-)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792db15ef..1eab79c9ac48 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
 	.set T1, REG_T1
 .endm
 
-#define K_BASE		%r8
 #define HASH_PTR	%r9
+#define BLOCKS_CTR	%r8
 #define BUFFER_PTR	%r10
 #define BUFFER_PTR2	%r13
-#define BUFFER_END	%r11
 
 #define PRECALC_BUF	%r14
 #define WK_BUF		%r15
@@ -205,14 +204,14 @@
  * blended AVX2 and ALU instruction scheduling
  * 1 vector iteration per 8 rounds
  */
-	vmovdqu	((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+	vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
 .elseif ((i & 7) == 1)
-	vinsertf128	$1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+	vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2),\
 		 WY_TMP, WY_TMP
 .elseif ((i & 7) == 2)
 	vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
 .elseif ((i & 7) == 4)
-	vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 .elseif ((i & 7) == 7)
 	vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
@@ -255,7 +254,7 @@
 	vpxor	WY, WY_TMP, WY_TMP
 .elseif ((i & 7) == 7)
 	vpxor	WY_TMP2, WY_TMP, WY
-	vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 	PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
 	vpsrld	$30, WY, WY
 	vpor	WY, WY_TMP, WY
 .elseif ((i & 7) == 7)
-	vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 	PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@
 
 .endm
 
+/* Add a constant to \a only if \b >= \c (uses RTA as a temp):
+ *   \a += (\b >= \c) ? \d : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov	\a, RTA
+	add	$\d, RTA
+	cmp	$\c, \b
+	cmovge	RTA, \a
+.endm
+
 /*
  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
  */
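The ADD_IF_GE macro added above is the core of this fix: it advances a pointer by \d bytes only while the block counter \b still holds at least \c blocks, and it uses cmovge so the hot loop stays branch-free. A minimal C model of its semantics (hypothetical helper, not kernel code; int64_t matches cmovge's signed compare):

#include <stdint.h>

/* Model of ADD_IF_GE: compute a + d unconditionally (mov/add into RTA),
 * then commit it only when b >= c (cmp/cmovge). */
static inline int64_t add_if_ge(int64_t a, int64_t b, int64_t c, int64_t d)
{
	int64_t rta = a + d;		/* mov \a, RTA; add $\d, RTA */
	return (b >= c) ? rta : a;	/* cmp $\c, \b; cmovge RTA, \a */
}
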
@@ -463,13 +472,16 @@
 	lea	(2*4*80+32)(%rsp), WK_BUF
 
 	# Precalc WK for first 2 blocks
-	PRECALC_OFFSET = 0
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
 	.set i, 0
 	.rept 160
 		PRECALC i
 		.set i, i + 1
 	.endr
-	PRECALC_OFFSET = 128
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 	xchg	WK_BUF, PRECALC_BUF
 
 	.align 32
@@ -479,8 +491,8 @@ _loop:
 	 * we use K_BASE value as a signal of a last block,
 	 * it is set below by: cmovae BUFFER_PTR, K_BASE
 	 */
-	cmp	K_BASE, BUFFER_PTR
-	jne	_begin
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jnz	_begin
 	.align 32
 	jmp	_end
 	.align 32
@@ -512,10 +524,10 @@ _loop0:
 	.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR	/* move to next odd-64-byte block */
-	cmp	BUFFER_END, BUFFER_PTR	/* is current block the last one? */
-	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */
-
+	/* Update counter */
+	sub	$1, BLOCKS_CTR
+	/* Move to the next block only if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
 	/*
 	 * rounds
 	 * 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
 	UPDATE_HASH	12(HASH_PTR), D
 	UPDATE_HASH	16(HASH_PTR), E
 
-	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
-	je	_loop
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jz	_loop
 
 	mov	TB, B
 
@@ -575,10 +587,10 @@ _loop2:
 	.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR2	/* move to next even-64-byte block */
-
-	cmp	BUFFER_END, BUFFER_PTR2	/* is current block the last one */
-	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */
-
+	/* Update counter */
+	sub	$1, BLOCKS_CTR
+	/* Move to the next block only if needed */
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 
 	jmp	_loop3
 _loop3:
@@ -641,19 +653,12 @@ _loop3:
 
 	avx2_zeroupper
 
-	lea	K_XMM_AR(%rip), K_BASE
-
+	/* Set up initial values */
 	mov	CTX, HASH_PTR
 	mov	BUF, BUFFER_PTR
-	lea	64(BUF), BUFFER_PTR2
-
-	shl	$6, CNT			/* mul by 64 */
-	add	BUF, CNT
-	add	$64, CNT
-	mov	CNT, BUFFER_END
 
-	cmp	BUFFER_END, BUFFER_PTR2
-	cmovae	K_BASE, BUFFER_PTR2
+	mov	BUF, BUFFER_PTR2
+	mov	CNT, BLOCKS_CTR
 
 	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
 
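Taken together, the assembly changes replace the BUFFER_END sentinel (whose clamp on the wrong pointer let the tail reads run past the caller's data) with a BLOCKS_CTR countdown: a pointer only advances while enough blocks remain. A rough bounds model of the new bookkeeping (illustration only, assuming the thresholds visible in the diff; the real code software-pipelines the two pointers across several loops):

#include <assert.h>
#include <stdint.h>

/* Offsets stand in for BUFFER_PTR/BUFFER_PTR2; the asserts check that
 * every 64-byte block read stays inside the caller's blocks*64 bytes. */
static void check_bounds(int64_t blocks)
{
	int64_t ptr = 0, ptr2 = 0;
	const int64_t end = blocks * 64;

	if (blocks >= 2)	/* ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 */
		ptr2 += 64;
	if (blocks >= 3)	/* ADD_IF_GE BUFFER_PTR,  BLOCKS_CTR, 3, 128 */
		ptr += 128;
	if (blocks >= 4)	/* ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 */
		ptr2 += 128;

	while (blocks) {	/* test BLOCKS_CTR, BLOCKS_CTR; jz _end */
		assert(ptr + 64 <= end && ptr2 + 64 <= end);
		blocks--;	/* sub $1, BLOCKS_CTR (first pipeline half) */
		if (blocks >= 4)
			ptr += 128;
		if (!blocks)
			break;
		blocks--;	/* sub $1, BLOCKS_CTR (second half) */
		if (blocks >= 4)
			ptr2 += 128;
	}
}

int main(void)
{
	for (int64_t n = 1; n <= 8; n++)
		check_bounds(n);
	return 0;
}
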
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7de207a11014..dd14616b7739 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
 
 static bool avx2_usable(void)
 {
-	if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
 		&& boot_cpu_has(X86_FEATURE_BMI1)
 		&& boot_cpu_has(X86_FEATURE_BMI2))
 		return true;
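
With the out-of-bounds reads fixed, the glue code drops the "false &&" kill switch, so avx2_usable() can again return true on CPUs with AVX2, BMI1 and BMI2. For context, a sketch of how such a predicate is typically consumed (the dispatcher name and the 4-block cutoff are assumptions for illustration, not quoted from sha1_ssse3_glue.c):

#include <stdbool.h>
#include <stdint.h>

static void sha1_transform_avx(uint32_t *d, const char *s, int b)
{ (void)d; (void)s; (void)b; /* stub for the real AVX transform */ }

static void sha1_transform_avx2(uint32_t *d, const char *s, int b)
{ (void)d; (void)s; (void)b; /* stub for the real AVX2 transform */ }

/* Hypothetical dispatcher: even when AVX2 is usable, short inputs may
 * not amortize its setup cost, so inputs under an assumed 4-block
 * (256-byte) threshold stay on the AVX path. */
static void sha1_apply_transform(uint32_t *digest, const char *data,
				 int blocks, bool have_avx2)
{
	if (have_avx2 && blocks >= 4)
		sha1_transform_avx2(digest, data, blocks);
	else
		sha1_transform_avx(digest, data, blocks);
}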