-rw-r--r--	arch/x86/crypto/sha1_avx2_x86_64_asm.S	67
-rw-r--r--	arch/x86/crypto/sha1_ssse3_glue.c	2
2 files changed, 37 insertions, 32 deletions
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792db15ef..1eab79c9ac48 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
 	.set T1, REG_T1
 .endm
 
-#define K_BASE		%r8
 #define HASH_PTR	%r9
+#define BLOCKS_CTR	%r8
 #define BUFFER_PTR	%r10
 #define BUFFER_PTR2	%r13
-#define BUFFER_END	%r11
 
 #define PRECALC_BUF	%r14
 #define WK_BUF		%r15
@@ -205,14 +204,14 @@
 		 * blended AVX2 and ALU instruction scheduling
 		 * 1 vector iteration per 8 rounds
 		 */
-		vmovdqu	((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
 	.elseif ((i & 7) == 1)
-		vinsertf128	$1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+		vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2),\
 			 WY_TMP, WY_TMP
 	.elseif ((i & 7) == 2)
 		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
 	.elseif ((i & 7) == 4)
-		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	.elseif ((i & 7) == 7)
 		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
@@ -255,7 +254,7 @@
 		vpxor	WY, WY_TMP, WY_TMP
 	.elseif ((i & 7) == 7)
 		vpxor	WY_TMP2, WY_TMP, WY
-		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
 		vpsrld	$30, WY, WY
 		vpor	WY, WY_TMP, WY
 	.elseif ((i & 7) == 7)
-		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@
 
 .endm
 
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov	\a, RTA
+	add	$\d, RTA
+	cmp	$\c, \b
+	cmovge	RTA, \a
+.endm
+
 /*
  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
  */
@@ -463,13 +472,16 @@
 	lea	(2*4*80+32)(%rsp), WK_BUF
 
 	# Precalc WK for first 2 blocks
-	PRECALC_OFFSET = 0
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
 	.set i, 0
 	.rept    160
 		PRECALC i
 		.set i, i + 1
 	.endr
-	PRECALC_OFFSET = 128
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 	xchg	WK_BUF, PRECALC_BUF
 
 	.align 32
@@ -479,8 +491,8 @@ _loop:
 	 * we use K_BASE value as a signal of a last block,
 	 * it is set below by: cmovae BUFFER_PTR, K_BASE
 	 */
-	cmp	K_BASE, BUFFER_PTR
-	jne	_begin
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jnz	_begin
 	.align 32
 	jmp	_end
 	.align 32
@@ -512,10 +524,10 @@ _loop0:
 		.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR   /* move to next odd-64-byte block */
-	cmp	BUFFER_END, BUFFER_PTR /* is current block the last one? */
-	cmovae	K_BASE, BUFFER_PTR    /* signal the last iteration smartly */
-
+	/* Update Counter */
+	sub	$1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
 	/*
 	 * rounds
 	 * 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
 	UPDATE_HASH	12(HASH_PTR), D
 	UPDATE_HASH	16(HASH_PTR), E
 
-	cmp	K_BASE, BUFFER_PTR /* is current block the last one? */
-	je	_loop
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jz	_loop
 
 	mov	TB, B
 
@@ -575,10 +587,10 @@ _loop2:
 		.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR2      /* move to next even-64-byte block */
-
-	cmp	BUFFER_END, BUFFER_PTR2   /* is current block the last one */
-	cmovae	K_BASE, BUFFER_PTR       /* signal the last iteration smartly */
+	/* update counter */
+	sub	$1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 
 	jmp	_loop3
 _loop3:
@@ -641,19 +653,12 @@ _loop3:
 
 	avx2_zeroupper
 
-	lea	K_XMM_AR(%rip), K_BASE
-
+	/* Setup initial values */
 	mov	CTX, HASH_PTR
 	mov	BUF, BUFFER_PTR
-	lea	64(BUF), BUFFER_PTR2
-
-	shl	$6, CNT /* mul by 64 */
-	add	BUF, CNT
-	add	$64, CNT
-	mov	CNT, BUFFER_END
 
-	cmp	BUFFER_END, BUFFER_PTR2
-	cmovae	K_BASE, BUFFER_PTR2
+	mov	BUF, BUFFER_PTR2
+	mov	CNT, BLOCKS_CTR
 
 	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
 
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7de207a11014..dd14616b7739 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
 
 static bool avx2_usable(void)
 {
-	if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
 		&& boot_cpu_has(X86_FEATURE_BMI1)
 		&& boot_cpu_has(X86_FEATURE_BMI2))
 		return true;
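
Note on the new ADD_IF_GE helper: the macro advances a buffer pointer by a fixed step only while enough blocks remain in BLOCKS_CTR, which is what lets the patch drop the old BUFFER_END/K_BASE sentinel and stop reading past the last block passed to sha1_transform_avx2(). Below is a minimal C model of that behaviour, not part of the patch; the function name add_if_ge and the standalone main() are illustrative only.

/* Sketch of the macro semantics: ADD_IF_GE a, b, c, d  =>  if (b >= c) a += d;
 * the assembly does this branchlessly with mov/add/cmp/cmovge.
 */
#include <stdio.h>
#include <stdint.h>

static const uint8_t *add_if_ge(const uint8_t *ptr, long blocks_ctr,
				long threshold, long step)
{
	const uint8_t *advanced = ptr + step;	/* mov \a, RTA ; add $\d, RTA */
	return blocks_ctr >= threshold ? advanced : ptr;	/* cmp $\c, \b ; cmovge RTA, \a */
}

int main(void)
{
	uint8_t buf[4 * 64] = { 0 };	/* four 64-byte SHA-1 blocks */
	const uint8_t *p2 = buf;

	/* "Precalc WK for first 2 blocks": BUFFER_PTR2 only moves to the
	 * second block when at least two blocks were actually passed in.
	 */
	p2 = add_if_ge(p2, /* BLOCKS_CTR */ 3, 2, 64);
	printf("BUFFER_PTR2 offset: %td\n", p2 - buf);	/* prints 64 */
	return 0;
}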