| /* |
| * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental |
| * SSE3 instruction set extensions introduced in Intel Core Microarchitecture |
| * processors. CPUs supporting Intel(R) AVX extensions will get an additional |
| * boost. |
| * |
| * This work was inspired by the vectorized implementation of Dean Gaudet. |
| * Additional information on it can be found at: |
| * http://www.arctic.org/~dean/crypto/sha1.html |
| * |
| * It was improved upon with more efficient vectorization of the message |
| * scheduling. This implementation has also been optimized for all current and |
| * several future generations of Intel CPUs. |
| * |
| * See this article for more information about the implementation details: |
| * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ |
| * |
| * Copyright (C) 2010, Intel Corp. |
| * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> |
| * Ronen Zohar <ronen.zohar@intel.com> |
| * |
| * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: |
| * Author: Mathias Krause <minipli@googlemail.com> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2 of the License, or |
| * (at your option) any later version. |
| */ |
| |
| #include <linux/linkage.h> |
| |
| #define CTX %rdi // arg1 |
| #define BUF %rsi // arg2 |
| #define CNT %rdx // arg3 |
| |
| #define REG_A %ecx |
| #define REG_B %esi |
| #define REG_C %edi |
| #define REG_D %r12d |
| #define REG_E %edx |
| |
| #define REG_T1 %eax |
| #define REG_T2 %ebx |
| |
| #define K_BASE %r8 |
| #define HASH_PTR %r9 |
| #define BUFFER_PTR %r10 |
| #define BUFFER_END %r11 |
| |
| #define W_TMP1 %xmm0 |
| #define W_TMP2 %xmm9 |
| |
| #define W0 %xmm1 |
| #define W4 %xmm2 |
| #define W8 %xmm3 |
| #define W12 %xmm4 |
| #define W16 %xmm5 |
| #define W20 %xmm6 |
| #define W24 %xmm7 |
| #define W28 %xmm8 |
| |
| #define XMM_SHUFB_BSWAP %xmm10 |
| |
/* we keep a 16-entry (64-byte) window of pre-calculated w[i]+K values in a
 * circular buffer on the stack */
| #define WK(t) (((t) & 15) * 4)(%rsp) |
| #define W_PRECALC_AHEAD 16 |
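
/*
 * Illustrative C model of the WK() window (not part of the build; the
 * names below are made up for illustration):
 *
 *	u32 wk[16];			// the 64 bytes reserved on the stack
 *	#define WK_SLOT(t)	wk[(t) & 15]
 *
 * Round t reads WK_SLOT(t) while the pre-calc code, running
 * W_PRECALC_AHEAD rounds ahead, refills the same slot for round t + 16
 * shortly after it has been consumed.
 */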
| |
| /* |
 * This macro emits the body of a SHA-1 transform function that processes
 * the input one 64-byte block at a time
| * param: function's name |
| */ |
| .macro SHA1_VECTOR_ASM name |
| ENTRY(\name) |
| |
| push %rbx |
| push %r12 |
| push %rbp |
| mov %rsp, %rbp |
| |
| sub $64, %rsp # allocate workspace |
| and $~15, %rsp # align stack |
| |
| mov CTX, HASH_PTR |
| mov BUF, BUFFER_PTR |
| |
| shl $6, CNT # multiply by 64 |
| add BUF, CNT |
| mov CNT, BUFFER_END |
| |
| lea K_XMM_AR(%rip), K_BASE |
| xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP |
| |
| SHA1_PIPELINED_MAIN_BODY |
| |
| # cleanup workspace |
| mov $8, %ecx |
| mov %rsp, %rdi |
| xor %rax, %rax |
| rep stosq |
| |
| mov %rbp, %rsp # deallocate workspace |
| pop %rbp |
| pop %r12 |
| pop %rbx |
| ret |
| |
| ENDPROC(\name) |
| .endm |
| |
| /* |
 * This macro implements the 80 SHA-1 rounds per 64-byte block and loops
 * until the last block has been processed
| */ |
| .macro SHA1_PIPELINED_MAIN_BODY |
| INIT_REGALLOC |
| |
| mov (HASH_PTR), A |
| mov 4(HASH_PTR), B |
| mov 8(HASH_PTR), C |
| mov 12(HASH_PTR), D |
| mov 16(HASH_PTR), E |
| |
| .set i, 0 |
| .rept W_PRECALC_AHEAD |
| W_PRECALC i |
| .set i, (i+1) |
| .endr |
| |
| .align 4 |
| 1: |
| RR F1,A,B,C,D,E,0 |
| RR F1,D,E,A,B,C,2 |
| RR F1,B,C,D,E,A,4 |
| RR F1,E,A,B,C,D,6 |
| RR F1,C,D,E,A,B,8 |
| |
| RR F1,A,B,C,D,E,10 |
| RR F1,D,E,A,B,C,12 |
| RR F1,B,C,D,E,A,14 |
| RR F1,E,A,B,C,D,16 |
| RR F1,C,D,E,A,B,18 |
| |
| RR F2,A,B,C,D,E,20 |
| RR F2,D,E,A,B,C,22 |
| RR F2,B,C,D,E,A,24 |
| RR F2,E,A,B,C,D,26 |
| RR F2,C,D,E,A,B,28 |
| |
| RR F2,A,B,C,D,E,30 |
| RR F2,D,E,A,B,C,32 |
| RR F2,B,C,D,E,A,34 |
| RR F2,E,A,B,C,D,36 |
| RR F2,C,D,E,A,B,38 |
| |
| RR F3,A,B,C,D,E,40 |
| RR F3,D,E,A,B,C,42 |
| RR F3,B,C,D,E,A,44 |
| RR F3,E,A,B,C,D,46 |
| RR F3,C,D,E,A,B,48 |
| |
| RR F3,A,B,C,D,E,50 |
| RR F3,D,E,A,B,C,52 |
| RR F3,B,C,D,E,A,54 |
| RR F3,E,A,B,C,D,56 |
| RR F3,C,D,E,A,B,58 |
| |
| add $64, BUFFER_PTR # move to the next 64-byte block |
	cmp	BUFFER_END, BUFFER_PTR	# if the current block is the last one, use
	cmovae	K_BASE, BUFFER_PTR	# a dummy source to avoid a buffer overrun
| |
| RR F4,A,B,C,D,E,60 |
| RR F4,D,E,A,B,C,62 |
| RR F4,B,C,D,E,A,64 |
| RR F4,E,A,B,C,D,66 |
| RR F4,C,D,E,A,B,68 |
| |
| RR F4,A,B,C,D,E,70 |
| RR F4,D,E,A,B,C,72 |
| RR F4,B,C,D,E,A,74 |
| RR F4,E,A,B,C,D,76 |
| RR F4,C,D,E,A,B,78 |
| |
| UPDATE_HASH (HASH_PTR), A |
| UPDATE_HASH 4(HASH_PTR), B |
| UPDATE_HASH 8(HASH_PTR), C |
| UPDATE_HASH 12(HASH_PTR), D |
| UPDATE_HASH 16(HASH_PTR), E |
| |
| RESTORE_RENAMED_REGS |
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
| jne 1b |
| .endm |
| |
| .macro INIT_REGALLOC |
| .set A, REG_A |
| .set B, REG_B |
| .set C, REG_C |
| .set D, REG_D |
| .set E, REG_E |
| .set T1, REG_T1 |
| .set T2, REG_T2 |
| .endm |
| |
| .macro RESTORE_RENAMED_REGS |
	# order of the moves matters: the physical registers overlap after the
	# renaming, and REG_C already holds C, so it needs no move
| mov B, REG_B |
| mov D, REG_D |
| mov A, REG_A |
| mov E, REG_E |
| .endm |
| |
| .macro SWAP_REG_NAMES a, b |
| .set _T, \a |
| .set \a, \b |
| .set \b, _T |
| .endm |
| |
| .macro F1 b, c, d |
| mov \c, T1 |
| SWAP_REG_NAMES \c, T1 |
| xor \d, T1 |
| and \b, T1 |
| xor \d, T1 |
| .endm |
| |
| .macro F2 b, c, d |
| mov \d, T1 |
| SWAP_REG_NAMES \d, T1 |
| xor \c, T1 |
| xor \b, T1 |
| .endm |
| |
| .macro F3 b, c ,d |
| mov \c, T1 |
| SWAP_REG_NAMES \c, T1 |
| mov \b, T2 |
| or \b, T1 |
| and \c, T2 |
| and \d, T1 |
| or T2, T1 |
| .endm |
| |
| .macro F4 b, c, d |
| F2 \b, \c, \d |
| .endm |
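
/*
 * In C terms the round functions computed above are (illustrative):
 *
 *	F1(b, c, d) = d ^ (b & (c ^ d))	      // == (b & c) | (~b & d), "choose"
 *	F2(b, c, d) = b ^ c ^ d		      // "parity"
 *	F3(b, c, d) = ((b | c) & d) | (b & c) // == majority(b, c, d)
 *	F4(b, c, d) = F2(b, c, d)
 *
 * Each macro leaves its result in the register currently named T1; the
 * mov plus SWAP_REG_NAMES saves the input's value into the scratch
 * register and lets the original register be overwritten with the result,
 * so nothing has to be moved back.
 */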
| |
.macro UPDATE_HASH hash, val
	add	\hash, \val	# val += *hash
	mov	\val, \hash	# *hash = val; the register keeps the updated
				# value as the chaining input for the next block
.endm
| |
| /* |
| * RR does two rounds of SHA-1 back to back with W[] pre-calc |
| * t1 = F(b, c, d); e += w(i) |
| * e += t1; b <<= 30; d += w(i+1); |
| * t1 = F(a, b, c); |
| * d += t1; a <<= 5; |
| * e += a; |
| * t1 = e; a >>= 7; |
| * t1 <<= 5; |
| * d += t1; |
| */ |
| .macro RR F, a, b, c, d, e, round |
| add WK(\round), \e |
| \F \b, \c, \d # t1 = F(b, c, d); |
| W_PRECALC (\round + W_PRECALC_AHEAD) |
| rol $30, \b |
| add T1, \e |
| add WK(\round + 1), \d |
| |
| \F \a, \b, \c |
| W_PRECALC (\round + W_PRECALC_AHEAD + 1) |
| rol $5, \a |
| add \a, \e |
| add T1, \d |
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30
| |
| mov \e, T1 |
| SWAP_REG_NAMES \e, T1 |
| |
| rol $5, T1 |
| add T1, \d |
| |
| # write: \a, \b |
| # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c |
| .endm |
| |
| .macro W_PRECALC r |
| .set i, \r |
| |
| .if (i < 20) |
| .set K_XMM, 0 |
| .elseif (i < 40) |
| .set K_XMM, 16 |
| .elseif (i < 60) |
| .set K_XMM, 32 |
| .elseif (i < 80) |
| .set K_XMM, 48 |
| .endif |
| |
| .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) |
| .set i, ((\r) % 80) # pre-compute for the next iteration |
| .if (i == 0) |
| W_PRECALC_RESET |
| .endif |
| W_PRECALC_00_15 |
| .elseif (i<32) |
| W_PRECALC_16_31 |
| .elseif (i < 80) // rounds 32-79 |
| W_PRECALC_32_79 |
| .endif |
| .endm |
| |
| .macro W_PRECALC_RESET |
| .set W, W0 |
| .set W_minus_04, W4 |
| .set W_minus_08, W8 |
| .set W_minus_12, W12 |
| .set W_minus_16, W16 |
| .set W_minus_20, W20 |
| .set W_minus_24, W24 |
| .set W_minus_28, W28 |
| .set W_minus_32, W |
| .endm |
| |
| .macro W_PRECALC_ROTATE |
| .set W_minus_32, W_minus_28 |
| .set W_minus_28, W_minus_24 |
| .set W_minus_24, W_minus_20 |
| .set W_minus_20, W_minus_16 |
| .set W_minus_16, W_minus_12 |
| .set W_minus_12, W_minus_08 |
| .set W_minus_08, W_minus_04 |
| .set W_minus_04, W |
| .set W, W_minus_32 |
| .endm |
| |
| .macro W_PRECALC_SSSE3 |
| |
| .macro W_PRECALC_00_15 |
| W_PRECALC_00_15_SSSE3 |
| .endm |
| .macro W_PRECALC_16_31 |
| W_PRECALC_16_31_SSSE3 |
| .endm |
| .macro W_PRECALC_32_79 |
| W_PRECALC_32_79_SSSE3 |
| .endm |
| |
| /* message scheduling pre-compute for rounds 0-15 */ |
| .macro W_PRECALC_00_15_SSSE3 |
| .if ((i & 3) == 0) |
| movdqu (i*4)(BUFFER_PTR), W_TMP1 |
| .elseif ((i & 3) == 1) |
| pshufb XMM_SHUFB_BSWAP, W_TMP1 |
| movdqa W_TMP1, W |
| .elseif ((i & 3) == 2) |
| paddd (K_BASE), W_TMP1 |
| .elseif ((i & 3) == 3) |
| movdqa W_TMP1, WK(i&~3) |
| W_PRECALC_ROTATE |
| .endif |
| .endm |
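
/*
 * Spread over four scalar rounds, the macro above amounts to
 * (illustrative C; block[] are the sixteen 32-bit words of the current
 * input block, and i is a multiple of 4 here):
 *
 *	for (j = 0; j < 4; j++)
 *		wk[i + j] = be32_to_cpu(block[i + j]) + K1;
 */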
| |
/* message scheduling pre-compute for rounds 16-31
 *
 * - the 32 most recent w[i] values are kept in 8 XMM registers
 * - the w[i]+K values are pre-calculated and stored to memory, to be loaded
 *   later by the scalar ALU add instructions
 *
 * rounds 16-31 need some "heavy lifting" in the vector code because of the
 * w[i] -> w[i-3] dependency, but this pays off in rounds 32-79
 */
| .macro W_PRECALC_16_31_SSSE3 |
| # blended scheduling of vector and scalar instruction streams, one 4-wide |
| # vector iteration / 4 scalar rounds |
| .if ((i & 3) == 0) |
| movdqa W_minus_12, W |
| palignr $8, W_minus_16, W # w[i-14] |
| movdqa W_minus_04, W_TMP1 |
| psrldq $4, W_TMP1 # w[i-3] |
| pxor W_minus_08, W |
| .elseif ((i & 3) == 1) |
| pxor W_minus_16, W_TMP1 |
| pxor W_TMP1, W |
| movdqa W, W_TMP2 |
| movdqa W, W_TMP1 |
| pslldq $12, W_TMP2 |
| .elseif ((i & 3) == 2) |
| psrld $31, W |
| pslld $1, W_TMP1 |
| por W, W_TMP1 |
| movdqa W_TMP2, W |
| psrld $30, W_TMP2 |
| pslld $2, W |
| .elseif ((i & 3) == 3) |
| pxor W, W_TMP1 |
| pxor W_TMP2, W_TMP1 |
| movdqa W_TMP1, W |
| paddd K_XMM(K_BASE), W_TMP1 |
| movdqa W_TMP1, WK(i&~3) |
| W_PRECALC_ROTATE |
| .endif |
| .endm |
| |
/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent form: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i] -> w[i-3]
 * dependency is broken
 */
| .macro W_PRECALC_32_79_SSSE3 |
| .if ((i & 3) == 0) |
| movdqa W_minus_04, W_TMP1 |
| pxor W_minus_28, W # W is W_minus_32 before xor |
| palignr $8, W_minus_08, W_TMP1 |
| .elseif ((i & 3) == 1) |
| pxor W_minus_16, W |
| pxor W_TMP1, W |
| movdqa W, W_TMP1 |
| .elseif ((i & 3) == 2) |
| psrld $30, W |
| pslld $2, W_TMP1 |
| por W, W_TMP1 |
| .elseif ((i & 3) == 3) |
| movdqa W_TMP1, W |
| paddd K_XMM(K_BASE), W_TMP1 |
| movdqa W_TMP1, WK(i&~3) |
| W_PRECALC_ROTATE |
| .endif |
| .endm |
| |
| .endm // W_PRECALC_SSSE3 |
| |
| |
| #define K1 0x5a827999 |
| #define K2 0x6ed9eba1 |
| #define K3 0x8f1bbcdc |
| #define K4 0xca62c1d6 |
| |
| .section .rodata |
| .align 16 |
| |
| K_XMM_AR: |
| .long K1, K1, K1, K1 |
| .long K2, K2, K2, K2 |
| .long K3, K3, K3, K3 |
| .long K4, K4, K4, K4 |
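
/* each row of the table above replicates one round constant into all four
 * lanes, so a single paddd adds K to four w[] values at once; the K_XMM
 * offset (0, 16, 32 or 48) selects the row for the current 20-round stage */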
| |
| BSWAP_SHUFB_CTL: |
| .long 0x00010203 |
| .long 0x04050607 |
| .long 0x08090a0b |
| .long 0x0c0d0e0f |
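
/* the constant above is the pshufb mask that reverses the byte order within
 * each 32-bit lane, converting the big-endian message words to the CPU's
 * native order on load */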
| |
| |
| .section .text |
| |
| W_PRECALC_SSSE3 |
| .macro xmm_mov a, b |
| movdqu \a,\b |
| .endm |
| |
| /* |
| * SSSE3 optimized implementation: |
| * |
| * extern "C" void sha1_transform_ssse3(struct sha1_state *state, |
| * const u8 *data, int blocks); |
| * |
| * Note that struct sha1_state is assumed to begin with u32 state[5]. |
| */ |
| SHA1_VECTOR_ASM sha1_transform_ssse3 |
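
/*
 * A minimal usage sketch (illustrative only, not the actual glue code;
 * the FPU/SSE context must be owned while this runs):
 *
 *	struct sha1_state st = { .state = { SHA1_H0, SHA1_H1, SHA1_H2,
 *					    SHA1_H3, SHA1_H4 } };
 *
 *	kernel_fpu_begin();
 *	sha1_transform_ssse3(&st, data, nblocks);  // nblocks * 64 bytes at data
 *	kernel_fpu_end();
 */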
| |
| #ifdef CONFIG_AS_AVX |
| |
| .macro W_PRECALC_AVX |
| |
| .purgem W_PRECALC_00_15 |
| .macro W_PRECALC_00_15 |
| W_PRECALC_00_15_AVX |
| .endm |
| .purgem W_PRECALC_16_31 |
| .macro W_PRECALC_16_31 |
| W_PRECALC_16_31_AVX |
| .endm |
| .purgem W_PRECALC_32_79 |
| .macro W_PRECALC_32_79 |
| W_PRECALC_32_79_AVX |
| .endm |
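
/*
 * The AVX variants below compute exactly the same schedule as the SSSE3
 * ones above; the non-destructive three-operand VEX forms simply remove
 * most of the register-copy movdqa instructions.
 */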
| |
| .macro W_PRECALC_00_15_AVX |
| .if ((i & 3) == 0) |
| vmovdqu (i*4)(BUFFER_PTR), W_TMP1 |
| .elseif ((i & 3) == 1) |
| vpshufb XMM_SHUFB_BSWAP, W_TMP1, W |
| .elseif ((i & 3) == 2) |
| vpaddd (K_BASE), W, W_TMP1 |
| .elseif ((i & 3) == 3) |
| vmovdqa W_TMP1, WK(i&~3) |
| W_PRECALC_ROTATE |
| .endif |
| .endm |
| |
| .macro W_PRECALC_16_31_AVX |
| .if ((i & 3) == 0) |
| vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] |
| vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] |
| vpxor W_minus_08, W, W |
| vpxor W_minus_16, W_TMP1, W_TMP1 |
| .elseif ((i & 3) == 1) |
| vpxor W_TMP1, W, W |
| vpslldq $12, W, W_TMP2 |
| vpslld $1, W, W_TMP1 |
| .elseif ((i & 3) == 2) |
| vpsrld $31, W, W |
| vpor W, W_TMP1, W_TMP1 |
| vpslld $2, W_TMP2, W |
| vpsrld $30, W_TMP2, W_TMP2 |
| .elseif ((i & 3) == 3) |
| vpxor W, W_TMP1, W_TMP1 |
| vpxor W_TMP2, W_TMP1, W |
| vpaddd K_XMM(K_BASE), W, W_TMP1 |
| vmovdqu W_TMP1, WK(i&~3) |
| W_PRECALC_ROTATE |
| .endif |
| .endm |
| |
| .macro W_PRECALC_32_79_AVX |
| .if ((i & 3) == 0) |
| vpalignr $8, W_minus_08, W_minus_04, W_TMP1 |
| vpxor W_minus_28, W, W # W is W_minus_32 before xor |
| .elseif ((i & 3) == 1) |
| vpxor W_minus_16, W_TMP1, W_TMP1 |
| vpxor W_TMP1, W, W |
| .elseif ((i & 3) == 2) |
| vpslld $2, W, W_TMP1 |
| vpsrld $30, W, W |
| vpor W, W_TMP1, W |
| .elseif ((i & 3) == 3) |
| vpaddd K_XMM(K_BASE), W, W_TMP1 |
| vmovdqu W_TMP1, WK(i&~3) |
| W_PRECALC_ROTATE |
| .endif |
| .endm |
| |
| .endm // W_PRECALC_AVX |
| |
| W_PRECALC_AVX |
| .purgem xmm_mov |
| .macro xmm_mov a, b |
| vmovdqu \a,\b |
| .endm |
| |
| |
| /* AVX optimized implementation: |
| * extern "C" void sha1_transform_avx(struct sha1_state *state, |
| * const u8 *data, int blocks); |
| */ |
| SHA1_VECTOR_ASM sha1_transform_avx |
| |
| #endif |