| #!/usr/bin/env perl |
| # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
| # |
| # This code is taken from the OpenSSL project but the author, Andy Polyakov, |
| # has relicensed it under the licenses specified in the SPDX header above. |
| # The original headers, including the original license headers, are |
| # included below for completeness. |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # December 2014 |
| # |
| # ChaCha20 for ARMv4. |
| # |
| # September 2018 |
| # |
| # Improve scalar performance per Eric Biggers' suggestion to eliminate |
| # separate rotates. This requires b[0..3] and d[0..3] to be maintained |
| # pre-rotated, hence odd twists prior inner loop and when accumulating |
| # key material. Since amount of instructions is reduced as result, even |
| # NEON performance is improved somewhat, most notably by ~9% on low-end |
| # Cortex-A5/A7. Full unroll was shown to provide even better scalar |
| # performance on Cortex-A5/A7, naturally at the cost of manyfold size |
| # increase. We let it be. Oversized code works in benchmarks, but is not |
| # necessarily optimal in real life, when it's likely to be out-of-cache |
| # upon entry and evict significant part of cache upon completion. |
| # |
| # Performance in cycles per byte out of large buffer. |
| # |
| # IALU/gcc-4.4 1xNEON 3xNEON+1xIALU |
| # |
| # Cortex-A5 14.2(*)/+160% 21.8 12.9(**) |
| # Cortex-A8 10.2(*)/+190% 13.9 6.10 |
| # Cortex-A9 10.8(*)/+150% 14.3 6.50 |
| # Cortex-A15 11.0/+40% 16.0 4.90 |
| # Snapdragon S4 13.9(***)/+90% 13.6 4.90 |
| # |
| # (*) most "favourable" result for aligned data on little-endian |
| # processor, result for misaligned data is 10-15% lower; |
| # (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8% |
| # better performance on Cortex-A5/A7, but not on others; |
| # (***) it's 17% slower than original, trade-off is considered |
| # acceptable, because of improvement on others, specifically |
| # +36% on Cortex-A5/A7 and +20% on Cortex-A9; |
| |
# Command line: [flavour] [output-file]
#
# If the first argument looks like a file name (it has an extension) it is
# the output file and there is no flavour.  Otherwise it is the perlasm
# flavour, and the remaining arguments are scanned for the first one that
# looks like a file name.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for the translator next to this script first, then in the
    # shared perlasm directory.  An invocation without a directory part
    # yields an empty prefix instead of relying on a stale $1.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed from here on is piped through the translator.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation requested: write straight to the output file.
    open STDOUT,">",$output
        or die "can't open $output: $!";
}
# With neither a flavour nor an output file, STDOUT is left untouched so the
# generated code goes to the inherited standard output.
| |
# Thunk for [simplified] x86-style perlasm: any call to an undefined sub,
# e.g. &add("r0","r0","r1","ror#13"), appends one assembly line to $code.
# The first underscore of the sub name becomes a dot (vadd_i32 -> vadd.i32)
# and a purely numeric last argument is turned into an immediate ("#N").
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;     # drop the package qualifier
    $mnemonic =~ s/_/\./;

    my $last = pop;
    if ($last*1 eq $last) { $last = "#$last"; } # number -> immediate

    $code .= "\t" . $mnemonic . "\t" . join(',', @_, $last) . "\n";
}
| |
# Scalar register allocation: @x maps ChaCha state word index -> register
# name, @t names the four temporaries.  Indices mapped to the placeholder
# "rx" have no permanently assigned register.
my @x = map { "r$_" } (0..7, ("x") x 4, 12, "x", 14, "x");
my @t = map { "r$_" } (8..11);
| |
# Build the instruction list for one scalar ChaCha round.
#
# Arguments are the state indices ($a0,$b0,$c0,$d0) of the first quarter
# round; the other three are derived by stepping each index to the next
# position inside its group of four.  Returns a list of strings of the form
# "&opcode (operands)", meant to be eval'ed one by one so that they can be
# interleaved with other instruction streams.
sub ROUND {
    # Index tables: $a[$n], $b[$n], $c[$n], $d[$n] for quarter round $n.
    my (@a,@b,@c,@d);
    my @cur = @_;
    for my $n (0..3) {
        ($a[$n],$b[$n],$c[$n],$d[$n]) = @cur;
        @cur = map { ($_&~3)+(($_+1)&3) } @cur;
    }
    my $odd = $d[0]&1;

    # 'a' and 'b' words have permanent registers.
    my @ra = @x[@a];
    my @rb = @x[@b];

    # Consider order in which variables are addressed by their
    # index:
    #
    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14
    #
    # 'a', 'b' are permanently allocated in registers, @x[0..7],
    # while 'c's and pair of 'd's are maintained in memory. A pair of
    # 'c's is invariant between rounds, so the 'c's are reloaded once
    # per round, in the middle: hence the 'c' stores and loads between
    # the two halves and none at the beginning or end. In the 'd'
    # column 15 and 13 are reused in the next pair of rounds, which is
    # why these two are the ones off-loaded to memory.
    my ($xc,$xc_) = @t[0,1];
    my ($xd,$xd_) = $odd ? ($t[2],$x[$d[1]]) : ($x[$d[0]],$t[2]);

    # First two quarter rounds.
    my @ret = (
        "&add ($ra[0],$ra[0],$rb[0],'ror#13')",
        "&add ($ra[1],$ra[1],$rb[1],'ror#13')",
        "&eor ($xd,$ra[0],$xd,'ror#24')",
        "&eor ($xd_,$ra[1],$xd_,'ror#24')",

        "&add ($xc,$xc,$xd,'ror#16')",
        "&add ($xc_,$xc_,$xd_,'ror#16')",
        "&eor ($rb[0],$xc, $rb[0],'ror#13')",
        "&eor ($rb[1],$xc_,$rb[1],'ror#13')",

        "&add ($ra[0],$ra[0],$rb[0],'ror#20')",
        "&add ($ra[1],$ra[1],$rb[1],'ror#20')",
        "&eor ($xd,$ra[0],$xd,'ror#16')",
        "&eor ($xd_,$ra[1],$xd_,'ror#16')",
    );

    # Swap the memory-resident 'd' through the temporary; which of the
    # pair that is depends on the round's parity.
    if ($odd) {
        push @ret,
            "&str ($xd,'[sp,#4*(16+$d[0])]')",
            "&add ($xc,$xc,$xd,'ror#24')",
            "&ldr ($xd,'[sp,#4*(16+$d[2])]')",
            "&add ($xc_,$xc_,$xd_,'ror#24')";
    } else {
        push @ret,
            "&add ($xc,$xc,$xd,'ror#24')",
            "&str ($xd_,'[sp,#4*(16+$d[1])]')",
            "&add ($xc_,$xc_,$xd_,'ror#24')",
            "&ldr ($xd_,'[sp,#4*(16+$d[3])]')";
    }

    # Park the first pair of 'c's.
    push @ret,
        "&str ($xc,'[sp,#4*(16+$c[0])]')",
        "&eor ($rb[0],$rb[0],$xc,'ror#12')",
        "&str ($xc_,'[sp,#4*(16+$c[1])]')",
        "&eor ($rb[1],$rb[1],$xc_,'ror#12')";

    # The register-resident 'd' of the second half.
    if ($odd) { $xd_ = $x[$d[3]]; }
    else      { $xd  = $x[$d[2]]; }

    # Last two quarter rounds, starting with the reload of the 'c's.
    push @ret,
        "&ldr ($xc,'[sp,#4*(16+$c[2])]')",
        "&add ($ra[2],$ra[2],$rb[2],'ror#13')",
        "&ldr ($xc_,'[sp,#4*(16+$c[3])]')",
        "&add ($ra[3],$ra[3],$rb[3],'ror#13')",
        "&eor ($xd,$ra[2],$xd,'ror#24')",
        "&eor ($xd_,$ra[3],$xd_,'ror#24')",

        "&add ($xc,$xc,$xd,'ror#16')",
        "&add ($xc_,$xc_,$xd_,'ror#16')",
        "&eor ($rb[2],$xc, $rb[2],'ror#13')",
        "&eor ($rb[3],$xc_,$rb[3],'ror#13')",

        "&add ($ra[2],$ra[2],$rb[2],'ror#20')",
        "&add ($ra[3],$ra[3],$rb[3],'ror#20')",
        "&eor ($xd,$ra[2],$xd,'ror#16')",
        "&eor ($xd_,$ra[3],$xd_,'ror#16')",

        "&add ($xc,$xc,$xd,'ror#24')",
        "&add ($xc_,$xc_,$xd_,'ror#24')",
        "&eor ($rb[2],$rb[2],$xc,'ror#12')",
        "&eor ($rb[3],$rb[3],$xc_,'ror#12')";

    return @ret;
}
| |
# Emit the preamble, the constant tables and the integer-only entry point
# ChaCha20_ctr32, up to and including the first instruction of the inner
# .Loop.  The here-doc interpolates @x[...] and @t[...] into register names.
#
# Stack frame set up by the prologue below (offsets from sp in 32-bit words):
#    0..15   copy of sigma | key | counter and nonce
#   16..31   off-load area for state words that are not held in registers
#   32..34   r0-r2 as pushed on entry (called out, inp and len below)
| $code.=<<___; |
| #ifndef __KERNEL__ |
| # include "arm_arch.h" |
| #else |
| # define __ARM_ARCH__ __LINUX_ARM_ARCH__ |
| # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ |
| # define ChaCha20_ctr32 chacha20_arm_cryptogams |
| # define ChaCha20_neon chacha20_neon |
| #endif |
| |
| .text |
| #if defined(__thumb2__) || defined(__clang__) |
| .syntax unified |
| # define ldrhsb ldrbhs |
| #endif |
| #if defined(__thumb2__) |
| .thumb |
| #else |
| .code 32 |
| #endif |
| |
| .align 5 |
| .Lsigma: |
| .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral |
| .Lone: |
| .long 1,0,0,0 |
| .Lrot8: |
| .long 0x02010003,0x06050407 |
| #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) |
| .LOPENSSL_armcap: |
| .word OPENSSL_armcap_P-.LChaCha20_ctr32 |
| #else |
| .word -1 |
| #endif |
| |
| .globl ChaCha20_ctr32 |
| .type ChaCha20_ctr32,%function |
| .align 5 |
| ChaCha20_ctr32: |
| .LChaCha20_ctr32: |
| ldr r12,[sp,#0] @ pull pointer to counter and nonce |
| stmdb sp!,{r0-r2,r4-r11,lr} |
| #if __ARM_ARCH__<7 && !defined(__thumb2__) |
| sub r14,pc,#16 @ ChaCha20_ctr32 |
| #else |
| adr r14,.LChaCha20_ctr32 |
| #endif |
| cmp r2,#0 @ len==0? |
| #ifdef __thumb2__ |
| itt eq |
| #endif |
| addeq sp,sp,#4*3 |
| beq .Lno_data |
| #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) |
| cmp r2,#192 @ test len |
| bls .Lshort |
| ldr r4,[r14,#-24] |
| ldr r4,[r14,r4] |
| # ifdef __APPLE__ |
| ldr r4,[r4] |
| # endif |
| tst r4,#ARMV7_NEON |
| bne .LChaCha20_neon |
| .Lshort: |
| #endif |
| ldmia r12,{r4-r7} @ load counter and nonce |
| sub sp,sp,#4*(16) @ off-load area |
| sub r14,r14,#64 @ .Lsigma |
| stmdb sp!,{r4-r7} @ copy counter and nonce |
| ldmia r3,{r4-r11} @ load key |
| ldmia r14,{r0-r3} @ load sigma |
| stmdb sp!,{r4-r11} @ copy key |
| stmdb sp!,{r0-r3} @ copy sigma |
| str r10,[sp,#4*(16+10)] @ off-load "@x[10]" |
| str r11,[sp,#4*(16+11)] @ off-load "@x[11]" |
| b .Loop_outer_enter |
| |
| .align 4 |
| .Loop_outer: |
| ldmia sp,{r0-r9} @ load key material |
| str @t[3],[sp,#4*(32+2)] @ save len |
| str r12, [sp,#4*(32+1)] @ save inp |
| str r14, [sp,#4*(32+0)] @ save out |
| .Loop_outer_enter: |
| ldr @t[3], [sp,#4*(15)] |
| mov @x[4],@x[4],ror#19 @ twist b[0..3] |
| ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load |
| mov @x[5],@x[5],ror#19 |
| ldr @t[2], [sp,#4*(13)] |
| mov @x[6],@x[6],ror#19 |
| ldr @x[14],[sp,#4*(14)] |
| mov @x[7],@x[7],ror#19 |
| mov @t[3],@t[3],ror#8 @ twist d[0..3] |
| mov @x[12],@x[12],ror#8 |
| mov @t[2],@t[2],ror#8 |
| mov @x[14],@x[14],ror#8 |
| str @t[3], [sp,#4*(16+15)] |
| mov @t[3],#10 |
| b .Loop |
| |
| .align 4 |
| .Loop: |
| subs @t[3],@t[3],#1 |
| ___ |
# Body of .Loop: one even round followed by one odd round.  Each string
# returned by ROUND is eval'ed, which appends one instruction to $code.
eval($_) for (&ROUND(0, 4, 8,12));
eval($_) for (&ROUND(0, 5,10,15));
# Close the inner loop and emit the output stage for one 64-byte block of the
# integer-only code: flush the words still in temporaries, add the key
# material back in, XOR with the input and store.  This here-doc holds the
# word-at-a-time path.  It ends by opening the __ARM_ARCH__<7 section, whose
# byte-wise body (.Lunaligned) is produced by the Perl loop that follows.
| $code.=<<___; |
| bne .Loop |
| |
| ldr @t[3],[sp,#4*(32+2)] @ load len |
| |
| str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store |
| str @t[1], [sp,#4*(16+9)] |
| str @x[12],[sp,#4*(16+12)] |
| str @t[2], [sp,#4*(16+13)] |
| str @x[14],[sp,#4*(16+14)] |
| |
| @ at this point we have first half of 512-bit result in |
| @ @x[0-7] and second half at sp+4*(16+8) |
| |
| cmp @t[3],#64 @ done yet? |
| #ifdef __thumb2__ |
| itete lo |
| #endif |
| addlo r12,sp,#4*(0) @ shortcut or ... |
| ldrhs r12,[sp,#4*(32+1)] @ ... load inp |
| addlo r14,sp,#4*(0) @ shortcut or ... |
| ldrhs r14,[sp,#4*(32+0)] @ ... load out |
| |
| ldr @t[0],[sp,#4*(0)] @ load key material |
| ldr @t[1],[sp,#4*(1)] |
| |
| #if __ARM_ARCH__>=6 || !defined(__ARMEB__) |
| # if __ARM_ARCH__<7 |
| orr @t[2],r12,r14 |
| tst @t[2],#3 @ are input and output aligned? |
| ldr @t[2],[sp,#4*(2)] |
| bne .Lunaligned |
| cmp @t[3],#64 @ restore flags |
| # else |
| ldr @t[2],[sp,#4*(2)] |
| # endif |
| ldr @t[3],[sp,#4*(3)] |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @x[1],@x[1],@t[1] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[0],@x[0],@t[0] @ xor with input |
| eorhs @x[1],@x[1],@t[1] |
| add @t[0],sp,#4*(4) |
| str @x[0],[r14],#16 @ store output |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[2],@x[2],@t[2] |
| eorhs @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[1],[r14,#-12] |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@t[0],@x[4],ror#13 @ accumulate key material |
| add @x[5],@t[1],@x[5],ror#13 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| add @x[6],@t[2],@x[6],ror#13 |
| add @x[7],@t[3],@x[7],ror#13 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[4],@x[4],@t[0] |
| eorhs @x[5],@x[5],@t[1] |
| add @t[0],sp,#4*(8) |
| str @x[4],[r14],#16 @ store output |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[6],@x[6],@t[2] |
| eorhs @x[7],@x[7],@t[3] |
| str @x[5],[r14,#-12] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[6],[r14,#-8] |
| add @x[0],sp,#4*(16+8) |
| str @x[7],[r14,#-4] |
| |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @x[1],@x[1],@t[1] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it |
| strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[0],@x[0],@t[0] |
| eorhs @x[1],@x[1],@t[1] |
| add @t[0],sp,#4*(12) |
| str @x[0],[r14],#16 @ store output |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[2],@x[2],@t[2] |
| eorhs @x[3],@x[3],@t[3] |
| str @x[1],[r14,#-12] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@t[0],@x[4],ror#24 @ accumulate key material |
| add @x[5],@t[1],@x[5],ror#24 |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| addhi @t[0],@t[0],#1 @ next counter value |
| strhi @t[0],[sp,#4*(12)] @ save next counter value |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[0],[r12],#16 @ load input |
| ldrhs @t[1],[r12,#-12] |
| add @x[6],@t[2],@x[6],ror#24 |
| add @x[7],@t[3],@x[7],ror#24 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhs @t[2],[r12,#-8] |
| ldrhs @t[3],[r12,#-4] |
| # if __ARM_ARCH__>=6 && defined(__ARMEB__) |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[4],@x[4],@t[0] |
| eorhs @x[5],@x[5],@t[1] |
| # ifdef __thumb2__ |
| it ne |
| # endif |
| ldrne @t[0],[sp,#4*(32+2)] @ re-load len |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| eorhs @x[6],@x[6],@t[2] |
| eorhs @x[7],@x[7],@t[3] |
| str @x[4],[r14],#16 @ store output |
| str @x[5],[r14,#-12] |
| # ifdef __thumb2__ |
| it hs |
| # endif |
| subhs @t[3],@t[0],#64 @ len-=64 |
| str @x[6],[r14,#-8] |
| str @x[7],[r14,#-4] |
| bhi .Loop_outer |
| |
| beq .Ldone |
| # if __ARM_ARCH__<7 |
| b .Ltail |
| |
| .align 4 |
| .Lunaligned: @ unaligned endian-neutral path |
| cmp @t[3],#64 @ restore flags |
| # endif |
| #endif |
| #if __ARM_ARCH__<7 |
| ldr @t[3],[sp,#4*(3)] |
| ___ |
# Generate the byte-at-a-time output path (assembled only when
# __ARM_ARCH__<7) in four passes, each handling four state words: accumulate
# key material, XOR with input bytes (or zero when fewer than 64 bytes are
# left) and store the result byte by byte.
| for ($i=0;$i<16;$i+=4) { |
# $j selects @x[0..3] on passes 0 and 2, @x[4..7] on passes 1 and 3.
| my $j=$i&0x7; |
# b[0..3] (pass 1) and d[0..3] (pass 3) are held pre-rotated by 19 and 8
# bits; the extra rotation applied here while adding key material completes
# the full 32-bit turn.
| my $twist=""; |
| if ($i==4) { $twist = ",ror#13"; } |
| elsif ($i==12) { $twist = ",ror#24"; } |
| |
# Pass 1 only: @x[0] is no longer needed, so point it at the second half of
# the result ahead of the load done at the start of pass 2.
| $code.=<<___ if ($i==4); |
| add @x[0],sp,#4*(16+8) |
| ___ |
# Pass 2 only: bring the second half of the result into @x[0..7].
| $code.=<<___ if ($i==8); |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" |
| strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" |
| ___ |
| $code.=<<___; |
| add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material |
| ___ |
# Pass 3 only: @t[0] holds the block counter at this point; bump and save it
# when more data follows.
| $code.=<<___ if ($i==12); |
| # ifdef __thumb2__ |
| itt hi |
| # endif |
| addhi @t[0],@t[0],#1 @ next counter value |
| strhi @t[0],[sp,#4*(12)] @ save next counter value |
| ___ |
| $code.=<<___; |
| add @x[$j+1],@t[1],@x[$j+1]$twist |
| add @x[$j+2],@t[2],@x[$j+2]$twist |
| # ifdef __thumb2__ |
| itete lo |
| # endif |
| eorlo @t[0],@t[0],@t[0] @ zero or ... |
| ldrhsb @t[0],[r12],#16 @ ... load input |
| eorlo @t[1],@t[1],@t[1] |
| ldrhsb @t[1],[r12,#-12] |
| |
| add @x[$j+3],@t[3],@x[$j+3]$twist |
| # ifdef __thumb2__ |
| itete lo |
| # endif |
| eorlo @t[2],@t[2],@t[2] |
| ldrhsb @t[2],[r12,#-8] |
| eorlo @t[3],@t[3],@t[3] |
| ldrhsb @t[3],[r12,#-4] |
| |
| eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) |
| eor @x[$j+1],@t[1],@x[$j+1] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[0],[r12,#-15] @ load more input |
| ldrhsb @t[1],[r12,#-11] |
| eor @x[$j+2],@t[2],@x[$j+2] |
| strb @x[$j+0],[r14],#16 @ store output |
| eor @x[$j+3],@t[3],@x[$j+3] |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[2],[r12,#-7] |
| ldrhsb @t[3],[r12,#-3] |
| strb @x[$j+1],[r14,#-12] |
| eor @x[$j+0],@t[0],@x[$j+0],lsr#8 |
| strb @x[$j+2],[r14,#-8] |
| eor @x[$j+1],@t[1],@x[$j+1],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[0],[r12,#-14] @ load more input |
| ldrhsb @t[1],[r12,#-10] |
| strb @x[$j+3],[r14,#-4] |
| eor @x[$j+2],@t[2],@x[$j+2],lsr#8 |
| strb @x[$j+0],[r14,#-15] |
| eor @x[$j+3],@t[3],@x[$j+3],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[2],[r12,#-6] |
| ldrhsb @t[3],[r12,#-2] |
| strb @x[$j+1],[r14,#-11] |
| eor @x[$j+0],@t[0],@x[$j+0],lsr#8 |
| strb @x[$j+2],[r14,#-7] |
| eor @x[$j+1],@t[1],@x[$j+1],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[0],[r12,#-13] @ load more input |
| ldrhsb @t[1],[r12,#-9] |
| strb @x[$j+3],[r14,#-3] |
| eor @x[$j+2],@t[2],@x[$j+2],lsr#8 |
| strb @x[$j+0],[r14,#-14] |
| eor @x[$j+3],@t[3],@x[$j+3],lsr#8 |
| # ifdef __thumb2__ |
| itt hs |
| # endif |
| ldrhsb @t[2],[r12,#-5] |
| ldrhsb @t[3],[r12,#-1] |
| strb @x[$j+1],[r14,#-10] |
| strb @x[$j+2],[r14,#-6] |
| eor @x[$j+0],@t[0],@x[$j+0],lsr#8 |
| strb @x[$j+3],[r14,#-2] |
| eor @x[$j+1],@t[1],@x[$j+1],lsr#8 |
| strb @x[$j+0],[r14,#-13] |
| eor @x[$j+2],@t[2],@x[$j+2],lsr#8 |
| strb @x[$j+1],[r14,#-9] |
| eor @x[$j+3],@t[3],@x[$j+3],lsr#8 |
| strb @x[$j+2],[r14,#-5] |
| strb @x[$j+3],[r14,#-1] |
| ___ |
# All passes but the last: fetch the key material for the next four words.
| $code.=<<___ if ($i<12); |
| add @t[0],sp,#4*(4+$i) |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| ___ |
| } |
# Finish the outer loop of the byte-wise path, then emit the shared tail and
# the epilogue of ChaCha20_ctr32.  .Loop_tail XORs the remaining input one
# byte at a time against a buffer read from the bottom of the stack frame.
| $code.=<<___; |
| # ifdef __thumb2__ |
| it ne |
| # endif |
| ldrne @t[0],[sp,#4*(32+2)] @ re-load len |
| # ifdef __thumb2__ |
| it hs |
| # endif |
| subhs @t[3],@t[0],#64 @ len-=64 |
| bhi .Loop_outer |
| |
| beq .Ldone |
| #endif |
| |
| .Ltail: |
| ldr r12,[sp,#4*(32+1)] @ load inp |
| add @t[1],sp,#4*(0) |
| ldr r14,[sp,#4*(32+0)] @ load out |
| |
| .Loop_tail: |
| ldrb @t[2],[@t[1]],#1 @ read buffer on stack |
| ldrb @t[3],[r12],#1 @ read input |
| subs @t[0],@t[0],#1 |
| eor @t[3],@t[3],@t[2] |
| strb @t[3],[r14],#1 @ store output |
| bne .Loop_tail |
| |
| .Ldone: |
| add sp,sp,#4*(32+3) |
| .Lno_data: |
| #if __ARM_ARCH__>=5 |
| ldmia sp!,{r4-r11,pc} |
| #else |
| ldmia sp!,{r4-r12,lr} |
| tst lr,#1 |
| moveq pc,lr @ be binary compatible with V4, yet |
| .long 0xe12fff1e @ interoperable with Thumb ISA:-) |
| #endif |
| .size ChaCha20_ctr32,.-ChaCha20_ctr32 |
| ___ |
| |
| {{{ |
# NEON register allocation: q0-q11 hold the a/b/c/d rows of three blocks
# (suffix 0, 1 and 2), q12-q15 are temporaries.
my ($a0,$b0,$c0,$d0,
    $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2,
    $t0,$t1,$t2,$t3) = map { "q$_" } (0..15);
| |
| # This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on |
| # Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%! |
# Append a byte permutation of q register $src into $dst to $code, as two
# vtbl.8 instructions (one per 64-bit half), both indexed by the low half of
# $tbl.  The "#lo"/"#hi" suffixes are resolved to d registers when $code is
# finally printed.
#
# The sub takes three arguments, so it is declared without the empty "()"
# prototype: with it, a plain vperm($dst,$src,$tbl) call is rejected at
# compile time and only the &vperm(...) form works.  Both forms work now.
sub vperm
{   my ($dst,$src,$tbl) = @_;
    $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n";
    $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n";
}
| |
# Build the instruction list for one NEON ChaCha round on a single block.
#
# Arguments: the a, b, c, d row registers, a temporary register and a
# trailing parity flag.  Returns a list of "&opcode (operands)" strings to
# be eval'ed one by one.  The parity flag is spliced into the text of the
# final vext_8 instructions and only evaluated when those are eval'ed.
sub NEONROUND {
    my @regs = @_;
    my $odd  = pop @regs;
    my ($va,$vb,$vc,$vd,$tmp) = @regs;

    # Rotate $src left by $bits into $dst: shift right by the complement,
    # then shift-left-and-insert.
    my $rotl = sub {
        my ($dst,$src,$bits) = @_;
        my $rbits = 32-$bits;
        return ("&vshr_u32 ($dst,$src,$rbits)",
                "&vsli_32 ($dst,$src,$bits)");
    };

    (
        "&vadd_i32 ($va,$va,$vb)",
        "&veor ($vd,$vd,$va)",
        "&vrev32_16 ($vd,$vd)",         # rotate d by 16

        "&vadd_i32 ($vc,$vc,$vd)",
        "&veor ($tmp,$vb,$vc)",
        $rotl->($vb,$tmp,12),

        "&vadd_i32 ($va,$va,$vb)",
        "&veor ($tmp,$vd,$va)",
        $rotl->($vd,$tmp,8),            # alternative: "&vperm ($d,$t,$t3)"

        "&vadd_i32 ($vc,$vc,$vd)",
        "&veor ($tmp,$vb,$vc)",
        $rotl->($vb,$tmp,7),

        # Realign the rows for the next round; direction depends on parity.
        "&vext_8 ($va,$va,$va,$odd?4:12)",
        "&vext_8 ($vd,$vd,$vd,8)",
        "&vext_8 ($vc,$vc,$vc,$odd?12:4)"
    );
}
| |
# Emit the NEON entry point ChaCha20_neon up to and including the first
# instruction of .Loop_neon.  Besides @x[...] and @t[...], the here-doc
# interpolates the q register names $a0..$d2 and $t0..$t3; "qN#lo"/"qN#hi"
# are turned into d registers when $code is printed at the end of the file.
| $code.=<<___; |
| #if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7) |
| .arch armv7-a |
| .fpu neon |
| |
| # ifdef __KERNEL__ |
| .globl ChaCha20_neon |
| @ For optimal performance it's appropriate for caller to enforce |
| @ minimum input length, 193 bytes is suggested. |
| # endif |
| .type ChaCha20_neon,%function |
| .align 5 |
| ChaCha20_neon: |
| ldr r12,[sp,#0] @ pull pointer to counter and nonce |
| stmdb sp!,{r0-r2,r4-r11,lr} |
| .LChaCha20_neon: |
| adr r14,.Lsigma |
| vstmdb sp!,{d8-d15} @ ABI spec says so |
| stmdb sp!,{r0-r3} |
| |
| vld1.32 {$b0-$c0},[r3] @ load key |
| ldmia r3,{r4-r11} @ load key |
| |
| sub sp,sp,#4*(16+16) |
| vld1.32 {$d0},[r12] @ load counter and nonce |
| add r12,sp,#4*8 |
| ldmia r14,{r0-r3} @ load sigma |
| vld1.32 {$a0},[r14]! @ load sigma |
| vld1.32 {$t0},[r14]! @ one |
| @ vld1.32 {$t3#lo},[r14] @ rot8 |
| vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce |
| vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key |
| |
| str r10,[sp,#4*(16+10)] @ off-load "@x[10]" |
| str r11,[sp,#4*(16+11)] @ off-load "@x[11]" |
| vshl.i32 $t1#lo,$t0#lo,#1 @ two |
| vstr $t0#lo,[sp,#4*(16+0)] |
| vshl.i32 $t2#lo,$t0#lo,#2 @ four |
| vstr $t1#lo,[sp,#4*(16+2)] |
| vmov $a1,$a0 |
| vstr $t2#lo,[sp,#4*(16+4)] |
| vmov $a2,$a0 |
| @ vstr $t3#lo,[sp,#4*(16+6)] |
| vmov $b1,$b0 |
| vmov $b2,$b0 |
| b .Loop_neon_enter |
| |
| .align 4 |
| .Loop_neon_outer: |
| ldmia sp,{r0-r9} @ load key material |
| cmp @t[3],#64*2 @ if len<=64*2 |
| bls .Lbreak_neon @ switch to integer-only |
| @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8 |
| vmov $a1,$a0 |
| str @t[3],[sp,#4*(32+2)] @ save len |
| vmov $a2,$a0 |
| str r12, [sp,#4*(32+1)] @ save inp |
| vmov $b1,$b0 |
| str r14, [sp,#4*(32+0)] @ save out |
| vmov $b2,$b0 |
| .Loop_neon_enter: |
| ldr @t[3], [sp,#4*(15)] |
| mov @x[4],@x[4],ror#19 @ twist b[0..3] |
| vadd.i32 $d1,$d0,$t0 @ counter+1 |
| ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load |
| mov @x[5],@x[5],ror#19 |
| vmov $c1,$c0 |
| ldr @t[2], [sp,#4*(13)] |
| mov @x[6],@x[6],ror#19 |
| vmov $c2,$c0 |
| ldr @x[14],[sp,#4*(14)] |
| mov @x[7],@x[7],ror#19 |
| vadd.i32 $d2,$d1,$t0 @ counter+2 |
| add @x[12],@x[12],#3 @ counter+3 |
| mov @t[3],@t[3],ror#8 @ twist d[0..3] |
| mov @x[12],@x[12],ror#8 |
| mov @t[2],@t[2],ror#8 |
| mov @x[14],@x[14],ror#8 |
| str @t[3], [sp,#4*(16+15)] |
| mov @t[3],#10 |
| b .Loop_neon |
| |
| .align 4 |
| .Loop_neon: |
| subs @t[3],@t[3],#1 |
| ___ |
# Body of .Loop_neon: an even round followed by an odd round, each computed
# for three NEON blocks (threads 0-2) and one scalar block (thread 3).  The
# streams are interleaved one instruction at a time: every NEON instruction
# is followed by one scalar instruction.  The scalar stream is shorter, and
# eval of an exhausted shift is a no-op.
my (@thread0,@thread1,@thread2,@thread3);

for my $odd (0,1) {
    @thread0 = &NEONROUND($a0,$b0,$c0,$d0,$t0,$odd);
    @thread1 = &NEONROUND($a1,$b1,$c1,$d1,$t1,$odd);
    @thread2 = &NEONROUND($a2,$b2,$c2,$d2,$t2,$odd);
    @thread3 = $odd ? &ROUND(0,5,10,15) : &ROUND(0,4,8,12);

    for my $insn (@thread0) {
        eval($insn);
        eval(shift(@thread3));
        eval(shift(@thread1));
        eval(shift(@thread3));
        eval(shift(@thread2));
        eval(shift(@thread3));
    }
}
# Close .Loop_neon and emit the rest of ChaCha20_neon:
#  - key material is accumulated into the three NEON blocks;
#  - with at least 64*4 bytes left, all four blocks (three NEON, one scalar)
#    are XORed with the input and stored, then the outer loop repeats;
#  - .Lbreak_neon switches to the integer-only frame and jumps into .Loop
#    once no more than 64*2 bytes remain;
#  - .Ltail_neon and its sub-cases write the remaining whole blocks, stage
#    the next key-stream block on the stack and finish byte by byte in
#    .Loop_tail_neon;
#  - .Ldone_neon restores d8-d15 and the integer registers.
| $code.=<<___; |
| bne .Loop_neon |
| |
| add @t[3],sp,#32 |
| vld1.32 {$t0-$t1},[sp] @ load key material |
| vld1.32 {$t2-$t3},[@t[3]] |
| |
| ldr @t[3],[sp,#4*(32+2)] @ load len |
| |
| str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store |
| str @t[1], [sp,#4*(16+9)] |
| str @x[12],[sp,#4*(16+12)] |
| str @t[2], [sp,#4*(16+13)] |
| str @x[14],[sp,#4*(16+14)] |
| |
| @ at this point we have first half of 512-bit result in |
| @ @x[0-7] and second half at sp+4*(16+8) |
| |
| ldr r12,[sp,#4*(32+1)] @ load inp |
| ldr r14,[sp,#4*(32+0)] @ load out |
| |
| vadd.i32 $a0,$a0,$t0 @ accumulate key material |
| vadd.i32 $a1,$a1,$t0 |
| vadd.i32 $a2,$a2,$t0 |
| vldr $t0#lo,[sp,#4*(16+0)] @ one |
| |
| vadd.i32 $b0,$b0,$t1 |
| vadd.i32 $b1,$b1,$t1 |
| vadd.i32 $b2,$b2,$t1 |
| vldr $t1#lo,[sp,#4*(16+2)] @ two |
| |
| vadd.i32 $c0,$c0,$t2 |
| vadd.i32 $c1,$c1,$t2 |
| vadd.i32 $c2,$c2,$t2 |
| vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 |
| vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 |
| |
| vadd.i32 $d0,$d0,$t3 |
| vadd.i32 $d1,$d1,$t3 |
| vadd.i32 $d2,$d2,$t3 |
| |
| cmp @t[3],#64*4 |
| blo .Ltail_neon |
| |
| vld1.8 {$t0-$t1},[r12]! @ load input |
| mov @t[3],sp |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 @ xor with input |
| veor $b0,$b0,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a1,$a1,$t0 |
| vst1.8 {$a0-$b0},[r14]! @ store output |
| veor $b1,$b1,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c1,$c1,$t2 |
| vst1.8 {$c0-$d0},[r14]! |
| veor $d1,$d1,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a2,$a2,$t0 |
| vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration |
| veor $t0#hi,$t0#hi,$t0#hi |
| vldr $t0#lo,[sp,#4*(16+4)] @ four |
| veor $b2,$b2,$t1 |
| vld1.32 {$c0-$d0},[@t[3]] |
| veor $c2,$c2,$t2 |
| vst1.8 {$a1-$b1},[r14]! |
| veor $d2,$d2,$t3 |
| vst1.8 {$c1-$d1},[r14]! |
| |
| vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value |
| vldr $t0#lo,[sp,#4*(16+0)] @ one |
| |
| ldmia sp,{@t[0]-@t[3]} @ load key material |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| ldr @t[0],[r12],#16 @ load input |
| vst1.8 {$a2-$b2},[r14]! |
| add @x[1],@x[1],@t[1] |
| ldr @t[1],[r12,#-12] |
| vst1.8 {$c2-$d2},[r14]! |
| add @x[2],@x[2],@t[2] |
| ldr @t[2],[r12,#-8] |
| add @x[3],@x[3],@t[3] |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| eor @x[0],@x[0],@t[0] @ xor with input |
| add @t[0],sp,#4*(4) |
| eor @x[1],@x[1],@t[1] |
| str @x[0],[r14],#16 @ store output |
| eor @x[2],@x[2],@t[2] |
| str @x[1],[r14,#-12] |
| eor @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@t[0],@x[4],ror#13 @ accumulate key material |
| ldr @t[0],[r12],#16 @ load input |
| add @x[5],@t[1],@x[5],ror#13 |
| ldr @t[1],[r12,#-12] |
| add @x[6],@t[2],@x[6],ror#13 |
| ldr @t[2],[r12,#-8] |
| add @x[7],@t[3],@x[7],ror#13 |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| eor @x[4],@x[4],@t[0] |
| add @t[0],sp,#4*(8) |
| eor @x[5],@x[5],@t[1] |
| str @x[4],[r14],#16 @ store output |
| eor @x[6],@x[6],@t[2] |
| str @x[5],[r14,#-12] |
| eor @x[7],@x[7],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[6],[r14,#-8] |
| add @x[0],sp,#4*(16+8) |
| str @x[7],[r14,#-4] |
| |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| ldr @t[0],[r12],#16 @ load input |
| add @x[1],@x[1],@t[1] |
| ldr @t[1],[r12,#-12] |
| # ifdef __thumb2__ |
| it hi |
| # endif |
| strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it |
| add @x[2],@x[2],@t[2] |
| ldr @t[2],[r12,#-8] |
| # ifdef __thumb2__ |
| it hi |
| # endif |
| strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it |
| add @x[3],@x[3],@t[3] |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| # endif |
| eor @x[0],@x[0],@t[0] |
| add @t[0],sp,#4*(12) |
| eor @x[1],@x[1],@t[1] |
| str @x[0],[r14],#16 @ store output |
| eor @x[2],@x[2],@t[2] |
| str @x[1],[r14,#-12] |
| eor @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| str @x[2],[r14,#-8] |
| str @x[3],[r14,#-4] |
| |
| add @x[4],@t[0],@x[4],ror#24 @ accumulate key material |
| add @t[0],@t[0],#4 @ next counter value |
| add @x[5],@t[1],@x[5],ror#24 |
| str @t[0],[sp,#4*(12)] @ save next counter value |
| ldr @t[0],[r12],#16 @ load input |
| add @x[6],@t[2],@x[6],ror#24 |
| add @x[4],@x[4],#3 @ counter+3 |
| ldr @t[1],[r12,#-12] |
| add @x[7],@t[3],@x[7],ror#24 |
| ldr @t[2],[r12,#-8] |
| ldr @t[3],[r12,#-4] |
| # ifdef __ARMEB__ |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| eor @x[4],@x[4],@t[0] |
| # ifdef __thumb2__ |
| it hi |
| # endif |
| ldrhi @t[0],[sp,#4*(32+2)] @ re-load len |
| eor @x[5],@x[5],@t[1] |
| eor @x[6],@x[6],@t[2] |
| str @x[4],[r14],#16 @ store output |
| eor @x[7],@x[7],@t[3] |
| str @x[5],[r14,#-12] |
| sub @t[3],@t[0],#64*4 @ len-=64*4 |
| str @x[6],[r14,#-8] |
| str @x[7],[r14,#-4] |
| bhi .Loop_neon_outer |
| |
| b .Ldone_neon |
| |
| .align 4 |
| .Lbreak_neon: |
| @ harmonize NEON and integer-only stack frames: load data |
| @ from NEON frame, but save to integer-only one; distance |
| @ between the two is 4*(32+4+16-32)=4*(20). |
| |
| str @t[3], [sp,#4*(20+32+2)] @ save len |
| add @t[3],sp,#4*(32+4) |
| str r12, [sp,#4*(20+32+1)] @ save inp |
| str r14, [sp,#4*(20+32+0)] @ save out |
| |
| ldr @x[12],[sp,#4*(16+10)] |
| ldr @x[14],[sp,#4*(16+11)] |
| vldmia @t[3],{d8-d15} @ fulfill ABI requirement |
| str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" |
| str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" |
| |
| ldr @t[3], [sp,#4*(15)] |
| mov @x[4],@x[4],ror#19 @ twist b[0..3] |
| ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load |
| mov @x[5],@x[5],ror#19 |
| ldr @t[2], [sp,#4*(13)] |
| mov @x[6],@x[6],ror#19 |
| ldr @x[14],[sp,#4*(14)] |
| mov @x[7],@x[7],ror#19 |
| mov @t[3],@t[3],ror#8 @ twist d[0..3] |
| mov @x[12],@x[12],ror#8 |
| mov @t[2],@t[2],ror#8 |
| mov @x[14],@x[14],ror#8 |
| str @t[3], [sp,#4*(20+16+15)] |
| add @t[3],sp,#4*(20) |
| vst1.32 {$a0-$b0},[@t[3]]! @ copy key |
| add sp,sp,#4*(20) @ switch frame |
| vst1.32 {$c0-$d0},[@t[3]] |
| mov @t[3],#10 |
| b .Loop @ go integer-only |
| |
| .align 4 |
| .Ltail_neon: |
| cmp @t[3],#64*3 |
| bhs .L192_or_more_neon |
| cmp @t[3],#64*2 |
| bhs .L128_or_more_neon |
| cmp @t[3],#64*1 |
| bhs .L64_or_more_neon |
| |
| add @t[0],sp,#4*(8) |
| vst1.8 {$a0-$b0},[sp] |
| add @t[2],sp,#4*(0) |
| vst1.8 {$c0-$d0},[@t[0]] |
| b .Loop_tail_neon |
| |
| .align 4 |
| .L64_or_more_neon: |
| vld1.8 {$t0-$t1},[r12]! |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 |
| veor $b0,$b0,$t1 |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vst1.8 {$a0-$b0},[r14]! |
| vst1.8 {$c0-$d0},[r14]! |
| |
| beq .Ldone_neon |
| |
| add @t[0],sp,#4*(8) |
| vst1.8 {$a1-$b1},[sp] |
| add @t[2],sp,#4*(0) |
| vst1.8 {$c1-$d1},[@t[0]] |
| sub @t[3],@t[3],#64*1 @ len-=64*1 |
| b .Loop_tail_neon |
| |
| .align 4 |
| .L128_or_more_neon: |
| vld1.8 {$t0-$t1},[r12]! |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 |
| veor $b0,$b0,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a1,$a1,$t0 |
| veor $b1,$b1,$t1 |
| vst1.8 {$a0-$b0},[r14]! |
| veor $c1,$c1,$t2 |
| vst1.8 {$c0-$d0},[r14]! |
| veor $d1,$d1,$t3 |
| vst1.8 {$a1-$b1},[r14]! |
| vst1.8 {$c1-$d1},[r14]! |
| |
| beq .Ldone_neon |
| |
| add @t[0],sp,#4*(8) |
| vst1.8 {$a2-$b2},[sp] |
| add @t[2],sp,#4*(0) |
| vst1.8 {$c2-$d2},[@t[0]] |
| sub @t[3],@t[3],#64*2 @ len-=64*2 |
| b .Loop_tail_neon |
| |
| .align 4 |
| .L192_or_more_neon: |
| vld1.8 {$t0-$t1},[r12]! |
| vld1.8 {$t2-$t3},[r12]! |
| veor $a0,$a0,$t0 |
| veor $b0,$b0,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c0,$c0,$t2 |
| veor $d0,$d0,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a1,$a1,$t0 |
| veor $b1,$b1,$t1 |
| vld1.8 {$t0-$t1},[r12]! |
| veor $c1,$c1,$t2 |
| vst1.8 {$a0-$b0},[r14]! |
| veor $d1,$d1,$t3 |
| vld1.8 {$t2-$t3},[r12]! |
| |
| veor $a2,$a2,$t0 |
| vst1.8 {$c0-$d0},[r14]! |
| veor $b2,$b2,$t1 |
| vst1.8 {$a1-$b1},[r14]! |
| veor $c2,$c2,$t2 |
| vst1.8 {$c1-$d1},[r14]! |
| veor $d2,$d2,$t3 |
| vst1.8 {$a2-$b2},[r14]! |
| vst1.8 {$c2-$d2},[r14]! |
| |
| beq .Ldone_neon |
| |
| ldmia sp,{@t[0]-@t[3]} @ load key material |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @t[0],sp,#4*(4) |
| add @x[1],@x[1],@t[1] |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| |
| add @x[4],@t[0],@x[4],ror#13 @ accumulate key material |
| add @t[0],sp,#4*(8) |
| add @x[5],@t[1],@x[5],ror#13 |
| add @x[6],@t[2],@x[6],ror#13 |
| add @x[7],@t[3],@x[7],ror#13 |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| stmia sp,{@x[0]-@x[7]} |
| add @x[0],sp,#4*(16+8) |
| |
| ldmia @x[0],{@x[0]-@x[7]} @ load second half |
| |
| add @x[0],@x[0],@t[0] @ accumulate key material |
| add @t[0],sp,#4*(12) |
| add @x[1],@x[1],@t[1] |
| add @x[2],@x[2],@t[2] |
| add @x[3],@x[3],@t[3] |
| ldmia @t[0],{@t[0]-@t[3]} @ load key material |
| |
| add @x[4],@t[0],@x[4],ror#24 @ accumulate key material |
| add @t[0],sp,#4*(8) |
| add @x[5],@t[1],@x[5],ror#24 |
| add @x[4],@x[4],#3 @ counter+3 |
| add @x[6],@t[2],@x[6],ror#24 |
| add @x[7],@t[3],@x[7],ror#24 |
| ldr @t[3],[sp,#4*(32+2)] @ re-load len |
| # ifdef __ARMEB__ |
| rev @x[0],@x[0] |
| rev @x[1],@x[1] |
| rev @x[2],@x[2] |
| rev @x[3],@x[3] |
| rev @x[4],@x[4] |
| rev @x[5],@x[5] |
| rev @x[6],@x[6] |
| rev @x[7],@x[7] |
| # endif |
| stmia @t[0],{@x[0]-@x[7]} |
| add @t[2],sp,#4*(0) |
| sub @t[3],@t[3],#64*3 @ len-=64*3 |
| |
| .Loop_tail_neon: |
| ldrb @t[0],[@t[2]],#1 @ read buffer on stack |
| ldrb @t[1],[r12],#1 @ read input |
| subs @t[3],@t[3],#1 |
| eor @t[0],@t[0],@t[1] |
| strb @t[0],[r14],#1 @ store output |
| bne .Loop_tail_neon |
| |
| .Ldone_neon: |
| add sp,sp,#4*(32+4) |
| vldmia sp,{d8-d15} |
| add sp,sp,#4*(16+3) |
| ldmia sp!,{r4-r11,pc} |
| .size ChaCha20_neon,.-ChaCha20_neon |
| # ifndef __KERNEL__ |
| .comm OPENSSL_armcap_P,4,4 |
| # endif |
| #endif |
| ___ |
| }}} |
| |
# Copy this script's leading comment block (everything after the #! line up
# to the first line that is neither a comment nor empty) into the output,
# with the leading '#' turned into the assembler comment character '@'.
# This is best-effort: if the script cannot be re-read the header is simply
# omitted.
if (open(my $self, "<", $0)) {
    while (<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
}

# Post-process and print the generated code line by line.
foreach (split("\n",$code)) {
    # `expr` is replaced by the value of the Perl expression.
    s/\`([^\`]*)\`/eval $1/geo;

    # qN#lo / qN#hi name the two halves of a q register: d(2N) / d(2N+1).
    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

    print $_,"\n";
}

# STDOUT may be a file or a pipe to arm-xlate.pl; buffered write errors and a
# failing translator only show up here, so do not ignore the result.
close STDOUT or die "error closing STDOUT: $!";