net/wireguard/crypto/zinc/chacha20/chacha20-arm.pl - LeafOS-Devices/android_kernel_samsung_universal7904 - Gitiles

 #!/usr/bin/env perl
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 #
 # This code is taken from the OpenSSL project but the author, Andy Polyakov,
 # has relicensed it under the licenses specified in the SPDX header above.
 # The original headers, including the original license headers, are
 # included below for completeness.
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # December 2014
 #
 # ChaCha20 for ARMv4.
 #
 # September 2018
 #
 # Improve scalar performance per Eric Biggers' suggestion to eliminate
 # separate rotates. This requires b[0..3] and d[0..3] to be maintained
 # pre-rotated, hence odd twists prior inner loop and when accumulating
 # key material. Since amount of instructions is reduced as result, even
 # NEON performance is improved somewhat, most notably by ~9% on low-end
 # Cortex-A5/A7. Full unroll was shown to provide even better scalar
 # performance on Cortex-A5/A7, naturally at the cost of manyfold size
 # increase. We let it be. Oversized code works in benchmarks, but is not
 # necessarily optimal in real life, when it's likely to be out-of-cache
 # upon entry and evict significant part of cache upon completion.
 #
 # Performance in cycles per byte out of large buffer.
 #
 #			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
 #
 # Cortex-A5		14.2(*)/+160%   21.8        12.9(**)
 # Cortex-A8		10.2(*)/+190%   13.9        6.10
 # Cortex-A9		10.8(*)/+150%   14.3        6.50
 # Cortex-A15		11.0/+40%       16.0        4.90
 # Snapdragon S4		13.9(***)/+90%  13.6        4.90
 #
 # (*)	most "favourable" result for aligned data on little-endian
 #	processor, result for misaligned data is 10-15% lower;
 # (**)	pure 4xNEON [with "vertical" layout] was shown to provide ~8%
 #	better performance on Cortex-A5/A7, but not on others;
 # (***)	it's 17% slower than original, trade-off is considered
 #	acceptable, because of improvement on others, specifically
 #	+36% on Cortex-A5/A7 and +20% on Cortex-A9;

 # Command line is "[flavour] [output]".  An argument that looks like a
 # file name (word characters followed by an extension) is taken as the
 # output file; a first argument that does not look like one is the
 # perlasm flavour that gets handed to arm-xlate.pl.
 $flavour = shift;
 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

 if ($flavour && $flavour ne "void") {
     # Look for the translator next to this script first, then under
     # ../../perlasm.  A script path without a directory part yields an
     # empty prefix instead of relying on whatever $1 happened to hold.
     $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
     die "can't locate arm-xlate.pl";

     # Everything printed from here on is piped through the translator;
     # failing to start it must not go unnoticed.
     open STDOUT,"| \"$^X\" $xlate $flavour $output"
         or die "can't call $xlate: $!";
 } else {
     # Raw output.  Without an output file keep the inherited STDOUT:
     # re-opening it on an empty name fails and leaves STDOUT closed,
     # which silently discards the generated code.
     if ($output) {
         open STDOUT,'>',$output or die "can't open $output: $!";
     }
 }

 sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
 {
     # Any call to an undefined sub lands here and is turned into one line
     # of assembly appended to $code.  The mnemonic is the unqualified sub
     # name, with its first underscore standing for the dot of a type
     # suffix (vadd_i32 -> vadd.i32).
     (my $mnemonic = $AUTOLOAD) =~ s/.*:://;
     $mnemonic =~ s/_/\./;

     # A final operand that survives a round trip through numeric
     # conversion is a plain number and becomes an immediate (#N).
     my $last = pop;
     $last = "#$last" if ($last*1 eq $last);

     $code .= "\t$mnemonic\t".join(',',@_,$last)."\n";
 }

 # Integer register allocation.  @x is indexed by ChaCha state word:
 # words 0..7, 12 and 14 have a register of their own (r0-r7, r12, r14),
 # the "rx" entries are placeholders for words that are not permanently
 # held in a register.  @t names the four scratch registers r8-r11.
 my @x = map { "r$_" } (0..7, ("x") x 4, 12, "x", 14, "x");
 my @t = map { "r$_" } (8..11);

 # ROUND(a0,b0,c0,d0) returns the list of perlasm statements (strings the
 # caller passes to eval) for one scalar round.  The arguments are the
 # state word indices of the first quarter-round; the other three
 # quarter-rounds are derived from them.  Words are visited in this order:
 #
 #       a   b   c   d
 #
 #       0   4   8  12 < even round
 #       1   5   9  13
 #       2   6  10  14
 #       3   7  11  15
 #       0   5  10  15 < odd round
 #       1   6  11  12
 #       2   7   8  13
 #       3   4   9  14
 #
 # All 'a' and 'b' words stay in registers, @x[0..7]; the 'c' words and
 # two of the 'd' words are kept in memory.  The two 'c' words that are
 # live at any time sit in @t[0..1]; because the same pair of 'c's is
 # used by neighbouring quarter-round pairs across rounds, they are
 # stored and the other pair loaded only once per round, in the middle,
 # and never at the beginning or end.  Words 13 and 15 are the 'd's
 # chosen for off-loading (sharing @t[2]) since they are reused by the
 # next pair of rounds, which makes each load count more.
 sub ROUND {
     my ($a0,$b0,$c0,$d0) = @_;

     # Successor of an index inside its aligned group of four, wrapping.
     my $step = sub { my $i = shift; ($i & ~3) + (($i + 1) & 3) };
     my ($a1,$b1,$c1,$d1) = map { $step->($_) } ($a0,$b0,$c0,$d0);
     my ($a2,$b2,$c2,$d2) = map { $step->($_) } ($a1,$b1,$c1,$d1);
     my ($a3,$b3,$c3,$d3) = map { $step->($_) } ($a2,$b2,$c2,$d2);

     my $odd = $d0 & 1;
     my ($xc,$xc_) = @t[0,1];

     # Registers holding the two 'd' words of the first quarter-round
     # pair ($xd,$xd_) and of the second pair ($yd,$yd_).  Exactly one
     # of each pair is the shared scratch register @t[2].
     my ($xd,$xd_) = $odd ? ($t[2],$x[$d1]) : ($x[$d0],$t[2]);
     my ($yd,$yd_) = $odd ? ($xd,$x[$d3]) : ($x[$d2],$xd_);

     # First pair of quarter-rounds, up to the last 'c' update.
     my @ret = (
 	"&add\t($x[$a0],$x[$a0],$x[$b0],'ror#13')",
 	"&add\t($x[$a1],$x[$a1],$x[$b1],'ror#13')",
 	"&eor\t($xd,$x[$a0],$xd,'ror#24')",
 	"&eor\t($xd_,$x[$a1],$xd_,'ror#24')",

 	"&add\t($xc,$xc,$xd,'ror#16')",
 	"&add\t($xc_,$xc_,$xd_,'ror#16')",
 	"&eor\t($x[$b0],$xc, $x[$b0],'ror#13')",
 	"&eor\t($x[$b1],$xc_,$x[$b1],'ror#13')",

 	"&add\t($x[$a0],$x[$a0],$x[$b0],'ror#20')",
 	"&add\t($x[$a1],$x[$a1],$x[$b1],'ror#20')",
 	"&eor\t($xd,$x[$a0],$xd,'ror#16')",
 	"&eor\t($xd_,$x[$a1],$xd_,'ror#16')",
     );

     # Last 'c' update of the first pair, wrapped around the hand-over of
     # the scratch 'd' register: its current word is stored before, and
     # the word needed by the second pair loaded after, the add using it.
     my $add_c  = "&add\t($xc,$xc,$xd,'ror#24')";
     my $add_c_ = "&add\t($xc_,$xc_,$xd_,'ror#24')";
     push @ret, $odd
 	? ( "&str\t($xd,'[sp,#4*(16+$d0)]')",	$add_c,
 	    "&ldr\t($xd,'[sp,#4*(16+$d2)]')",	$add_c_ )
 	: ( $add_c,	"&str\t($xd_,'[sp,#4*(16+$d1)]')",
 	    $add_c_,	"&ldr\t($xd_,'[sp,#4*(16+$d3)]')" );

     # Swap the 'c' pair through memory, then run the second pair of
     # quarter-rounds to completion.
     push @ret, (
 	"&str\t($xc,'[sp,#4*(16+$c0)]')",
 	"&eor\t($x[$b0],$x[$b0],$xc,'ror#12')",
 	"&str\t($xc_,'[sp,#4*(16+$c1)]')",
 	"&eor\t($x[$b1],$x[$b1],$xc_,'ror#12')",

 	"&ldr\t($xc,'[sp,#4*(16+$c2)]')",
 	"&add\t($x[$a2],$x[$a2],$x[$b2],'ror#13')",
 	"&ldr\t($xc_,'[sp,#4*(16+$c3)]')",
 	"&add\t($x[$a3],$x[$a3],$x[$b3],'ror#13')",
 	"&eor\t($yd,$x[$a2],$yd,'ror#24')",
 	"&eor\t($yd_,$x[$a3],$yd_,'ror#24')",

 	"&add\t($xc,$xc,$yd,'ror#16')",
 	"&add\t($xc_,$xc_,$yd_,'ror#16')",
 	"&eor\t($x[$b2],$xc, $x[$b2],'ror#13')",
 	"&eor\t($x[$b3],$xc_,$x[$b3],'ror#13')",

 	"&add\t($x[$a2],$x[$a2],$x[$b2],'ror#20')",
 	"&add\t($x[$a3],$x[$a3],$x[$b3],'ror#20')",
 	"&eor\t($yd,$x[$a2],$yd,'ror#16')",
 	"&eor\t($yd_,$x[$a3],$yd_,'ror#16')",

 	"&add\t($xc,$xc,$yd,'ror#24')",
 	"&add\t($xc_,$xc_,$yd_,'ror#24')",
 	"&eor\t($x[$b2],$x[$b2],$xc,'ror#12')",
 	"&eor\t($x[$b3],$x[$b3],$xc_,'ror#12')",
     );

     return @ret;
 }

 $code.=<<___;
 #ifndef __KERNEL__
 # include "arm_arch.h"
 #else
 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
 # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
 # define ChaCha20_ctr32 chacha20_arm_cryptogams
 # define ChaCha20_neon  chacha20_neon
 #endif

 .text
 #if defined(__thumb2__) || defined(__clang__)
 .syntax	unified
 # define ldrhsb	ldrbhs
 #endif
 #if defined(__thumb2__)
 .thumb
 #else
 .code	32
 #endif

 .align	5
 .Lsigma:
 .long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
 .Lone:
 .long	1,0,0,0
 .Lrot8:
 .long	0x02010003,0x06050407
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
 .word   OPENSSL_armcap_P-.LChaCha20_ctr32
 #else
 .word	-1
 #endif

 .globl	ChaCha20_ctr32
 .type	ChaCha20_ctr32,%function
 .align	5
 ChaCha20_ctr32:
 .LChaCha20_ctr32:
 	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
 	stmdb	sp!,{r0-r2,r4-r11,lr}
 #if __ARM_ARCH__<7 && !defined(__thumb2__)
 	sub	r14,pc,#16		@ ChaCha20_ctr32
 #else
 	adr	r14,.LChaCha20_ctr32
 #endif
 	cmp	r2,#0			@ len==0?
 #ifdef	__thumb2__
 	itt	eq
 #endif
 	addeq	sp,sp,#4*3
 	beq	.Lno_data
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 	cmp	r2,#192			@ test len
 	bls	.Lshort
 	ldr	r4,[r14,#-24]
 	ldr	r4,[r14,r4]
 # ifdef	__APPLE__
 	ldr	r4,[r4]
 # endif
 	tst	r4,#ARMV7_NEON
 	bne	.LChaCha20_neon
 .Lshort:
 #endif
 	ldmia	r12,{r4-r7}		@ load counter and nonce
 	sub	sp,sp,#4*(16)		@ off-load area
 	sub	r14,r14,#64		@ .Lsigma
 	stmdb	sp!,{r4-r7}		@ copy counter and nonce
 	ldmia	r3,{r4-r11}		@ load key
 	ldmia	r14,{r0-r3}		@ load sigma
 	stmdb	sp!,{r4-r11}		@ copy key
 	stmdb	sp!,{r0-r3}		@ copy sigma
 	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
 	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
 	b	.Loop_outer_enter

 .align	4
 .Loop_outer:
 	ldmia	sp,{r0-r9}		@ load key material
 	str	@t[3],[sp,#4*(32+2)]	@ save len
 	str	r12,  [sp,#4*(32+1)]	@ save inp
 	str	r14,  [sp,#4*(32+0)]	@ save out
 .Loop_outer_enter:
 	ldr	@t[3], [sp,#4*(15)]
 	 mov	@x[4],@x[4],ror#19	@ twist b[0..3]
 	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
 	 mov	@x[5],@x[5],ror#19
 	ldr	@t[2], [sp,#4*(13)]
 	 mov	@x[6],@x[6],ror#19
 	ldr	@x[14],[sp,#4*(14)]
 	 mov	@x[7],@x[7],ror#19
 	mov	@t[3],@t[3],ror#8	@ twist d[0..3]
 	mov	@x[12],@x[12],ror#8
 	mov	@t[2],@t[2],ror#8
 	mov	@x[14],@x[14],ror#8
 	str	@t[3], [sp,#4*(16+15)]
 	mov	@t[3],#10
 	b	.Loop

 .align	4
 .Loop:
 	subs	@t[3],@t[3],#1
 ___
 	# Body of .Loop: the statements of one even round followed by those
 	# of one odd round; each string is eval-ed, which appends the
 	# corresponding instruction to $code.
 	foreach my $insn (&ROUND(0, 4, 8,12), &ROUND(0, 5,10,15)) { eval($insn); }
 $code.=<<___;
 	bne	.Loop

 	ldr	@t[3],[sp,#4*(32+2)]	@ load len

 	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
 	str	@t[1], [sp,#4*(16+9)]
 	str	@x[12],[sp,#4*(16+12)]
 	str	@t[2], [sp,#4*(16+13)]
 	str	@x[14],[sp,#4*(16+14)]

 	@ at this point we have first half of 512-bit result in
 	@ @x[0-7] and second half at sp+4*(16+8)

 	cmp	@t[3],#64		@ done yet?
 #ifdef	__thumb2__
 	itete	lo
 #endif
 	addlo	r12,sp,#4*(0)		@ shortcut or ...
 	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
 	addlo	r14,sp,#4*(0)		@ shortcut or ...
 	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

 	ldr	@t[0],[sp,#4*(0)]	@ load key material
 	ldr	@t[1],[sp,#4*(1)]

 #if __ARM_ARCH__>=6 || !defined(__ARMEB__)
 # if __ARM_ARCH__<7
 	orr	@t[2],r12,r14
 	tst	@t[2],#3		@ are input and output aligned?
 	ldr	@t[2],[sp,#4*(2)]
 	bne	.Lunaligned
 	cmp	@t[3],#64		@ restore flags
 # else
 	ldr	@t[2],[sp,#4*(2)]
 # endif
 	ldr	@t[3],[sp,#4*(3)]

 	add	@x[0],@x[0],@t[0]	@ accumulate key material
 	add	@x[1],@x[1],@t[1]
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[0],[r12],#16		@ load input
 	ldrhs	@t[1],[r12,#-12]

 	add	@x[2],@x[2],@t[2]
 	add	@x[3],@x[3],@t[3]
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[2],[r12,#-8]
 	ldrhs	@t[3],[r12,#-4]
 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
 	rev	@x[0],@x[0]
 	rev	@x[1],@x[1]
 	rev	@x[2],@x[2]
 	rev	@x[3],@x[3]
 # endif
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[0],@x[0],@t[0]	@ xor with input
 	eorhs	@x[1],@x[1],@t[1]
 	 add	@t[0],sp,#4*(4)
 	str	@x[0],[r14],#16		@ store output
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[2],@x[2],@t[2]
 	eorhs	@x[3],@x[3],@t[3]
 	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
 	str	@x[1],[r14,#-12]
 	str	@x[2],[r14,#-8]
 	str	@x[3],[r14,#-4]

 	add	@x[4],@t[0],@x[4],ror#13 @ accumulate key material
 	add	@x[5],@t[1],@x[5],ror#13
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[0],[r12],#16		@ load input
 	ldrhs	@t[1],[r12,#-12]
 	add	@x[6],@t[2],@x[6],ror#13
 	add	@x[7],@t[3],@x[7],ror#13
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[2],[r12,#-8]
 	ldrhs	@t[3],[r12,#-4]
 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
 	rev	@x[4],@x[4]
 	rev	@x[5],@x[5]
 	rev	@x[6],@x[6]
 	rev	@x[7],@x[7]
 # endif
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[4],@x[4],@t[0]
 	eorhs	@x[5],@x[5],@t[1]
 	 add	@t[0],sp,#4*(8)
 	str	@x[4],[r14],#16		@ store output
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[6],@x[6],@t[2]
 	eorhs	@x[7],@x[7],@t[3]
 	str	@x[5],[r14,#-12]
 	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
 	str	@x[6],[r14,#-8]
 	 add	@x[0],sp,#4*(16+8)
 	str	@x[7],[r14,#-4]

 	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

 	add	@x[0],@x[0],@t[0]	@ accumulate key material
 	add	@x[1],@x[1],@t[1]
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[0],[r12],#16		@ load input
 	ldrhs	@t[1],[r12,#-12]
 # ifdef	__thumb2__
 	itt	hi
 # endif
 	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
 	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
 	add	@x[2],@x[2],@t[2]
 	add	@x[3],@x[3],@t[3]
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[2],[r12,#-8]
 	ldrhs	@t[3],[r12,#-4]
 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
 	rev	@x[0],@x[0]
 	rev	@x[1],@x[1]
 	rev	@x[2],@x[2]
 	rev	@x[3],@x[3]
 # endif
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[0],@x[0],@t[0]
 	eorhs	@x[1],@x[1],@t[1]
 	 add	@t[0],sp,#4*(12)
 	str	@x[0],[r14],#16		@ store output
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[2],@x[2],@t[2]
 	eorhs	@x[3],@x[3],@t[3]
 	str	@x[1],[r14,#-12]
 	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
 	str	@x[2],[r14,#-8]
 	str	@x[3],[r14,#-4]

 	add	@x[4],@t[0],@x[4],ror#24 @ accumulate key material
 	add	@x[5],@t[1],@x[5],ror#24
 # ifdef	__thumb2__
 	itt	hi
 # endif
 	 addhi	@t[0],@t[0],#1		@ next counter value
 	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[0],[r12],#16		@ load input
 	ldrhs	@t[1],[r12,#-12]
 	add	@x[6],@t[2],@x[6],ror#24
 	add	@x[7],@t[3],@x[7],ror#24
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhs	@t[2],[r12,#-8]
 	ldrhs	@t[3],[r12,#-4]
 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
 	rev	@x[4],@x[4]
 	rev	@x[5],@x[5]
 	rev	@x[6],@x[6]
 	rev	@x[7],@x[7]
 # endif
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[4],@x[4],@t[0]
 	eorhs	@x[5],@x[5],@t[1]
 # ifdef	__thumb2__
 	 it	ne
 # endif
 	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	eorhs	@x[6],@x[6],@t[2]
 	eorhs	@x[7],@x[7],@t[3]
 	str	@x[4],[r14],#16		@ store output
 	str	@x[5],[r14,#-12]
 # ifdef	__thumb2__
 	it	hs
 # endif
 	 subhs	@t[3],@t[0],#64		@ len-=64
 	str	@x[6],[r14,#-8]
 	str	@x[7],[r14,#-4]
 	bhi	.Loop_outer

 	beq	.Ldone
 # if __ARM_ARCH__<7
 	b	.Ltail

 .align	4
 .Lunaligned:				@ unaligned endian-neutral path
 	cmp	@t[3],#64		@ restore flags
 # endif
 #endif
 #if __ARM_ARCH__<7
 	ldr	@t[3],[sp,#4*(3)]
 ___
 # Generate the byte-wise (unaligned, endian-neutral) output path.  Each
 # pass handles four state words, i.e. 16 bytes of the 64-byte block:
 # $i is the index of the first state word of the group, $j the index of
 # the register in @x[0..7] that holds it (the second half of the state
 # is reloaded into the same eight registers when $i==8).
 for ($i=0;$i<16;$i+=4) {
 my $j=$i&0x7;
 # b[0..3] ($i==4) and d[0..3] ($i==12) are kept pre-rotated, by ror#19
 # and ror#8 respectively, so the rotation is completed to a full 32 bits
 # while the key material is accumulated.
 my $twist="";
 if ($i==4)     { $twist = ",ror#13"; }
 elsif ($i==12) { $twist = ",ror#24"; }

 # @x[0] has already been stored at this point; point it at the second
 # half of the result for the ldmia emitted when $i==8.
 $code.=<<___	if ($i==4);
 	add	@x[0],sp,#4*(16+8)
___
 # Bring the second half of the state into @x[0..7].
 $code.=<<___	if ($i==8);
 	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
 # ifdef	__thumb2__
 	itt	hi
 # endif
 	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
 	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
 $code.=<<___;
 	add	@x[$j+0],@t[0],@x[$j+0]$twist	@ accumulate key material
___
 # Word 12 is the block counter: advance it only on the "hi" condition.
 $code.=<<___	if ($i==12);
 # ifdef	__thumb2__
 	itt	hi
 # endif
 	addhi	@t[0],@t[0],#1			@ next counter value
 	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
 # Accumulate the remaining three words, then xor and store the group one
 # byte at a time, shifting each word right by 8 between bytes.
 $code.=<<___;
 	add	@x[$j+1],@t[1],@x[$j+1]$twist
 	add	@x[$j+2],@t[2],@x[$j+2]$twist
 # ifdef	__thumb2__
 	itete	lo
 # endif
 	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
 	ldrhsb	@t[0],[r12],#16			@ ... load input
 	eorlo	@t[1],@t[1],@t[1]
 	ldrhsb	@t[1],[r12,#-12]

 	add	@x[$j+3],@t[3],@x[$j+3]$twist
 # ifdef	__thumb2__
 	itete	lo
 # endif
 	eorlo	@t[2],@t[2],@t[2]
 	ldrhsb	@t[2],[r12,#-8]
 	eorlo	@t[3],@t[3],@t[3]
 	ldrhsb	@t[3],[r12,#-4]

 	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
 	eor	@x[$j+1],@t[1],@x[$j+1]
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhsb	@t[0],[r12,#-15]		@ load more input
 	ldrhsb	@t[1],[r12,#-11]
 	eor	@x[$j+2],@t[2],@x[$j+2]
 	 strb	@x[$j+0],[r14],#16		@ store output
 	eor	@x[$j+3],@t[3],@x[$j+3]
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhsb	@t[2],[r12,#-7]
 	ldrhsb	@t[3],[r12,#-3]
 	 strb	@x[$j+1],[r14,#-12]
 	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
 	 strb	@x[$j+2],[r14,#-8]
 	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhsb	@t[0],[r12,#-14]		@ load more input
 	ldrhsb	@t[1],[r12,#-10]
 	 strb	@x[$j+3],[r14,#-4]
 	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
 	 strb	@x[$j+0],[r14,#-15]
 	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhsb	@t[2],[r12,#-6]
 	ldrhsb	@t[3],[r12,#-2]
 	 strb	@x[$j+1],[r14,#-11]
 	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
 	 strb	@x[$j+2],[r14,#-7]
 	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhsb	@t[0],[r12,#-13]		@ load more input
 	ldrhsb	@t[1],[r12,#-9]
 	 strb	@x[$j+3],[r14,#-3]
 	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
 	 strb	@x[$j+0],[r14,#-14]
 	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
 # ifdef	__thumb2__
 	itt	hs
 # endif
 	ldrhsb	@t[2],[r12,#-5]
 	ldrhsb	@t[3],[r12,#-1]
 	 strb	@x[$j+1],[r14,#-10]
 	 strb	@x[$j+2],[r14,#-6]
 	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
 	 strb	@x[$j+3],[r14,#-2]
 	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
 	 strb	@x[$j+0],[r14,#-13]
 	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
 	 strb	@x[$j+1],[r14,#-9]
 	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
 	 strb	@x[$j+2],[r14,#-5]
 	 strb	@x[$j+3],[r14,#-1]
___
 # Fetch the key material words for the next group, if there is one.
 $code.=<<___	if ($i<12);
 	add	@t[0],sp,#4*(4+$i)
 	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
 }
 $code.=<<___;
 # ifdef	__thumb2__
 	it	ne
 # endif
 	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
 # ifdef	__thumb2__
 	it	hs
 # endif
 	subhs	@t[3],@t[0],#64			@ len-=64
 	bhi	.Loop_outer

 	beq	.Ldone
 #endif

 .Ltail:
 	ldr	r12,[sp,#4*(32+1)]	@ load inp
 	add	@t[1],sp,#4*(0)
 	ldr	r14,[sp,#4*(32+0)]	@ load out

 .Loop_tail:
 	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
 	ldrb	@t[3],[r12],#1		@ read input
 	subs	@t[0],@t[0],#1
 	eor	@t[3],@t[3],@t[2]
 	strb	@t[3],[r14],#1		@ store output
 	bne	.Loop_tail

 .Ldone:
 	add	sp,sp,#4*(32+3)
 .Lno_data:
 #if __ARM_ARCH__>=5
 	ldmia	sp!,{r4-r11,pc}
 #else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.long	0xe12fff1e		@ interoperable with Thumb ISA:-)
 #endif
 .size	ChaCha20_ctr32,.-ChaCha20_ctr32
 ___

 {{{
 # NEON register names: rows a..d of three parallel states occupy q0-q11,
 # the temporaries $t0-$t3 are q12-q15.
 my ($a0,$b0,$c0,$d0,
     $a1,$b1,$c1,$d1,
     $a2,$b2,$c2,$d2,
     $t0,$t1,$t2,$t3) = map { "q$_" } 0..15;

 # vperm(dst,src,tbl) appends two vtbl.8 instructions to $code, one for
 # the #lo and one for the #hi half of dst/src, both looking up through
 # the #lo half of tbl.  It can replace vshr-by-24+vsli-by-8: that gives
 # ~3% improvement on Cortex-A5/A7, but hurts Cortex-A9 by 5% and
 # Snapdragon S4 by 14%!
 sub vperm()
 {
     my ($dst,$src,$tbl) = @_;
     $code .= join("", map { "\tvtbl.8\t$dst#$_,{$src#$_},$tbl#lo\n" } ("lo","hi"));
 }

 # NEONROUND(a,b,c,d,t,odd) returns the perlasm statements (strings for
 # the caller to eval) of one NEON round on the rows held in registers
 # a, b, c, d, using t as temporary.  The trailing odd flag is copied
 # into the vext_8 statements as text, so the rotation amount of rows a
 # and c (4 or 12 bytes) is only chosen when the statement is eval-ed.
 sub NEONROUND {
 my $odd = $_[-1];
 my ($a,$b,$c,$d,$tmp) = @_[0 .. $#_ - 1];

 	return (
 	# a += b; d ^= a; d <<<= 16
 	"&vadd_i32\t($a,$a,$b)",
 	"&veor\t\t($d,$d,$a)",
 	"&vrev32_16\t($d,$d)",	# vrot ($d,16)

 	# c += d; b ^= c; b <<<= 12
 	"&vadd_i32\t($c,$c,$d)",
 	"&veor\t\t($tmp,$b,$c)",
 	"&vshr_u32\t($b,$tmp,20)",
 	"&vsli_32\t($b,$tmp,12)",

 	# a += b; d ^= a; d <<<= 8
 	"&vadd_i32\t($a,$a,$b)",
 	"&veor\t\t($tmp,$d,$a)",
 	"&vshr_u32\t($d,$tmp,24)",
 	"&vsli_32\t($d,$tmp,8)",
 	#"&vperm	($d,$tmp,$t3)",

 	# c += d; b ^= c; b <<<= 7
 	"&vadd_i32\t($c,$c,$d)",
 	"&veor\t\t($tmp,$b,$c)",
 	"&vshr_u32\t($b,$tmp,25)",
 	"&vsli_32\t($b,$tmp,7)",

 	# rotate rows a, d and c by whole words for the next round
 	"&vext_8\t($a,$a,$a,$odd?4:12)",
 	"&vext_8\t($d,$d,$d,8)",
 	"&vext_8\t($c,$c,$c,$odd?12:4)"
 	);
 }

 $code.=<<___;
 #if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
 .arch	armv7-a
 .fpu	neon

 # ifdef __KERNEL__
 .globl	ChaCha20_neon
 @ For optimal performance it's appropriate for caller to enforce
 @ minimum input length, 193 bytes is suggested.
 # endif
 .type	ChaCha20_neon,%function
 .align	5
 ChaCha20_neon:
 	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
 	stmdb		sp!,{r0-r2,r4-r11,lr}
 .LChaCha20_neon:
 	adr		r14,.Lsigma
 	vstmdb		sp!,{d8-d15}		@ ABI spec says so
 	stmdb		sp!,{r0-r3}

 	vld1.32		{$b0-$c0},[r3]		@ load key
 	ldmia		r3,{r4-r11}		@ load key

 	sub		sp,sp,#4*(16+16)
 	vld1.32		{$d0},[r12]		@ load counter and nonce
 	add		r12,sp,#4*8
 	ldmia		r14,{r0-r3}		@ load sigma
 	vld1.32		{$a0},[r14]!		@ load sigma
 	vld1.32		{$t0},[r14]!		@ one
 	@ vld1.32	{$t3#lo},[r14]		@ rot8
 	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
 	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

 	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
 	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
 	vshl.i32	$t1#lo,$t0#lo,#1	@ two
 	vstr		$t0#lo,[sp,#4*(16+0)]
 	vshl.i32	$t2#lo,$t0#lo,#2	@ four
 	vstr		$t1#lo,[sp,#4*(16+2)]
 	vmov		$a1,$a0
 	vstr		$t2#lo,[sp,#4*(16+4)]
 	vmov		$a2,$a0
 	@ vstr		$t3#lo,[sp,#4*(16+6)]
 	vmov		$b1,$b0
 	vmov		$b2,$b0
 	b		.Loop_neon_enter

 .align	4
 .Loop_neon_outer:
 	ldmia		sp,{r0-r9}		@ load key material
 	cmp		@t[3],#64*2		@ if len<=64*2
 	bls		.Lbreak_neon		@ switch to integer-only
 	@ vldr		$t3#lo,[sp,#4*(16+6)]	@ rot8
 	vmov		$a1,$a0
 	str		@t[3],[sp,#4*(32+2)]	@ save len
 	vmov		$a2,$a0
 	str		r12,  [sp,#4*(32+1)]	@ save inp
 	vmov		$b1,$b0
 	str		r14,  [sp,#4*(32+0)]	@ save out
 	vmov		$b2,$b0
 .Loop_neon_enter:
 	ldr		@t[3], [sp,#4*(15)]
 	 mov		@x[4],@x[4],ror#19	@ twist b[0..3]
 	vadd.i32	$d1,$d0,$t0		@ counter+1
 	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
 	 mov		@x[5],@x[5],ror#19
 	vmov		$c1,$c0
 	ldr		@t[2], [sp,#4*(13)]
 	 mov		@x[6],@x[6],ror#19
 	vmov		$c2,$c0
 	ldr		@x[14],[sp,#4*(14)]
 	 mov		@x[7],@x[7],ror#19
 	vadd.i32	$d2,$d1,$t0		@ counter+2
 	add		@x[12],@x[12],#3	@ counter+3
 	mov		@t[3],@t[3],ror#8	@ twist d[0..3]
 	mov		@x[12],@x[12],ror#8
 	mov		@t[2],@t[2],ror#8
 	mov		@x[14],@x[14],ror#8
 	str		@t[3], [sp,#4*(16+15)]
 	mov		@t[3],#10
 	b		.Loop_neon

 .align	4
 .Loop_neon:
 	subs		@t[3],@t[3],#1
 ___
 	# Body of .Loop_neon: an even round followed by an odd round.  In
 	# each, the three NEON blocks and the integer block are generated
 	# as separate instruction lists and then interleaved: one
 	# instruction of each NEON list, each followed by one integer
 	# instruction.  Once the integer list runs dry, shift() yields
 	# undef and the corresponding eval emits nothing.
 	foreach my $odd (0,1) {
 		my @lane0 = &NEONROUND($a0,$b0,$c0,$d0,$t0,$odd);
 		my @lane1 = &NEONROUND($a1,$b1,$c1,$d1,$t1,$odd);
 		my @lane2 = &NEONROUND($a2,$b2,$c2,$d2,$t2,$odd);
 		my @ialu  = $odd ? &ROUND(0,5,10,15) : &ROUND(0,4,8,12);

 		foreach my $i (0..$#lane0) {
 			eval($lane0[$i]);	eval(shift(@ialu));
 			eval($lane1[$i]);	eval(shift(@ialu));
 			eval($lane2[$i]);	eval(shift(@ialu));
 		}
 	}
 $code.=<<___;
 	bne		.Loop_neon

 	add		@t[3],sp,#32
 	vld1.32		{$t0-$t1},[sp]		@ load key material
 	vld1.32		{$t2-$t3},[@t[3]]

 	ldr		@t[3],[sp,#4*(32+2)]	@ load len

 	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
 	str		@t[1], [sp,#4*(16+9)]
 	str		@x[12],[sp,#4*(16+12)]
 	str		@t[2], [sp,#4*(16+13)]
 	str		@x[14],[sp,#4*(16+14)]

 	@ at this point we have first half of 512-bit result in
 	@ @x[0-7] and second half at sp+4*(16+8)

 	ldr		r12,[sp,#4*(32+1)]	@ load inp
 	ldr		r14,[sp,#4*(32+0)]	@ load out

 	vadd.i32	$a0,$a0,$t0		@ accumulate key material
 	vadd.i32	$a1,$a1,$t0
 	vadd.i32	$a2,$a2,$t0
 	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

 	vadd.i32	$b0,$b0,$t1
 	vadd.i32	$b1,$b1,$t1
 	vadd.i32	$b2,$b2,$t1
 	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

 	vadd.i32	$c0,$c0,$t2
 	vadd.i32	$c1,$c1,$t2
 	vadd.i32	$c2,$c2,$t2
 	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
 	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

 	vadd.i32	$d0,$d0,$t3
 	vadd.i32	$d1,$d1,$t3
 	vadd.i32	$d2,$d2,$t3

 	cmp		@t[3],#64*4
 	blo		.Ltail_neon

 	vld1.8		{$t0-$t1},[r12]!	@ load input
 	 mov		@t[3],sp
 	vld1.8		{$t2-$t3},[r12]!
 	veor		$a0,$a0,$t0		@ xor with input
 	veor		$b0,$b0,$t1
 	vld1.8		{$t0-$t1},[r12]!
 	veor		$c0,$c0,$t2
 	veor		$d0,$d0,$t3
 	vld1.8		{$t2-$t3},[r12]!

 	veor		$a1,$a1,$t0
 	 vst1.8		{$a0-$b0},[r14]!	@ store output
 	veor		$b1,$b1,$t1
 	vld1.8		{$t0-$t1},[r12]!
 	veor		$c1,$c1,$t2
 	 vst1.8		{$c0-$d0},[r14]!
 	veor		$d1,$d1,$t3
 	vld1.8		{$t2-$t3},[r12]!

 	veor		$a2,$a2,$t0
 	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
 	 veor		$t0#hi,$t0#hi,$t0#hi
 	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
 	veor		$b2,$b2,$t1
 	 vld1.32	{$c0-$d0},[@t[3]]
 	veor		$c2,$c2,$t2
 	 vst1.8		{$a1-$b1},[r14]!
 	veor		$d2,$d2,$t3
 	 vst1.8		{$c1-$d1},[r14]!

 	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
 	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

 	ldmia		sp,{@t[0]-@t[3]}	@ load key material
 	add		@x[0],@x[0],@t[0]	@ accumulate key material
 	ldr		@t[0],[r12],#16		@ load input
 	 vst1.8		{$a2-$b2},[r14]!
 	add		@x[1],@x[1],@t[1]
 	ldr		@t[1],[r12,#-12]
 	 vst1.8		{$c2-$d2},[r14]!
 	add		@x[2],@x[2],@t[2]
 	ldr		@t[2],[r12,#-8]
 	add		@x[3],@x[3],@t[3]
 	ldr		@t[3],[r12,#-4]
 # ifdef	__ARMEB__
 	rev		@x[0],@x[0]
 	rev		@x[1],@x[1]
 	rev		@x[2],@x[2]
 	rev		@x[3],@x[3]
 # endif
 	eor		@x[0],@x[0],@t[0]	@ xor with input
 	 add		@t[0],sp,#4*(4)
 	eor		@x[1],@x[1],@t[1]
 	str		@x[0],[r14],#16		@ store output
 	eor		@x[2],@x[2],@t[2]
 	str		@x[1],[r14,#-12]
 	eor		@x[3],@x[3],@t[3]
 	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
 	str		@x[2],[r14,#-8]
 	str		@x[3],[r14,#-4]

 	add		@x[4],@t[0],@x[4],ror#13 @ accumulate key material
 	ldr		@t[0],[r12],#16		@ load input
 	add		@x[5],@t[1],@x[5],ror#13
 	ldr		@t[1],[r12,#-12]
 	add		@x[6],@t[2],@x[6],ror#13
 	ldr		@t[2],[r12,#-8]
 	add		@x[7],@t[3],@x[7],ror#13
 	ldr		@t[3],[r12,#-4]
 # ifdef	__ARMEB__
 	rev		@x[4],@x[4]
 	rev		@x[5],@x[5]
 	rev		@x[6],@x[6]
 	rev		@x[7],@x[7]
 # endif
 	eor		@x[4],@x[4],@t[0]
 	 add		@t[0],sp,#4*(8)
 	eor		@x[5],@x[5],@t[1]
 	str		@x[4],[r14],#16		@ store output
 	eor		@x[6],@x[6],@t[2]
 	str		@x[5],[r14,#-12]
 	eor		@x[7],@x[7],@t[3]
 	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
 	str		@x[6],[r14,#-8]
 	 add		@x[0],sp,#4*(16+8)
 	str		@x[7],[r14,#-4]

 	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

 	add		@x[0],@x[0],@t[0]	@ accumulate key material
 	ldr		@t[0],[r12],#16		@ load input
 	add		@x[1],@x[1],@t[1]
 	ldr		@t[1],[r12,#-12]
 # ifdef	__thumb2__
 	it	hi
 # endif
 	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
 	add		@x[2],@x[2],@t[2]
 	ldr		@t[2],[r12,#-8]
 # ifdef	__thumb2__
 	it	hi
 # endif
 	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
 	add		@x[3],@x[3],@t[3]
 	ldr		@t[3],[r12,#-4]
 # ifdef	__ARMEB__
 	rev		@x[0],@x[0]
 	rev		@x[1],@x[1]
 	rev		@x[2],@x[2]
 	rev		@x[3],@x[3]
 # endif
 	eor		@x[0],@x[0],@t[0]
 	 add		@t[0],sp,#4*(12)
 	eor		@x[1],@x[1],@t[1]
 	str		@x[0],[r14],#16		@ store output
 	eor		@x[2],@x[2],@t[2]
 	str		@x[1],[r14,#-12]
 	eor		@x[3],@x[3],@t[3]
 	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
 	str		@x[2],[r14,#-8]
 	str		@x[3],[r14,#-4]

 	add		@x[4],@t[0],@x[4],ror#24 @ accumulate key material
 	 add		@t[0],@t[0],#4		@ next counter value
 	add		@x[5],@t[1],@x[5],ror#24
 	 str		@t[0],[sp,#4*(12)]	@ save next counter value
 	ldr		@t[0],[r12],#16		@ load input
 	add		@x[6],@t[2],@x[6],ror#24
 	 add		@x[4],@x[4],#3		@ counter+3
 	ldr		@t[1],[r12,#-12]
 	add		@x[7],@t[3],@x[7],ror#24
 	ldr		@t[2],[r12,#-8]
 	ldr		@t[3],[r12,#-4]
 # ifdef	__ARMEB__
 	rev		@x[4],@x[4]
 	rev		@x[5],@x[5]
 	rev		@x[6],@x[6]
 	rev		@x[7],@x[7]
 # endif
 	eor		@x[4],@x[4],@t[0]
 # ifdef	__thumb2__
 	it	hi
 # endif
 	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
 	eor		@x[5],@x[5],@t[1]
 	eor		@x[6],@x[6],@t[2]
 	str		@x[4],[r14],#16		@ store output
 	eor		@x[7],@x[7],@t[3]
 	str		@x[5],[r14,#-12]
 	 sub		@t[3],@t[0],#64*4	@ len-=64*4
 	str		@x[6],[r14,#-8]
 	str		@x[7],[r14,#-4]
 	bhi		.Loop_neon_outer

 	b		.Ldone_neon

 .align	4
 .Lbreak_neon:
 	@ harmonize NEON and integer-only stack frames: load data
 	@ from NEON frame, but save to integer-only one; distance
 	@ between the two is 4*(32+4+16-32)=4*(20).

 	str		@t[3], [sp,#4*(20+32+2)]	@ save len
 	 add		@t[3],sp,#4*(32+4)
 	str		r12,   [sp,#4*(20+32+1)]	@ save inp
 	str		r14,   [sp,#4*(20+32+0)]	@ save out

 	ldr		@x[12],[sp,#4*(16+10)]
 	ldr		@x[14],[sp,#4*(16+11)]
 	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
 	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
 	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

 	ldr		@t[3], [sp,#4*(15)]
 	 mov		@x[4],@x[4],ror#19		@ twist b[0..3]
 	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
 	 mov		@x[5],@x[5],ror#19
 	ldr		@t[2], [sp,#4*(13)]
 	 mov		@x[6],@x[6],ror#19
 	ldr		@x[14],[sp,#4*(14)]
 	 mov		@x[7],@x[7],ror#19
 	mov		@t[3],@t[3],ror#8		@ twist d[0..3]
 	mov		@x[12],@x[12],ror#8
 	mov		@t[2],@t[2],ror#8
 	mov		@x[14],@x[14],ror#8
 	str		@t[3], [sp,#4*(20+16+15)]
 	add		@t[3],sp,#4*(20)
 	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
 	add		sp,sp,#4*(20)			@ switch frame
 	vst1.32		{$c0-$d0},[@t[3]]
 	mov		@t[3],#10
 	b		.Loop				@ go integer-only

 .align	4
 .Ltail_neon:
 	cmp		@t[3],#64*3
 	bhs		.L192_or_more_neon
 	cmp		@t[3],#64*2
 	bhs		.L128_or_more_neon
 	cmp		@t[3],#64*1
 	bhs		.L64_or_more_neon

 	add		@t[0],sp,#4*(8)
 	vst1.8		{$a0-$b0},[sp]
 	add		@t[2],sp,#4*(0)
 	vst1.8		{$c0-$d0},[@t[0]]
 	b		.Loop_tail_neon

 .align	4
 .L64_or_more_neon:
 	vld1.8		{$t0-$t1},[r12]!
 	vld1.8		{$t2-$t3},[r12]!
 	veor		$a0,$a0,$t0
 	veor		$b0,$b0,$t1
 	veor		$c0,$c0,$t2
 	veor		$d0,$d0,$t3
 	vst1.8		{$a0-$b0},[r14]!
 	vst1.8		{$c0-$d0},[r14]!

 	beq		.Ldone_neon

 	add		@t[0],sp,#4*(8)
 	vst1.8		{$a1-$b1},[sp]
 	add		@t[2],sp,#4*(0)
 	vst1.8		{$c1-$d1},[@t[0]]
 	sub		@t[3],@t[3],#64*1	@ len-=64*1
 	b		.Loop_tail_neon

 .align	4
 .L128_or_more_neon:
 	vld1.8		{$t0-$t1},[r12]!
 	vld1.8		{$t2-$t3},[r12]!
 	veor		$a0,$a0,$t0
 	veor		$b0,$b0,$t1
 	vld1.8		{$t0-$t1},[r12]!
 	veor		$c0,$c0,$t2
 	veor		$d0,$d0,$t3
 	vld1.8		{$t2-$t3},[r12]!

 	veor		$a1,$a1,$t0
 	veor		$b1,$b1,$t1
 	 vst1.8		{$a0-$b0},[r14]!
 	veor		$c1,$c1,$t2
 	 vst1.8		{$c0-$d0},[r14]!
 	veor		$d1,$d1,$t3
 	vst1.8		{$a1-$b1},[r14]!
 	vst1.8		{$c1-$d1},[r14]!

 	beq		.Ldone_neon

 	add		@t[0],sp,#4*(8)
 	vst1.8		{$a2-$b2},[sp]
 	add		@t[2],sp,#4*(0)
 	vst1.8		{$c2-$d2},[@t[0]]
 	sub		@t[3],@t[3],#64*2	@ len-=64*2
 	b		.Loop_tail_neon

 .align	4
 .L192_or_more_neon:
 	vld1.8		{$t0-$t1},[r12]!
 	vld1.8		{$t2-$t3},[r12]!
 	veor		$a0,$a0,$t0
 	veor		$b0,$b0,$t1
 	vld1.8		{$t0-$t1},[r12]!
 	veor		$c0,$c0,$t2
 	veor		$d0,$d0,$t3
 	vld1.8		{$t2-$t3},[r12]!

 	veor		$a1,$a1,$t0
 	veor		$b1,$b1,$t1
 	vld1.8		{$t0-$t1},[r12]!
 	veor		$c1,$c1,$t2
 	 vst1.8		{$a0-$b0},[r14]!
 	veor		$d1,$d1,$t3
 	vld1.8		{$t2-$t3},[r12]!

 	veor		$a2,$a2,$t0
 	 vst1.8		{$c0-$d0},[r14]!
 	veor		$b2,$b2,$t1
 	 vst1.8		{$a1-$b1},[r14]!
 	veor		$c2,$c2,$t2
 	 vst1.8		{$c1-$d1},[r14]!
 	veor		$d2,$d2,$t3
 	vst1.8		{$a2-$b2},[r14]!
 	vst1.8		{$c2-$d2},[r14]!

 	beq		.Ldone_neon

 	ldmia		sp,{@t[0]-@t[3]}	@ load key material
 	add		@x[0],@x[0],@t[0]	@ accumulate key material
 	 add		@t[0],sp,#4*(4)
 	add		@x[1],@x[1],@t[1]
 	add		@x[2],@x[2],@t[2]
 	add		@x[3],@x[3],@t[3]
 	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

 	add		@x[4],@t[0],@x[4],ror#13 @ accumulate key material
 	 add		@t[0],sp,#4*(8)
 	add		@x[5],@t[1],@x[5],ror#13
 	add		@x[6],@t[2],@x[6],ror#13
 	add		@x[7],@t[3],@x[7],ror#13
 	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
 # ifdef	__ARMEB__
 	rev		@x[0],@x[0]
 	rev		@x[1],@x[1]
 	rev		@x[2],@x[2]
 	rev		@x[3],@x[3]
 	rev		@x[4],@x[4]
 	rev		@x[5],@x[5]
 	rev		@x[6],@x[6]
 	rev		@x[7],@x[7]
 # endif
 	stmia		sp,{@x[0]-@x[7]}
 	 add		@x[0],sp,#4*(16+8)

 	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

 	add		@x[0],@x[0],@t[0]	@ accumulate key material
 	 add		@t[0],sp,#4*(12)
 	add		@x[1],@x[1],@t[1]
 	add		@x[2],@x[2],@t[2]
 	add		@x[3],@x[3],@t[3]
 	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

 	add		@x[4],@t[0],@x[4],ror#24 @ accumulate key material
 	 add		@t[0],sp,#4*(8)
 	add		@x[5],@t[1],@x[5],ror#24
 	 add		@x[4],@x[4],#3		@ counter+3
 	add		@x[6],@t[2],@x[6],ror#24
 	add		@x[7],@t[3],@x[7],ror#24
 	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
 # ifdef	__ARMEB__
 	rev		@x[0],@x[0]
 	rev		@x[1],@x[1]
 	rev		@x[2],@x[2]
 	rev		@x[3],@x[3]
 	rev		@x[4],@x[4]
 	rev		@x[5],@x[5]
 	rev		@x[6],@x[6]
 	rev		@x[7],@x[7]
 # endif
 	stmia		@t[0],{@x[0]-@x[7]}
 	 add		@t[2],sp,#4*(0)
 	 sub		@t[3],@t[3],#64*3	@ len-=64*3

 .Loop_tail_neon:
 	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
 	ldrb		@t[1],[r12],#1		@ read input
 	subs		@t[3],@t[3],#1
 	eor		@t[0],@t[0],@t[1]
 	strb		@t[0],[r14],#1		@ store output
 	bne		.Loop_tail_neon

 .Ldone_neon:
 	add		sp,sp,#4*(32+4)
 	vldmia		sp,{d8-d15}
 	add		sp,sp,#4*(16+3)
 	ldmia		sp!,{r4-r11,pc}
 .size	ChaCha20_neon,.-ChaCha20_neon
 # ifndef __KERNEL__
 .comm	OPENSSL_armcap_P,4,4
 # endif
 #endif
 ___
 }}}

 # Copy this script's leading comment block into the output, replacing
 # the Perl comment marker with the assembler one ('#' -> '@').  The
 # shebang line is skipped and copying stops at the first line that is
 # neither a comment nor empty.  The header is cosmetic, so failing to
 # re-read the script is reported but not fatal.
 if (open my $self,'<',$0) {
 	while(<$self>) {
 		next if (/^#!/);
 		last if (!s/^#/@/ and !/^$/);
 		print;
 	}
 	close $self;
 } else {
 	warn "can't read $0 to copy the header comment: $!";
 }

 # Post-process and print the accumulated assembly line by line.
 foreach (split("\n",$code)) {
 	# `...` spans hold Perl expressions evaluated at output time.
 	s/\`([^\`]*)\`/eval $1/geo;

 	# Map the halves of a q register onto d registers:
 	# qN#lo -> d(2N), qN#hi -> d(2N+1).
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

 	print $_,"\n";
 }
 # STDOUT may be a pipe into arm-xlate.pl: a failed close is the only
 # place where a write error or a failing translator becomes visible.
 close STDOUT or die "error closing STDOUT: $!";