| #!/usr/bin/env perl |
| # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
| # |
| # This code is taken from the OpenSSL project but the author, Andy Polyakov, |
| # has relicensed it under the licenses specified in the SPDX header above. |
| # The original headers, including the original license headers, are |
| # included below for completeness. |
| # |
| # ==================================================================== |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| # ==================================================================== |
| # |
| # Poly1305 hash for MIPS64. |
| # |
| # May 2016 |
| # |
| # Numbers are cycles per processed byte with poly1305_blocks alone. |
| # |
| # IALU/gcc |
| # R1x000 5.64/+120% (big-endian) |
| # Octeon II 3.80/+280% (little-endian) |
| |
| ###################################################################### |
| # There are a number of MIPS ABIs in use; O32 and N32/64 are the most |
| # widely used. Then there is a new contender: NUBI. It appears that if |
| # one picks the latter, it's possible to arrange code in an ABI-neutral |
| # manner. Therefore let's stick to the NUBI register layout: |
| # |
# NUBI register layout. Each symbolic name ends up holding the assembler
# spelling of one general-purpose register, i.e. the string '$' followed by
# the register number. The names stay package globals because the
# here-documents further down interpolate them.
{
    my @gpr = map { '$' . $_ } 0 .. 31;    # $gpr[N] eq '$N'

    ($zero, $at, $t0, $t1, $t2) = @gpr[0 .. 2, 24, 25];
    ($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = @gpr[4 .. 11];
    ($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
        @gpr[12 .. 23];
    ($gp, $tp, $sp, $fp, $ra) = @gpr[3, 28 .. 31];
}
| # |
| # The return value is placed in $a0. Following coding rules facilitate |
| # interoperability: |
| # |
| # - never ever touch $tp, "thread pointer", former $gp [o32 can be |
| # excluded from the rule, because it's specified volatile]; |
| # - copy return value to $t0, former $v0 [or to $a0 if you're adapting |
| # old code]; |
| # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; |
| # |
| # For reference here is register layout for N32/64 MIPS ABIs: |
| # |
| # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); |
| # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); |
| # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); |
| # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); |
| # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); |
| # |
| # <appro@openssl.org> |
| # |
| ###################################################################### |
| |
# The first command-line argument selects the ABI flavour; it falls back to
# "64" when absent (or false).
$flavour = shift(@ARGV) || "64";	# supported flavours are o32,n32,64,nubi32,nubi64

# Only flavours whose name contains "64" or "n32" are accepted.
($flavour =~ /64|n32/i) or die "MIPS64 only";

# $v0 is the register the generated code loads its return value into.
# $SAVED_REGS_MASK is the value given to the .mask directive of
# poly1305_blocks_internal. Both depend on whether a NUBI flavour was chosen.
if ($flavour =~ /nubi/i) {
    $v0              = $a0;
    $SAVED_REGS_MASK = "0x0003f000";
}
else {
    $v0              = $t0;
    $SAVED_REGS_MASK = "0x00030000";
}

# Argument registers of poly1305_init / poly1305_blocks.
$ctx    = $a0;
$inp    = $a1;
$len    = $a2;
$padbit = $a3;

# Input words and scratch registers shared by all three routines.
$in0  = $a4;
$in1  = $a5;
$tmp0 = $a6;
$tmp1 = $a7;
$tmp2 = $at;
$tmp3 = $t0;
$tmp4 = $t1;
| |
# poly1305_init: appends the C-preprocessor prologue and the key-setup
# routine to $code.
#
# The here-document interpolates: $ctx, $inp, $in0, $tmp0, ... expand to the
# register names assigned above, and each "\\" becomes a single backslash
# (a preprocessor line continuation).
#
# Preprocessor prologue:
#  - MIPS64R3/R5/R6 builds also get _MIPS_ARCH_MIPS64R2 defined;
#  - dmultu/mflo/mfhi are wrapped in macros: on R6 dmultu() expands to
#    nothing and mflo()/mfhi() become the three-operand dmulu/dmuhu,
#    otherwise they are the plain dmultu/mflo/mfhi instructions;
#  - under __KERNEL__ the three entry points get a _mips suffix;
#  - MSB/LSB are the byte offsets (0/7 or 7/0, chosen by endianness) that
#    the ldl/ldr pairs below add to each load address.
#
# Emitted poly1305_init ($ctx = context, $inp = key pointer):
#  - stores zero to the 64-bit words at $ctx+0, +8 and +16 (the words that
#    poly1305_blocks later loads as the hash value);
#  - if $inp is zero, branches straight to the return;
#  - otherwise loads two 64-bit words from $inp (plain ld on R6, ldl/ldr
#    pairs elsewhere), byte-swaps them on MIPSEB builds (dsbh/dshd on R2,
#    a shift-and-mask sequence otherwise), masks them with
#    0x0ffffffc0fffffff and 0x0ffffffc0ffffffc, and stores them at $ctx+24
#    (r0) and $ctx+32 (r1), followed by s1 = r1 + (r1 >> 2) at $ctx+40;
#  - returns 0 in $v0.
| $code.=<<___; |
| #if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ |
| defined(_MIPS_ARCH_MIPS64R6)) \\ |
| && !defined(_MIPS_ARCH_MIPS64R2) |
| # define _MIPS_ARCH_MIPS64R2 |
| #endif |
| |
| #if defined(_MIPS_ARCH_MIPS64R6) |
| # define dmultu(rs,rt) |
| # define mflo(rd,rs,rt) dmulu rd,rs,rt |
| # define mfhi(rd,rs,rt) dmuhu rd,rs,rt |
| #else |
| # define dmultu(rs,rt) dmultu rs,rt |
| # define mflo(rd,rs,rt) mflo rd |
| # define mfhi(rd,rs,rt) mfhi rd |
| #endif |
| |
| #ifdef __KERNEL__ |
| # define poly1305_init poly1305_init_mips |
| # define poly1305_blocks poly1305_blocks_mips |
| # define poly1305_emit poly1305_emit_mips |
| #endif |
| |
| #if defined(__MIPSEB__) && !defined(MIPSEB) |
| # define MIPSEB |
| #endif |
| |
| #ifdef MIPSEB |
| # define MSB 0 |
| # define LSB 7 |
| #else |
| # define MSB 7 |
| # define LSB 0 |
| #endif |
| |
| .text |
| .set noat |
| .set noreorder |
| |
| .align 5 |
| .globl poly1305_init |
| .ent poly1305_init |
| poly1305_init: |
| .frame $sp,0,$ra |
| .set reorder |
| |
| sd $zero,0($ctx) |
| sd $zero,8($ctx) |
| sd $zero,16($ctx) |
| |
| beqz $inp,.Lno_key |
| |
| #if defined(_MIPS_ARCH_MIPS64R6) |
| ld $in0,0($inp) |
| ld $in1,8($inp) |
| #else |
| ldl $in0,0+MSB($inp) |
| ldl $in1,8+MSB($inp) |
| ldr $in0,0+LSB($inp) |
| ldr $in1,8+LSB($inp) |
| #endif |
| #ifdef MIPSEB |
| # if defined(_MIPS_ARCH_MIPS64R2) |
| dsbh $in0,$in0 # byte swap |
| dsbh $in1,$in1 |
| dshd $in0,$in0 |
| dshd $in1,$in1 |
| # else |
| ori $tmp0,$zero,0xFF |
| dsll $tmp2,$tmp0,32 |
| or $tmp0,$tmp2 # 0x000000FF000000FF |
| |
| and $tmp1,$in0,$tmp0 # byte swap |
| and $tmp3,$in1,$tmp0 |
| dsrl $tmp2,$in0,24 |
| dsrl $tmp4,$in1,24 |
| dsll $tmp1,24 |
| dsll $tmp3,24 |
| and $tmp2,$tmp0 |
| and $tmp4,$tmp0 |
| dsll $tmp0,8 # 0x0000FF000000FF00 |
| or $tmp1,$tmp2 |
| or $tmp3,$tmp4 |
| and $tmp2,$in0,$tmp0 |
| and $tmp4,$in1,$tmp0 |
| dsrl $in0,8 |
| dsrl $in1,8 |
| dsll $tmp2,8 |
| dsll $tmp4,8 |
| and $in0,$tmp0 |
| and $in1,$tmp0 |
| or $tmp1,$tmp2 |
| or $tmp3,$tmp4 |
| or $in0,$tmp1 |
| or $in1,$tmp3 |
| dsrl $tmp1,$in0,32 |
| dsrl $tmp3,$in1,32 |
| dsll $in0,32 |
| dsll $in1,32 |
| or $in0,$tmp1 |
| or $in1,$tmp3 |
| # endif |
| #endif |
| li $tmp0,1 |
| dsll $tmp0,32 |
| daddiu $tmp0,-63 |
| dsll $tmp0,28 |
| daddiu $tmp0,-1 # 0ffffffc0fffffff |
| |
| and $in0,$tmp0 |
| daddiu $tmp0,-3 # 0ffffffc0ffffffc |
| and $in1,$tmp0 |
| |
| sd $in0,24($ctx) |
| dsrl $tmp0,$in1,2 |
| sd $in1,32($ctx) |
| daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) |
| sd $tmp0,40($ctx) |
| |
| .Lno_key: |
| li $v0,0 # return 0 |
| jr $ra |
| .end poly1305_init |
| ___ |
{
# poly1305_blocks: appends the block-processing routine to $code.
#
# Register aliases used by the generated loop:
#   $h0-$h2   hash accumulator                         ($s0-$s2)
#   $r0,$r1   key words loaded from $ctx+24 / $ctx+32  ($s3,$s4)
#   $rs1      s1 = r1 + (r1 >> 2) loaded from $ctx+40  ($s5)
#   $d0-$d2   product accumulator; $d0/$d1 reuse $in0/$in1, $d2 is $t2
#
# The key alias is deliberately NOT called $s1. A lexical of that name
# hides the file-level register name $s1 for the rest of this scope, so the
# NUBI prologue/epilogue below would spill and reload $s5's register a
# second time and never preserve the real $s1 register, which carries $h1.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);

# Public entry point: converts $len from bytes to 16-byte blocks and returns
# immediately when there is no complete block; otherwise it branches into
# poly1305_blocks_internal, which allocates a 6*8-byte frame and always
# spills $s4 and $s5.
$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	.set	noreorder
	dsubu	$sp,6*8
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
# NUBI flavours additionally spill $s0-$s3 (see $SAVED_REGS_MASK); the other
# flavours skip these stores.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
# Main loop. Per iteration it loads 16 input bytes (byte-swapped on MIPSEB
# builds), adds them and $padbit into h, multiplies by the key using the
# dmultu/mflo/mfhi macros, and folds the bits of $d2 above the low two back
# into the low words. After the last block h is written back to $ctx and the
# saved registers are reloaded.
$code.=<<___;
	.set	reorder

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$len,-1
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	daddu	$h0,$in0		# accumulate input
	daddu	$h1,$in1
	sltu	$tmp0,$h0,$in0
	sltu	$tmp1,$h1,$in1
	daddu	$h1,$tmp0

	dmultu	($r0,$h0)		# h0*r0
	daddu	$h2,$padbit
	sltu	$tmp0,$h1,$tmp0
	mflo	($d0,$r0,$h0)
	mfhi	($d1,$r0,$h0)

	dmultu	($rs1,$h1)		# h1*5*r1
	daddu	$tmp0,$tmp1
	daddu	$h2,$tmp0
	mflo	($tmp0,$rs1,$h1)
	mfhi	($tmp1,$rs1,$h1)

	dmultu	($r1,$h0)		# h0*r1
	daddu	$d0,$tmp0
	daddu	$d1,$tmp1
	mflo	($tmp2,$r1,$h0)
	mfhi	($d2,$r1,$h0)
	sltu	$tmp0,$d0,$tmp0
	daddu	$d1,$tmp0

	dmultu	($r0,$h1)		# h1*r0
	daddu	$d1,$tmp2
	sltu	$tmp2,$d1,$tmp2
	mflo	($tmp0,$r0,$h1)
	mfhi	($tmp1,$r0,$h1)
	daddu	$d2,$tmp2

	dmultu	($rs1,$h2)		# h2*5*r1
	daddu	$d1,$tmp0
	daddu	$d2,$tmp1
	mflo	($tmp2,$rs1,$h2)

	dmultu	($r0,$h2)		# h2*r0
	sltu	$tmp0,$d1,$tmp0
	daddu	$d2,$tmp0
	mflo	($tmp3,$r0,$h2)

	daddu	$d1,$tmp2
	daddu	$d2,$tmp3
	sltu	$tmp2,$d1,$tmp2
	daddu	$d2,$tmp2

	li	$tmp0,-4		# final reduction
	and	$tmp0,$d2
	dsrl	$tmp1,$d2,2
	andi	$h2,$d2,3
	daddu	$tmp0,$tmp1
	daddu	$h0,$d0,$tmp0
	sltu	$tmp0,$h0,$tmp0
	daddu	$h1,$d1,$tmp0
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$h2,$tmp0

	bnez	$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
# Mirror of the NUBI-only prologue stores above.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
# Return; the stack adjustment sits in the jr delay slot (noreorder is still
# in effect).
$code.=<<___;
	jr	$ra
	daddu	$sp,6*8
.end	poly1305_blocks_internal
___
}
# poly1305_emit: appends the tag-finalisation routine to $code.
| { |
# Lexical argument aliases for this scope only: $ctx is again $a0, while
# $mac and $nonce take $a1 and $a2.
| my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); |
| |
# Emitted poly1305_emit ($ctx = context, $mac = output, $nonce = nonce):
#  - loads the three 64-bit hash words from $ctx+0, +8 and +16;
#  - computes hash + 5 into $in0/$in1, rippling the carry into the top word;
#  - shifts that top word right by 2 and negates it to build a select mask
#    ($tmp2) and its complement ($tmp3), then uses them to keep either
#    hash + 5 or the original low two hash words.
#    NOTE(review): this presumes the shifted value is 0 or 1, so the mask is
#    all-zeros or all-ones -- confirm against the bound on the top hash word;
#  - loads four 32-bit words from $nonce with lwu, pairs them into two
#    64-bit values (word0 | word1 << 32, word2 | word3 << 32) and adds them
#    to the selected value, carrying from the low half into the high half;
#  - writes the 16 result bytes to $mac one at a time with sb, from the
#    least significant byte of $in0 at offset 0 to the most significant
#    byte of $in1 at offset 15.
# The here-document ends with bare .rdata / .align 2 directives.
| $code.=<<___; |
| .align 5 |
| .globl poly1305_emit |
| .ent poly1305_emit |
| poly1305_emit: |
| .frame $sp,0,$ra |
| .set reorder |
| |
| ld $tmp0,0($ctx) |
| ld $tmp1,8($ctx) |
| ld $tmp2,16($ctx) |
| |
| daddiu $in0,$tmp0,5 # compare to modulus |
| sltiu $tmp3,$in0,5 |
| daddu $in1,$tmp1,$tmp3 |
| sltu $tmp3,$in1,$tmp3 |
| daddu $tmp2,$tmp2,$tmp3 |
| |
| dsrl $tmp2,2 # see if it carried/borrowed |
| dsubu $tmp2,$zero,$tmp2 |
| nor $tmp3,$zero,$tmp2 |
| |
| and $in0,$tmp2 |
| and $tmp0,$tmp3 |
| and $in1,$tmp2 |
| and $tmp1,$tmp3 |
| or $in0,$tmp0 |
| or $in1,$tmp1 |
| |
| lwu $tmp0,0($nonce) # load nonce |
| lwu $tmp1,4($nonce) |
| lwu $tmp2,8($nonce) |
| lwu $tmp3,12($nonce) |
| dsll $tmp1,32 |
| dsll $tmp3,32 |
| or $tmp0,$tmp1 |
| or $tmp2,$tmp3 |
| |
| daddu $in0,$tmp0 # accumulate nonce |
| daddu $in1,$tmp2 |
| sltu $tmp0,$in0,$tmp0 |
| daddu $in1,$tmp0 |
| |
| dsrl $tmp0,$in0,8 # write mac value |
| dsrl $tmp1,$in0,16 |
| dsrl $tmp2,$in0,24 |
| sb $in0,0($mac) |
| dsrl $tmp3,$in0,32 |
| sb $tmp0,1($mac) |
| dsrl $tmp0,$in0,40 |
| sb $tmp1,2($mac) |
| dsrl $tmp1,$in0,48 |
| sb $tmp2,3($mac) |
| dsrl $tmp2,$in0,56 |
| sb $tmp3,4($mac) |
| dsrl $tmp3,$in1,8 |
| sb $tmp0,5($mac) |
| dsrl $tmp0,$in1,16 |
| sb $tmp1,6($mac) |
| dsrl $tmp1,$in1,24 |
| sb $tmp2,7($mac) |
| |
| sb $in1,8($mac) |
| dsrl $tmp2,$in1,32 |
| sb $tmp3,9($mac) |
| dsrl $tmp3,$in1,40 |
| sb $tmp0,10($mac) |
| dsrl $tmp0,$in1,48 |
| sb $tmp1,11($mac) |
| dsrl $tmp1,$in1,56 |
| sb $tmp2,12($mac) |
| sb $tmp3,13($mac) |
| sb $tmp0,14($mac) |
| sb $tmp1,15($mac) |
| |
| jr $ra |
| .end poly1305_emit |
| .rdata |
| .align 2 |
| ___ |
| } |
| |
# Output stage: writes this script's leading comment header (as // comments)
# followed by the generated assembly in $code.
#
# The last remaining command-line argument, if any, names the output file;
# without one everything goes to the inherited STDOUT. STDOUT is redirected
# BEFORE the header is copied, so the licence header ends up in the output
# file together with the code instead of on the caller's terminal.
$output = pop @ARGV;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output for writing: $!";
}

# Copy the header: skip the shebang line, turn each leading '#' into '//',
# pass blank lines through, and stop at the first line that is neither a
# comment nor blank. Failing to re-read the script is reported but is not
# fatal, so the assembly is still generated.
if (open my $self, '<', $0) {
    while (my $line = <$self>) {
        next if $line =~ /^#!/;
        last unless $line =~ /^#/ or $line =~ /^$/;
        $line =~ s{^#}{//};
        print $line;
    }
    close $self;
}
else {
    warn "can't read $0, comment header not emitted: $!";
}

print $code;

# Buffered write errors (e.g. a full disk) only surface at close time.
close STDOUT or die "error writing output: $!";
| |