mirror of
https://github.com/amnezia-vpn/amneziawg-linux-kernel-module.git
synced 2026-05-17 08:26:30 +03:00
chacha20,poly1305: switch to perlasm originals on mips and arm
We also separate out Eric Biggers' Cortex A7 implementation into its own file. Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
This commit is contained in:
@@ -7,7 +7,7 @@ shopt -s globstar
|
||||
|
||||
WG="$(readlink -f "$(dirname "$(readlink -f "$0")")/../../src/")"
|
||||
|
||||
for i in "$WG"/**/{*.c,*.h,*.S,*.include} "$WG/Kbuild" "$WG/Kconfig"; do
|
||||
for i in "$WG"/**/{*.c,*.h,*.S,*.S_shipped,*.include} "$WG/Kbuild" "$WG/Kconfig"; do
|
||||
[[ $i == "$WG/tools/"* || $i == "$WG/tests/"* ]] && continue
|
||||
diff -u /dev/null "$i" | sed "s:${WG}:b/net/wireguard:;s:Kbuild:Makefile:"
|
||||
done
|
||||
|
||||
@@ -52,7 +52,7 @@ install:
|
||||
@$(MAKE) -C tools install
|
||||
|
||||
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
|
||||
DKMS_SOURCES := version.h Makefile Kbuild Kconfig dkms.conf $(filter-out version.h wireguard.mod.c tools/% tests/%,$(call rwildcard,,*.c *.h *.S *.include))
|
||||
DKMS_SOURCES := version.h Makefile Kbuild Kconfig dkms.conf $(filter-out version.h wireguard.mod.c tools/% tests/%,$(call rwildcard,,*.c *.h *.S *.pl *.include))
|
||||
dkms-install: $(DKMS_SOURCES)
|
||||
@$(foreach f,$(DKMS_SOURCES),install -v -m0644 -D $(f) $(DESTDIR)$(DKMSDIR)/$(f);)
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ endif
|
||||
|
||||
zinc-y += chacha20/chacha20.o
|
||||
zinc-$(CONFIG_ZINC_ARCH_X86_64) += chacha20/chacha20-x86_64.o
|
||||
zinc-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o
|
||||
zinc-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o chacha20/chacha20-unrolled-arm.o
|
||||
zinc-$(CONFIG_ZINC_ARCH_ARM64) += chacha20/chacha20-arm64.o
|
||||
zinc-$(CONFIG_ZINC_ARCH_MIPS) += chacha20/chacha20-mips.o
|
||||
AFLAGS_chacha20-mips.o += -O2 # This is required to fill the branch delay slots
|
||||
@@ -37,6 +37,12 @@ zinc-$(CONFIG_ZINC_ARCH_X86_64) += blake2s/blake2s-x86_64.o
|
||||
zinc-y += curve25519/curve25519.o
|
||||
zinc-$(CONFIG_ZINC_ARCH_ARM) += curve25519/curve25519-arm.o
|
||||
|
||||
quiet_cmd_perlasm = PERLASM $@
|
||||
cmd_perlasm = $(PERL) $< > $@
|
||||
%.S: %.pl
|
||||
$(call cmd,perlasm)
|
||||
targets += $(patsubst %.o,crypto/zinc/%.S,$(zinc-y))
|
||||
|
||||
wireguard-y += $(addprefix crypto/zinc/,$(zinc-y))
|
||||
ccflags-y += -I$(src)/crypto/include
|
||||
ccflags-$(CONFIG_ZINC_ARCH_X86_64) += -DCONFIG_ZINC_ARCH_X86_64
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
1227
src/crypto/zinc/chacha20/chacha20-arm.pl
Normal file
1227
src/crypto/zinc/chacha20/chacha20-arm.pl
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1164
src/crypto/zinc/chacha20/chacha20-arm64.pl
Normal file
1164
src/crypto/zinc/chacha20/chacha20-arm64.pl
Normal file
File diff suppressed because it is too large
Load Diff
461
src/crypto/zinc/chacha20/chacha20-unrolled-arm.S
Normal file
461
src/crypto/zinc/chacha20/chacha20-unrolled-arm.S
Normal file
@@ -0,0 +1,461 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (C) 2018 Google, Inc.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
/*
|
||||
* Design notes:
|
||||
*
|
||||
* 16 registers would be needed to hold the state matrix, but only 14 are
|
||||
* available because 'sp' and 'pc' cannot be used. So we spill the elements
|
||||
* (x8, x9) to the stack and swap them out with (x10, x11). This adds one
|
||||
* 'ldrd' and one 'strd' instruction per round.
|
||||
*
|
||||
* All rotates are performed using the implicit rotate operand accepted by the
|
||||
* 'add' and 'eor' instructions. This is faster than using explicit rotate
|
||||
* instructions. To make this work, we allow the values in the second and last
|
||||
* rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
|
||||
* wrong rotation amount. The rotation amount is then fixed up just in time
|
||||
* when the values are used. 'brot' is the number of bits the values in row 'b'
|
||||
* need to be rotated right to arrive at the correct values, and 'drot'
|
||||
* similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
|
||||
* that they end up as (25, 24) after every round.
|
||||
*/
|
||||
|
||||
// ChaCha state registers
|
||||
X0 .req r0
|
||||
X1 .req r1
|
||||
X2 .req r2
|
||||
X3 .req r3
|
||||
X4 .req r4
|
||||
X5 .req r5
|
||||
X6 .req r6
|
||||
X7 .req r7
|
||||
X8_X10 .req r8 // shared by x8 and x10
|
||||
X9_X11 .req r9 // shared by x9 and x11
|
||||
X12 .req r10
|
||||
X13 .req r11
|
||||
X14 .req r12
|
||||
X15 .req r14
|
||||
|
||||
.Lexpand_32byte_k:
|
||||
// "expand 32-byte k"
|
||||
.word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
|
||||
|
||||
#ifdef __thumb2__
|
||||
# define adrl adr
|
||||
#endif
|
||||
|
||||
.macro __rev out, in, t0, t1, t2
|
||||
.if __LINUX_ARM_ARCH__ >= 6
|
||||
rev \out, \in
|
||||
.else
|
||||
lsl \t0, \in, #24
|
||||
and \t1, \in, #0xff00
|
||||
and \t2, \in, #0xff0000
|
||||
orr \out, \t0, \in, lsr #24
|
||||
orr \out, \out, \t1, lsl #8
|
||||
orr \out, \out, \t2, lsr #8
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro _le32_bswap x, t0, t1, t2
|
||||
#ifdef __ARMEB__
|
||||
__rev \x, \x, \t0, \t1, \t2
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
|
||||
_le32_bswap \a, \t0, \t1, \t2
|
||||
_le32_bswap \b, \t0, \t1, \t2
|
||||
_le32_bswap \c, \t0, \t1, \t2
|
||||
_le32_bswap \d, \t0, \t1, \t2
|
||||
.endm
|
||||
|
||||
.macro __ldrd a, b, src, offset
|
||||
#if __LINUX_ARM_ARCH__ >= 6
|
||||
ldrd \a, \b, [\src, #\offset]
|
||||
#else
|
||||
ldr \a, [\src, #\offset]
|
||||
ldr \b, [\src, #\offset + 4]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro __strd a, b, dst, offset
|
||||
#if __LINUX_ARM_ARCH__ >= 6
|
||||
strd \a, \b, [\dst, #\offset]
|
||||
#else
|
||||
str \a, [\dst, #\offset]
|
||||
str \b, [\dst, #\offset + 4]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
|
||||
|
||||
// a += b; d ^= a; d = rol(d, 16);
|
||||
add \a1, \a1, \b1, ror #brot
|
||||
add \a2, \a2, \b2, ror #brot
|
||||
eor \d1, \a1, \d1, ror #drot
|
||||
eor \d2, \a2, \d2, ror #drot
|
||||
// drot == 32 - 16 == 16
|
||||
|
||||
// c += d; b ^= c; b = rol(b, 12);
|
||||
add \c1, \c1, \d1, ror #16
|
||||
add \c2, \c2, \d2, ror #16
|
||||
eor \b1, \c1, \b1, ror #brot
|
||||
eor \b2, \c2, \b2, ror #brot
|
||||
// brot == 32 - 12 == 20
|
||||
|
||||
// a += b; d ^= a; d = rol(d, 8);
|
||||
add \a1, \a1, \b1, ror #20
|
||||
add \a2, \a2, \b2, ror #20
|
||||
eor \d1, \a1, \d1, ror #16
|
||||
eor \d2, \a2, \d2, ror #16
|
||||
// drot == 32 - 8 == 24
|
||||
|
||||
// c += d; b ^= c; b = rol(b, 7);
|
||||
add \c1, \c1, \d1, ror #24
|
||||
add \c2, \c2, \d2, ror #24
|
||||
eor \b1, \c1, \b1, ror #20
|
||||
eor \b2, \c2, \b2, ror #20
|
||||
// brot == 32 - 7 == 25
|
||||
.endm
|
||||
|
||||
.macro _doubleround
|
||||
|
||||
// column round
|
||||
|
||||
// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
|
||||
_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
|
||||
|
||||
// save (x8, x9); restore (x10, x11)
|
||||
__strd X8_X10, X9_X11, sp, 0
|
||||
__ldrd X8_X10, X9_X11, sp, 8
|
||||
|
||||
// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
|
||||
_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
|
||||
|
||||
.set brot, 25
|
||||
.set drot, 24
|
||||
|
||||
// diagonal round
|
||||
|
||||
// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
|
||||
_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
|
||||
|
||||
// save (x10, x11); restore (x8, x9)
|
||||
__strd X8_X10, X9_X11, sp, 8
|
||||
__ldrd X8_X10, X9_X11, sp, 0
|
||||
|
||||
// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
|
||||
_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
|
||||
.endm
|
||||
|
||||
.macro _chacha_permute nrounds
|
||||
.set brot, 0
|
||||
.set drot, 0
|
||||
.rept \nrounds / 2
|
||||
_doubleround
|
||||
.endr
|
||||
.endm
|
||||
|
||||
.macro _chacha nrounds
|
||||
|
||||
.Lnext_block\@:
|
||||
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
|
||||
// Registers contain x0-x9,x12-x15.
|
||||
|
||||
// Do the core ChaCha permutation to update x0-x15.
|
||||
_chacha_permute \nrounds
|
||||
|
||||
add sp, #8
|
||||
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||
// Registers contain x0-x9,x12-x15.
|
||||
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||
|
||||
// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
|
||||
push {X8_X10, X9_X11, X12, X13, X14, X15}
|
||||
|
||||
// Load (OUT, IN, LEN).
|
||||
ldr r14, [sp, #96]
|
||||
ldr r12, [sp, #100]
|
||||
ldr r11, [sp, #104]
|
||||
|
||||
orr r10, r14, r12
|
||||
|
||||
// Use slow path if fewer than 64 bytes remain.
|
||||
cmp r11, #64
|
||||
blt .Lxor_slowpath\@
|
||||
|
||||
// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
|
||||
// ARMv6+, since ldmia and stmia (used below) still require alignment.
|
||||
tst r10, #3
|
||||
bne .Lxor_slowpath\@
|
||||
|
||||
// Fast path: XOR 64 bytes of aligned data.
|
||||
|
||||
// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
|
||||
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||
|
||||
// x0-x3
|
||||
__ldrd r8, r9, sp, 32
|
||||
__ldrd r10, r11, sp, 40
|
||||
add X0, X0, r8
|
||||
add X1, X1, r9
|
||||
add X2, X2, r10
|
||||
add X3, X3, r11
|
||||
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||
ldmia r12!, {r8-r11}
|
||||
eor X0, X0, r8
|
||||
eor X1, X1, r9
|
||||
eor X2, X2, r10
|
||||
eor X3, X3, r11
|
||||
stmia r14!, {X0-X3}
|
||||
|
||||
// x4-x7
|
||||
__ldrd r8, r9, sp, 48
|
||||
__ldrd r10, r11, sp, 56
|
||||
add X4, r8, X4, ror #brot
|
||||
add X5, r9, X5, ror #brot
|
||||
ldmia r12!, {X0-X3}
|
||||
add X6, r10, X6, ror #brot
|
||||
add X7, r11, X7, ror #brot
|
||||
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||
eor X4, X4, X0
|
||||
eor X5, X5, X1
|
||||
eor X6, X6, X2
|
||||
eor X7, X7, X3
|
||||
stmia r14!, {X4-X7}
|
||||
|
||||
// x8-x15
|
||||
pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||
__ldrd r8, r9, sp, 32
|
||||
__ldrd r10, r11, sp, 40
|
||||
add r0, r0, r8 // x8
|
||||
add r1, r1, r9 // x9
|
||||
add r6, r6, r10 // x10
|
||||
add r7, r7, r11 // x11
|
||||
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||
ldmia r12!, {r8-r11}
|
||||
eor r0, r0, r8 // x8
|
||||
eor r1, r1, r9 // x9
|
||||
eor r6, r6, r10 // x10
|
||||
eor r7, r7, r11 // x11
|
||||
stmia r14!, {r0,r1,r6,r7}
|
||||
ldmia r12!, {r0,r1,r6,r7}
|
||||
__ldrd r8, r9, sp, 48
|
||||
__ldrd r10, r11, sp, 56
|
||||
add r2, r8, r2, ror #drot // x12
|
||||
add r3, r9, r3, ror #drot // x13
|
||||
add r4, r10, r4, ror #drot // x14
|
||||
add r5, r11, r5, ror #drot // x15
|
||||
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||
ldr r9, [sp, #72] // load LEN
|
||||
eor r2, r2, r0 // x12
|
||||
eor r3, r3, r1 // x13
|
||||
eor r4, r4, r6 // x14
|
||||
eor r5, r5, r7 // x15
|
||||
subs r9, #64 // decrement and check LEN
|
||||
stmia r14!, {r2-r5}
|
||||
|
||||
beq .Ldone\@
|
||||
|
||||
.Lprepare_for_next_block\@:
|
||||
|
||||
// Stack: x0-x15 OUT IN LEN
|
||||
|
||||
// Increment block counter (x12)
|
||||
add r8, #1
|
||||
|
||||
// Store updated (OUT, IN, LEN)
|
||||
str r14, [sp, #64]
|
||||
str r12, [sp, #68]
|
||||
str r9, [sp, #72]
|
||||
|
||||
mov r14, sp
|
||||
|
||||
// Store updated block counter (x12)
|
||||
str r8, [sp, #48]
|
||||
|
||||
sub sp, #16
|
||||
|
||||
// Reload state and do next block
|
||||
ldmia r14!, {r0-r11} // load x0-x11
|
||||
__strd r10, r11, sp, 8 // store x10-x11 before state
|
||||
ldmia r14, {r10-r12,r14} // load x12-x15
|
||||
b .Lnext_block\@
|
||||
|
||||
.Lxor_slowpath\@:
|
||||
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
|
||||
// We handle it by storing the 64 bytes of keystream to the stack, then
|
||||
// XOR-ing the needed portion with the data.
|
||||
|
||||
// Allocate keystream buffer
|
||||
sub sp, #64
|
||||
mov r14, sp
|
||||
|
||||
// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
|
||||
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||
|
||||
// Save keystream for x0-x3
|
||||
__ldrd r8, r9, sp, 96
|
||||
__ldrd r10, r11, sp, 104
|
||||
add X0, X0, r8
|
||||
add X1, X1, r9
|
||||
add X2, X2, r10
|
||||
add X3, X3, r11
|
||||
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||
stmia r14!, {X0-X3}
|
||||
|
||||
// Save keystream for x4-x7
|
||||
__ldrd r8, r9, sp, 112
|
||||
__ldrd r10, r11, sp, 120
|
||||
add X4, r8, X4, ror #brot
|
||||
add X5, r9, X5, ror #brot
|
||||
add X6, r10, X6, ror #brot
|
||||
add X7, r11, X7, ror #brot
|
||||
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||
add r8, sp, #64
|
||||
stmia r14!, {X4-X7}
|
||||
|
||||
// Save keystream for x8-x15
|
||||
ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||
__ldrd r8, r9, sp, 128
|
||||
__ldrd r10, r11, sp, 136
|
||||
add r0, r0, r8 // x8
|
||||
add r1, r1, r9 // x9
|
||||
add r6, r6, r10 // x10
|
||||
add r7, r7, r11 // x11
|
||||
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||
stmia r14!, {r0,r1,r6,r7}
|
||||
__ldrd r8, r9, sp, 144
|
||||
__ldrd r10, r11, sp, 152
|
||||
add r2, r8, r2, ror #drot // x12
|
||||
add r3, r9, r3, ror #drot // x13
|
||||
add r4, r10, r4, ror #drot // x14
|
||||
add r5, r11, r5, ror #drot // x15
|
||||
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||
stmia r14, {r2-r5}
|
||||
|
||||
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
|
||||
// Registers: r8 is block counter, r12 is IN.
|
||||
|
||||
ldr r9, [sp, #168] // LEN
|
||||
ldr r14, [sp, #160] // OUT
|
||||
cmp r9, #64
|
||||
mov r0, sp
|
||||
movle r1, r9
|
||||
movgt r1, #64
|
||||
// r1 is number of bytes to XOR, in range [1, 64]
|
||||
|
||||
.if __LINUX_ARM_ARCH__ < 6
|
||||
orr r2, r12, r14
|
||||
tst r2, #3 // IN or OUT misaligned?
|
||||
bne .Lxor_next_byte\@
|
||||
.endif
|
||||
|
||||
// XOR a word at a time
|
||||
.rept 16
|
||||
subs r1, #4
|
||||
blt .Lxor_words_done\@
|
||||
ldr r2, [r12], #4
|
||||
ldr r3, [r0], #4
|
||||
eor r2, r2, r3
|
||||
str r2, [r14], #4
|
||||
.endr
|
||||
b .Lxor_slowpath_done\@
|
||||
.Lxor_words_done\@:
|
||||
ands r1, r1, #3
|
||||
beq .Lxor_slowpath_done\@
|
||||
|
||||
// XOR a byte at a time
|
||||
.Lxor_next_byte\@:
|
||||
ldrb r2, [r12], #1
|
||||
ldrb r3, [r0], #1
|
||||
eor r2, r2, r3
|
||||
strb r2, [r14], #1
|
||||
subs r1, #1
|
||||
bne .Lxor_next_byte\@
|
||||
|
||||
.Lxor_slowpath_done\@:
|
||||
subs r9, #64
|
||||
add sp, #96
|
||||
bgt .Lprepare_for_next_block\@
|
||||
|
||||
.Ldone\@:
|
||||
.endm // _chacha
|
||||
|
||||
/*
|
||||
* void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
|
||||
* const u32 iv[4]);
|
||||
*/
|
||||
ENTRY(chacha20_arm)
|
||||
cmp r2, #0 // len == 0?
|
||||
reteq lr
|
||||
|
||||
push {r0-r2,r4-r11,lr}
|
||||
|
||||
// Push state x0-x15 onto stack.
|
||||
// Also store an extra copy of x10-x11 just before the state.
|
||||
|
||||
ldr r4, [sp, #48] // iv
|
||||
mov r0, sp
|
||||
sub sp, #80
|
||||
|
||||
// iv: x12-x15
|
||||
ldm r4, {X12,X13,X14,X15}
|
||||
stmdb r0!, {X12,X13,X14,X15}
|
||||
|
||||
// key: x4-x11
|
||||
__ldrd X8_X10, X9_X11, r3, 24
|
||||
__strd X8_X10, X9_X11, sp, 8
|
||||
stmdb r0!, {X8_X10, X9_X11}
|
||||
ldm r3, {X4-X9_X11}
|
||||
stmdb r0!, {X4-X9_X11}
|
||||
|
||||
// constants: x0-x3
|
||||
adrl X3, .Lexpand_32byte_k
|
||||
ldm X3, {X0-X3}
|
||||
__strd X0, X1, sp, 16
|
||||
__strd X2, X3, sp, 24
|
||||
|
||||
_chacha 20
|
||||
|
||||
add sp, #76
|
||||
pop {r4-r11, pc}
|
||||
ENDPROC(chacha20_arm)
|
||||
|
||||
/*
|
||||
* void hchacha20_arm(const u32 state[16], u32 out[8]);
|
||||
*/
|
||||
ENTRY(hchacha20_arm)
|
||||
push {r1,r4-r11,lr}
|
||||
|
||||
mov r14, r0
|
||||
ldmia r14!, {r0-r11} // load x0-x11
|
||||
push {r10-r11} // store x10-x11 to stack
|
||||
ldm r14, {r10-r12,r14} // load x12-x15
|
||||
sub sp, #8
|
||||
|
||||
_chacha_permute 20
|
||||
|
||||
// Skip over (unused0-unused1, x10-x11)
|
||||
add sp, #16
|
||||
|
||||
// Fix up rotations of x12-x15
|
||||
ror X12, X12, #drot
|
||||
ror X13, X13, #drot
|
||||
pop {r4} // load 'out'
|
||||
ror X14, X14, #drot
|
||||
ror X15, X15, #drot
|
||||
|
||||
// Store (x0-x3,x12-x15) to 'out'
|
||||
stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
|
||||
|
||||
pop {r4-r11,pc}
|
||||
ENDPROC(hchacha20_arm)
|
||||
File diff suppressed because it is too large
Load Diff
1272
src/crypto/zinc/poly1305/poly1305-arm.pl
Normal file
1272
src/crypto/zinc/poly1305/poly1305-arm.pl
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,824 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
|
||||
/*
|
||||
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
|
||||
*
|
||||
* This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
.text
|
||||
|
||||
.align 5
|
||||
ENTRY(poly1305_init_arm)
|
||||
cmp x1,xzr
|
||||
stp xzr,xzr,[x0] // zero hash value
|
||||
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
|
||||
|
||||
csel x0,xzr,x0,eq
|
||||
b.eq .Lno_key
|
||||
|
||||
ldp x7,x8,[x1] // load key
|
||||
mov x9,#0xfffffffc0fffffff
|
||||
movk x9,#0x0fff,lsl#48
|
||||
#ifdef __AARCH64EB__
|
||||
rev x7,x7 // flip bytes
|
||||
rev x8,x8
|
||||
#endif
|
||||
and x7,x7,x9 // &=0ffffffc0fffffff
|
||||
and x9,x9,#-4
|
||||
and x8,x8,x9 // &=0ffffffc0ffffffc
|
||||
stp x7,x8,[x0,#32] // save key value
|
||||
|
||||
.Lno_key:
|
||||
ret
|
||||
ENDPROC(poly1305_init_arm)
|
||||
|
||||
.align 5
|
||||
ENTRY(poly1305_blocks_arm)
|
||||
ands x2,x2,#-16
|
||||
b.eq .Lno_data
|
||||
|
||||
ldp x4,x5,[x0] // load hash value
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
ldr x6,[x0,#16]
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
b .Loop
|
||||
|
||||
.align 5
|
||||
.Loop:
|
||||
ldp x10,x11,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
#ifdef __AARCH64EB__
|
||||
rev x10,x10
|
||||
rev x11,x11
|
||||
#endif
|
||||
adds x4,x4,x10 // accumulate input
|
||||
adcs x5,x5,x11
|
||||
|
||||
mul x12,x4,x7 // h0*r0
|
||||
adc x6,x6,x3
|
||||
umulh x13,x4,x7
|
||||
|
||||
mul x10,x5,x9 // h1*5*r1
|
||||
umulh x11,x5,x9
|
||||
|
||||
adds x12,x12,x10
|
||||
mul x10,x4,x8 // h0*r1
|
||||
adc x13,x13,x11
|
||||
umulh x14,x4,x8
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x5,x7 // h1*r0
|
||||
adc x14,x14,xzr
|
||||
umulh x11,x5,x7
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x6,x9 // h2*5*r1
|
||||
adc x14,x14,x11
|
||||
mul x11,x6,x7 // h2*r0
|
||||
|
||||
adds x13,x13,x10
|
||||
adc x14,x14,x11
|
||||
|
||||
and x10,x14,#-4 // final reduction
|
||||
and x6,x14,#3
|
||||
add x10,x10,x14,lsr#2
|
||||
adds x4,x12,x10
|
||||
adcs x5,x13,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
cbnz x2,.Loop
|
||||
|
||||
stp x4,x5,[x0] // store hash value
|
||||
str x6,[x0,#16]
|
||||
|
||||
.Lno_data:
|
||||
ret
|
||||
ENDPROC(poly1305_blocks_arm)
|
||||
|
||||
.align 5
|
||||
ENTRY(poly1305_emit_arm)
|
||||
ldp x4,x5,[x0] // load hash base 2^64
|
||||
ldr x6,[x0,#16]
|
||||
ldp x10,x11,[x2] // load nonce
|
||||
|
||||
adds x12,x4,#5 // compare to modulus
|
||||
adcs x13,x5,xzr
|
||||
adc x14,x6,xzr
|
||||
|
||||
tst x14,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel x4,x4,x12,eq
|
||||
csel x5,x5,x13,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror x10,x10,#32 // flip nonce words
|
||||
ror x11,x11,#32
|
||||
#endif
|
||||
adds x4,x4,x10 // accumulate nonce
|
||||
adc x5,x5,x11
|
||||
#ifdef __AARCH64EB__
|
||||
rev x4,x4 // flip output bytes
|
||||
rev x5,x5
|
||||
#endif
|
||||
stp x4,x5,[x1] // write result
|
||||
|
||||
ret
|
||||
ENDPROC(poly1305_emit_arm)
|
||||
|
||||
.align 5
|
||||
__poly1305_mult:
|
||||
mul x12,x4,x7 // h0*r0
|
||||
umulh x13,x4,x7
|
||||
|
||||
mul x10,x5,x9 // h1*5*r1
|
||||
umulh x11,x5,x9
|
||||
|
||||
adds x12,x12,x10
|
||||
mul x10,x4,x8 // h0*r1
|
||||
adc x13,x13,x11
|
||||
umulh x14,x4,x8
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x5,x7 // h1*r0
|
||||
adc x14,x14,xzr
|
||||
umulh x11,x5,x7
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x6,x9 // h2*5*r1
|
||||
adc x14,x14,x11
|
||||
mul x11,x6,x7 // h2*r0
|
||||
|
||||
adds x13,x13,x10
|
||||
adc x14,x14,x11
|
||||
|
||||
and x10,x14,#-4 // final reduction
|
||||
and x6,x14,#3
|
||||
add x10,x10,x14,lsr#2
|
||||
adds x4,x12,x10
|
||||
adcs x5,x13,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
ret
|
||||
|
||||
__poly1305_splat:
|
||||
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x13,x4,#26,#26
|
||||
extr x14,x5,x4,#52
|
||||
and x14,x14,#0x03ffffff
|
||||
ubfx x15,x5,#14,#26
|
||||
extr x16,x6,x5,#40
|
||||
|
||||
str w12,[x0,#16*0] // r0
|
||||
add w12,w13,w13,lsl#2 // r1*5
|
||||
str w13,[x0,#16*1] // r1
|
||||
add w13,w14,w14,lsl#2 // r2*5
|
||||
str w12,[x0,#16*2] // s1
|
||||
str w14,[x0,#16*3] // r2
|
||||
add w14,w15,w15,lsl#2 // r3*5
|
||||
str w13,[x0,#16*4] // s2
|
||||
str w15,[x0,#16*5] // r3
|
||||
add w15,w16,w16,lsl#2 // r4*5
|
||||
str w14,[x0,#16*6] // s3
|
||||
str w16,[x0,#16*7] // r4
|
||||
str w15,[x0,#16*8] // s4
|
||||
|
||||
ret
|
||||
|
||||
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||
.align 5
|
||||
ENTRY(poly1305_blocks_neon)
|
||||
ldr x17,[x0,#24]
|
||||
cmp x2,#128
|
||||
b.hs .Lblocks_neon
|
||||
cbz x17,poly1305_blocks_arm
|
||||
|
||||
.Lblocks_neon:
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
|
||||
ands x2,x2,#-16
|
||||
b.eq .Lno_data_neon
|
||||
|
||||
cbz x17,.Lbase2_64_neon
|
||||
|
||||
ldp w10,w11,[x0] // load hash value base 2^26
|
||||
ldp w12,w13,[x0,#8]
|
||||
ldr w14,[x0,#16]
|
||||
|
||||
tst x2,#31
|
||||
b.eq .Leven_neon
|
||||
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x5,x12,#12
|
||||
adds x4,x4,x12,lsl#52
|
||||
add x5,x5,x13,lsl#14
|
||||
adc x5,x5,xzr
|
||||
lsr x6,x14,#24
|
||||
adds x5,x5,x14,lsl#40
|
||||
adc x14,x6,xzr // can be partially reduced...
|
||||
|
||||
ldp x12,x13,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
|
||||
and x10,x14,#-4 // ... so reduce
|
||||
and x6,x14,#3
|
||||
add x10,x10,x14,lsr#2
|
||||
adds x4,x4,x10
|
||||
adcs x5,x5,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x12,x12
|
||||
rev x13,x13
|
||||
#endif
|
||||
adds x4,x4,x12 // accumulate input
|
||||
adcs x5,x5,x13
|
||||
adc x6,x6,x3
|
||||
|
||||
bl __poly1305_mult
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
cbz x3,.Lstore_base2_64_neon
|
||||
|
||||
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,x4,#26,#26
|
||||
extr x12,x5,x4,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,x5,#14,#26
|
||||
extr x14,x6,x5,#40
|
||||
|
||||
cbnz x2,.Leven_neon
|
||||
|
||||
stp w10,w11,[x0] // store hash value base 2^26
|
||||
stp w12,w13,[x0,#8]
|
||||
str w14,[x0,#16]
|
||||
b .Lno_data_neon
|
||||
|
||||
.align 4
|
||||
.Lstore_base2_64_neon:
|
||||
stp x4,x5,[x0] // store hash value base 2^64
|
||||
stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
|
||||
b .Lno_data_neon
|
||||
|
||||
.align 4
|
||||
.Lbase2_64_neon:
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
ldp x4,x5,[x0] // load hash value base 2^64
|
||||
ldr x6,[x0,#16]
|
||||
|
||||
tst x2,#31
|
||||
b.eq .Linit_neon
|
||||
|
||||
ldp x12,x13,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
#ifdef __AARCH64EB__
|
||||
rev x12,x12
|
||||
rev x13,x13
|
||||
#endif
|
||||
adds x4,x4,x12 // accumulate input
|
||||
adcs x5,x5,x13
|
||||
adc x6,x6,x3
|
||||
|
||||
bl __poly1305_mult
|
||||
|
||||
.Linit_neon:
|
||||
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,x4,#26,#26
|
||||
extr x12,x5,x4,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,x5,#14,#26
|
||||
extr x14,x6,x5,#40
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
fmov d24,x10
|
||||
fmov d25,x11
|
||||
fmov d26,x12
|
||||
fmov d27,x13
|
||||
fmov d28,x14
|
||||
|
||||
////////////////////////////////// initialize r^n table
|
||||
mov x4,x7 // r^1
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
mov x5,x8
|
||||
mov x6,xzr
|
||||
add x0,x0,#48+12
|
||||
bl __poly1305_splat
|
||||
|
||||
bl __poly1305_mult // r^2
|
||||
sub x0,x0,#4
|
||||
bl __poly1305_splat
|
||||
|
||||
bl __poly1305_mult // r^3
|
||||
sub x0,x0,#4
|
||||
bl __poly1305_splat
|
||||
|
||||
bl __poly1305_mult // r^4
|
||||
sub x0,x0,#4
|
||||
bl __poly1305_splat
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
add x16,x1,#32
|
||||
adr x17,.Lzeros
|
||||
subs x2,x2,#64
|
||||
csel x16,x17,x16,lo
|
||||
|
||||
mov x4,#1
|
||||
str x4,[x0,#-24] // set is_base2_26
|
||||
sub x0,x0,#48 // restore original x0
|
||||
b .Ldo_neon
|
||||
|
||||
.align 4
|
||||
.Leven_neon:
|
||||
add x16,x1,#32
|
||||
adr x17,.Lzeros
|
||||
subs x2,x2,#64
|
||||
csel x16,x17,x16,lo
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
fmov d24,x10
|
||||
fmov d25,x11
|
||||
fmov d26,x12
|
||||
fmov d27,x13
|
||||
fmov d28,x14
|
||||
|
||||
.Ldo_neon:
|
||||
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
|
||||
ldp x9,x13,[x16],#48
|
||||
|
||||
lsl x3,x3,#24
|
||||
add x15,x0,#48
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov d14,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,x3,x12,lsr#40
|
||||
add x13,x3,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov d15,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
fmov d16,x8
|
||||
fmov d17,x10
|
||||
fmov d18,x12
|
||||
|
||||
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||
ldp x9,x13,[x1],#48
|
||||
|
||||
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
|
||||
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
|
||||
ld1 {v8.4s},[x15]
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov d9,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,x3,x12,lsr#40
|
||||
add x13,x3,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov d10,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
movi v31.2d,#-1
|
||||
fmov d11,x8
|
||||
fmov d12,x10
|
||||
fmov d13,x12
|
||||
ushr v31.2d,v31.2d,#38
|
||||
|
||||
b.ls .Lskip_loop
|
||||
|
||||
.align 4
|
||||
.Loop_neon:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||
// ___________________/
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||
// ___________________/ ____________________/
|
||||
//
|
||||
// Note that we start with inp[2:3]*r^2. This is because it
|
||||
// doesn't depend on reduction in previous iteration.
|
||||
////////////////////////////////////////////////////////////////
|
||||
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||
|
||||
subs x2,x2,#64
|
||||
umull v23.2d,v14.2s,v7.s[2]
|
||||
csel x16,x17,x16,lo
|
||||
umull v22.2d,v14.2s,v5.s[2]
|
||||
umull v21.2d,v14.2s,v3.s[2]
|
||||
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
|
||||
umull v20.2d,v14.2s,v1.s[2]
|
||||
ldp x9,x13,[x16],#48
|
||||
umull v19.2d,v14.2s,v0.s[2]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
umlal v23.2d,v15.2s,v5.s[2]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal v22.2d,v15.2s,v3.s[2]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal v21.2d,v15.2s,v1.s[2]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal v20.2d,v15.2s,v0.s[2]
|
||||
ubfx x7,x9,#26,#26
|
||||
umlal v19.2d,v15.2s,v8.s[2]
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
|
||||
umlal v23.2d,v16.2s,v3.s[2]
|
||||
extr x8,x12,x8,#52
|
||||
umlal v22.2d,v16.2s,v1.s[2]
|
||||
extr x9,x13,x9,#52
|
||||
umlal v21.2d,v16.2s,v0.s[2]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal v20.2d,v16.2s,v8.s[2]
|
||||
fmov d14,x4
|
||||
umlal v19.2d,v16.2s,v6.s[2]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
umlal v23.2d,v17.2s,v1.s[2]
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal v22.2d,v17.2s,v0.s[2]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal v21.2d,v17.2s,v8.s[2]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal v20.2d,v17.2s,v6.s[2]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal v19.2d,v17.2s,v4.s[2]
|
||||
fmov d15,x6
|
||||
|
||||
add v11.2s,v11.2s,v26.2s
|
||||
add x12,x3,x12,lsr#40
|
||||
umlal v23.2d,v18.2s,v0.s[2]
|
||||
add x13,x3,x13,lsr#40
|
||||
umlal v22.2d,v18.2s,v8.s[2]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal v21.2d,v18.2s,v6.s[2]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal v20.2d,v18.2s,v4.s[2]
|
||||
fmov d16,x8
|
||||
umlal v19.2d,v18.2s,v2.s[2]
|
||||
fmov d17,x10
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4 and accumulate
|
||||
|
||||
add v9.2s,v9.2s,v24.2s
|
||||
fmov d18,x12
|
||||
umlal v22.2d,v11.2s,v1.s[0]
|
||||
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||
umlal v19.2d,v11.2s,v6.s[0]
|
||||
ldp x9,x13,[x1],#48
|
||||
umlal v23.2d,v11.2s,v3.s[0]
|
||||
umlal v20.2d,v11.2s,v8.s[0]
|
||||
umlal v21.2d,v11.2s,v0.s[0]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
add v10.2s,v10.2s,v25.2s
|
||||
umlal v22.2d,v9.2s,v5.s[0]
|
||||
umlal v23.2d,v9.2s,v7.s[0]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal v21.2d,v9.2s,v3.s[0]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal v19.2d,v9.2s,v0.s[0]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal v20.2d,v9.2s,v1.s[0]
|
||||
ubfx x7,x9,#26,#26
|
||||
|
||||
add v12.2s,v12.2s,v27.2s
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
umlal v22.2d,v10.2s,v3.s[0]
|
||||
extr x8,x12,x8,#52
|
||||
umlal v23.2d,v10.2s,v5.s[0]
|
||||
extr x9,x13,x9,#52
|
||||
umlal v19.2d,v10.2s,v8.s[0]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal v21.2d,v10.2s,v1.s[0]
|
||||
fmov d9,x4
|
||||
umlal v20.2d,v10.2s,v0.s[0]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
add v13.2s,v13.2s,v28.2s
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal v22.2d,v12.2s,v0.s[0]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal v19.2d,v12.2s,v4.s[0]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal v23.2d,v12.2s,v1.s[0]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal v20.2d,v12.2s,v6.s[0]
|
||||
fmov d10,x6
|
||||
umlal v21.2d,v12.2s,v8.s[0]
|
||||
add x12,x3,x12,lsr#40
|
||||
|
||||
umlal v22.2d,v13.2s,v8.s[0]
|
||||
add x13,x3,x13,lsr#40
|
||||
umlal v19.2d,v13.2s,v2.s[0]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal v23.2d,v13.2s,v0.s[0]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal v20.2d,v13.2s,v4.s[0]
|
||||
fmov d11,x8
|
||||
umlal v21.2d,v13.2s,v6.s[0]
|
||||
fmov d12,x10
|
||||
fmov d13,x12
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||
// and P. Schwabe
|
||||
//
|
||||
// [see discussion in poly1305-armv4 module]
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
xtn v27.2s,v22.2d
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
xtn v28.2s,v23.2d
|
||||
ushr v30.2d,v20.2d,#26
|
||||
xtn v25.2s,v20.2d
|
||||
bic v28.2s,#0xfc,lsl#24
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
shrn v30.2s,v21.2d,#26
|
||||
xtn v26.2s,v21.2d
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
bic v25.2s,#0xfc,lsl#24
|
||||
add v27.2s,v27.2s,v30.2s // h2 -> h3
|
||||
bic v26.2s,#0xfc,lsl#24
|
||||
|
||||
shrn v29.2s,v19.2d,#26
|
||||
xtn v24.2s,v19.2d
|
||||
ushr v30.2s,v27.2s,#26
|
||||
bic v27.2s,#0xfc,lsl#24
|
||||
bic v24.2s,#0xfc,lsl#24
|
||||
add v25.2s,v25.2s,v29.2s // h0 -> h1
|
||||
add v28.2s,v28.2s,v30.2s // h3 -> h4
|
||||
|
||||
b.hi .Loop_neon
|
||||
|
||||
.Lskip_loop:
|
||||
dup v16.2d,v16.d[0]
|
||||
add v11.2s,v11.2s,v26.2s
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||
|
||||
adds x2,x2,#32
|
||||
b.ne .Long_tail
|
||||
|
||||
dup v16.2d,v11.d[0]
|
||||
add v14.2s,v9.2s,v24.2s
|
||||
add v17.2s,v12.2s,v27.2s
|
||||
add v15.2s,v10.2s,v25.2s
|
||||
add v18.2s,v13.2s,v28.2s
|
||||
|
||||
.Long_tail:
|
||||
dup v14.2d,v14.d[0]
|
||||
umull2 v19.2d,v16.4s,v6.4s
|
||||
umull2 v22.2d,v16.4s,v1.4s
|
||||
umull2 v23.2d,v16.4s,v3.4s
|
||||
umull2 v21.2d,v16.4s,v0.4s
|
||||
umull2 v20.2d,v16.4s,v8.4s
|
||||
|
||||
dup v15.2d,v15.d[0]
|
||||
umlal2 v19.2d,v14.4s,v0.4s
|
||||
umlal2 v21.2d,v14.4s,v3.4s
|
||||
umlal2 v22.2d,v14.4s,v5.4s
|
||||
umlal2 v23.2d,v14.4s,v7.4s
|
||||
umlal2 v20.2d,v14.4s,v1.4s
|
||||
|
||||
dup v17.2d,v17.d[0]
|
||||
umlal2 v19.2d,v15.4s,v8.4s
|
||||
umlal2 v22.2d,v15.4s,v3.4s
|
||||
umlal2 v21.2d,v15.4s,v1.4s
|
||||
umlal2 v23.2d,v15.4s,v5.4s
|
||||
umlal2 v20.2d,v15.4s,v0.4s
|
||||
|
||||
dup v18.2d,v18.d[0]
|
||||
umlal2 v22.2d,v17.4s,v0.4s
|
||||
umlal2 v23.2d,v17.4s,v1.4s
|
||||
umlal2 v19.2d,v17.4s,v4.4s
|
||||
umlal2 v20.2d,v17.4s,v6.4s
|
||||
umlal2 v21.2d,v17.4s,v8.4s
|
||||
|
||||
umlal2 v22.2d,v18.4s,v8.4s
|
||||
umlal2 v19.2d,v18.4s,v2.4s
|
||||
umlal2 v23.2d,v18.4s,v0.4s
|
||||
umlal2 v20.2d,v18.4s,v4.4s
|
||||
umlal2 v21.2d,v18.4s,v6.4s
|
||||
|
||||
b.eq .Lshort_tail
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||
|
||||
add v9.2s,v9.2s,v24.2s
|
||||
umlal v22.2d,v11.2s,v1.2s
|
||||
umlal v19.2d,v11.2s,v6.2s
|
||||
umlal v23.2d,v11.2s,v3.2s
|
||||
umlal v20.2d,v11.2s,v8.2s
|
||||
umlal v21.2d,v11.2s,v0.2s
|
||||
|
||||
add v10.2s,v10.2s,v25.2s
|
||||
umlal v22.2d,v9.2s,v5.2s
|
||||
umlal v19.2d,v9.2s,v0.2s
|
||||
umlal v23.2d,v9.2s,v7.2s
|
||||
umlal v20.2d,v9.2s,v1.2s
|
||||
umlal v21.2d,v9.2s,v3.2s
|
||||
|
||||
add v12.2s,v12.2s,v27.2s
|
||||
umlal v22.2d,v10.2s,v3.2s
|
||||
umlal v19.2d,v10.2s,v8.2s
|
||||
umlal v23.2d,v10.2s,v5.2s
|
||||
umlal v20.2d,v10.2s,v0.2s
|
||||
umlal v21.2d,v10.2s,v1.2s
|
||||
|
||||
add v13.2s,v13.2s,v28.2s
|
||||
umlal v22.2d,v12.2s,v0.2s
|
||||
umlal v19.2d,v12.2s,v4.2s
|
||||
umlal v23.2d,v12.2s,v1.2s
|
||||
umlal v20.2d,v12.2s,v6.2s
|
||||
umlal v21.2d,v12.2s,v8.2s
|
||||
|
||||
umlal v22.2d,v13.2s,v8.2s
|
||||
umlal v19.2d,v13.2s,v2.2s
|
||||
umlal v23.2d,v13.2s,v0.2s
|
||||
umlal v20.2d,v13.2s,v4.2s
|
||||
umlal v21.2d,v13.2s,v6.2s
|
||||
|
||||
.Lshort_tail:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// horizontal add
|
||||
|
||||
addp v22.2d,v22.2d,v22.2d
|
||||
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||
addp v19.2d,v19.2d,v19.2d
|
||||
ldp d10,d11,[sp,#32]
|
||||
addp v23.2d,v23.2d,v23.2d
|
||||
ldp d12,d13,[sp,#48]
|
||||
addp v20.2d,v20.2d,v20.2d
|
||||
ldp d14,d15,[sp,#64]
|
||||
addp v21.2d,v21.2d,v21.2d
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// lazy reduction, but without narrowing
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
and v23.16b,v23.16b,v31.16b
|
||||
ushr v30.2d,v20.2d,#26
|
||||
and v20.16b,v20.16b,v31.16b
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
ushr v30.2d,v21.2d,#26
|
||||
and v21.16b,v21.16b,v31.16b
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
add v22.2d,v22.2d,v30.2d // h2 -> h3
|
||||
|
||||
ushr v29.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
ushr v30.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
add v20.2d,v20.2d,v29.2d // h0 -> h1
|
||||
add v23.2d,v23.2d,v30.2d // h3 -> h4
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// write the result, can be partially reduced
|
||||
|
||||
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
|
||||
st1 {v23.s}[0],[x0]
|
||||
|
||||
.Lno_data_neon:
|
||||
ldr x29,[sp],#80
|
||||
ret
|
||||
ENDPROC(poly1305_blocks_neon)
|
||||
|
||||
.align 5
|
||||
ENTRY(poly1305_emit_neon)
|
||||
ldr x17,[x0,#24]
|
||||
cbz x17,poly1305_emit_arm
|
||||
|
||||
ldp w10,w11,[x0] // load hash value base 2^26
|
||||
ldp w12,w13,[x0,#8]
|
||||
ldr w14,[x0,#16]
|
||||
|
||||
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x5,x12,#12
|
||||
adds x4,x4,x12,lsl#52
|
||||
add x5,x5,x13,lsl#14
|
||||
adc x5,x5,xzr
|
||||
lsr x6,x14,#24
|
||||
adds x5,x5,x14,lsl#40
|
||||
adc x6,x6,xzr // can be partially reduced...
|
||||
|
||||
ldp x10,x11,[x2] // load nonce
|
||||
|
||||
and x12,x6,#-4 // ... so reduce
|
||||
add x12,x12,x6,lsr#2
|
||||
and x6,x6,#3
|
||||
adds x4,x4,x12
|
||||
adcs x5,x5,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
adds x12,x4,#5 // compare to modulus
|
||||
adcs x13,x5,xzr
|
||||
adc x14,x6,xzr
|
||||
|
||||
tst x14,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel x4,x4,x12,eq
|
||||
csel x5,x5,x13,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror x10,x10,#32 // flip nonce words
|
||||
ror x11,x11,#32
|
||||
#endif
|
||||
adds x4,x4,x10 // accumulate nonce
|
||||
adc x5,x5,x11
|
||||
#ifdef __AARCH64EB__
|
||||
rev x4,x4 // flip output bytes
|
||||
rev x5,x5
|
||||
#endif
|
||||
stp x4,x5,[x1] // write result
|
||||
|
||||
ret
|
||||
ENDPROC(poly1305_emit_neon)
|
||||
|
||||
.align 5
|
||||
.Lzeros:
|
||||
.long 0,0,0,0,0,0,0,0
|
||||
#endif
|
||||
972
src/crypto/zinc/poly1305/poly1305-arm64.pl
Normal file
972
src/crypto/zinc/poly1305/poly1305-arm64.pl
Normal file
@@ -0,0 +1,972 @@
|
||||
#!/usr/bin/env perl
|
||||
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
#
|
||||
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
|
||||
# has relicensed it under the licenses specified in the SPDX header above.
|
||||
# The original headers, including the original license headers, are
|
||||
# included below for completeness.
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# This module implements Poly1305 hash for ARMv8.
|
||||
#
|
||||
# June 2015
|
||||
#
|
||||
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
||||
#
|
||||
# IALU/gcc-4.9 NEON
|
||||
#
|
||||
# Apple A7 1.86/+5% 0.72
|
||||
# Cortex-A53 2.69/+58% 1.47
|
||||
# Cortex-A57 2.70/+7% 1.14
|
||||
# Denver 1.64/+50% 1.18(*)
|
||||
# X-Gene 2.13/+68% 2.27
|
||||
# Mongoose 1.77/+75% 1.12
|
||||
# Kryo 2.70/+55% 1.13
|
||||
#
|
||||
# (*) estimate based on resources availability is less than 1.0,
|
||||
# i.e. measured result is worse than expected, presumably binary
|
||||
# translator is not almighty;
|
||||
|
||||
$flavour=shift;
|
||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
|
||||
my ($mac,$nonce)=($inp,$len);
|
||||
|
||||
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
|
||||
|
||||
$code.=<<___;
|
||||
#ifndef __KERNEL__
|
||||
# include "arm_arch.h"
|
||||
.extern OPENSSL_armcap_P
|
||||
#else
|
||||
# define poly1305_init poly1305_init_arm
|
||||
# define poly1305_blocks poly1305_blocks_arm
|
||||
# define poly1305_emit poly1305_emit_arm
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
// forward "declarations" are required for Apple
|
||||
.globl poly1305_blocks
|
||||
.globl poly1305_emit
|
||||
#ifdef __KERNEL__
|
||||
.globl poly1305_blocks_neon
|
||||
.globl poly1305_emit_neon
|
||||
#endif
|
||||
|
||||
.globl poly1305_init
|
||||
.type poly1305_init,%function
|
||||
.align 5
|
||||
poly1305_init:
|
||||
cmp $inp,xzr
|
||||
stp xzr,xzr,[$ctx] // zero hash value
|
||||
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
|
||||
|
||||
csel x0,xzr,x0,eq
|
||||
b.eq .Lno_key
|
||||
|
||||
#ifndef __KERNEL__
|
||||
# ifdef __ILP32__
|
||||
ldrsw $t1,.LOPENSSL_armcap_P
|
||||
# else
|
||||
ldr $t1,.LOPENSSL_armcap_P
|
||||
# endif
|
||||
adr $t0,.LOPENSSL_armcap_P
|
||||
ldr w17,[$t0,$t1]
|
||||
#endif
|
||||
|
||||
ldp $r0,$r1,[$inp] // load key
|
||||
mov $s1,#0xfffffffc0fffffff
|
||||
movk $s1,#0x0fff,lsl#48
|
||||
#ifdef __AARCH64EB__
|
||||
rev $r0,$r0 // flip bytes
|
||||
rev $r1,$r1
|
||||
#endif
|
||||
and $r0,$r0,$s1 // &=0ffffffc0fffffff
|
||||
and $s1,$s1,#-4
|
||||
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
|
||||
stp $r0,$r1,[$ctx,#32] // save key value
|
||||
|
||||
#ifndef __KERNEL__
|
||||
tst w17,#ARMV7_NEON
|
||||
|
||||
adr $d0,poly1305_blocks
|
||||
adr $r0,poly1305_blocks_neon
|
||||
adr $d1,poly1305_emit
|
||||
adr $r1,poly1305_emit_neon
|
||||
|
||||
csel $d0,$d0,$r0,eq
|
||||
csel $d1,$d1,$r1,eq
|
||||
|
||||
# ifdef __ILP32__
|
||||
stp w12,w13,[$len]
|
||||
# else
|
||||
stp $d0,$d1,[$len]
|
||||
# endif
|
||||
|
||||
mov x0,#1
|
||||
#else
|
||||
mov x0,#0
|
||||
#endif
|
||||
.Lno_key:
|
||||
ret
|
||||
.size poly1305_init,.-poly1305_init
|
||||
|
||||
.type poly1305_blocks,%function
|
||||
.align 5
|
||||
poly1305_blocks:
|
||||
ands $len,$len,#-16
|
||||
b.eq .Lno_data
|
||||
|
||||
ldp $h0,$h1,[$ctx] // load hash value
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
ldr $h2,[$ctx,#16]
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
b .Loop
|
||||
|
||||
.align 5
|
||||
.Loop:
|
||||
ldp $t0,$t1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
#ifdef __AARCH64EB__
|
||||
rev $t0,$t0
|
||||
rev $t1,$t1
|
||||
#endif
|
||||
adds $h0,$h0,$t0 // accumulate input
|
||||
adcs $h1,$h1,$t1
|
||||
|
||||
mul $d0,$h0,$r0 // h0*r0
|
||||
adc $h2,$h2,$padbit
|
||||
umulh $d1,$h0,$r0
|
||||
|
||||
mul $t0,$h1,$s1 // h1*5*r1
|
||||
umulh $t1,$h1,$s1
|
||||
|
||||
adds $d0,$d0,$t0
|
||||
mul $t0,$h0,$r1 // h0*r1
|
||||
adc $d1,$d1,$t1
|
||||
umulh $d2,$h0,$r1
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h1,$r0 // h1*r0
|
||||
adc $d2,$d2,xzr
|
||||
umulh $t1,$h1,$r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h2,$s1 // h2*5*r1
|
||||
adc $d2,$d2,$t1
|
||||
mul $t1,$h2,$r0 // h2*r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
adc $d2,$d2,$t1
|
||||
|
||||
and $t0,$d2,#-4 // final reduction
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$d0,$t0
|
||||
adcs $h1,$d1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
cbnz $len,.Loop
|
||||
|
||||
stp $h0,$h1,[$ctx] // store hash value
|
||||
str $h2,[$ctx,#16]
|
||||
|
||||
.Lno_data:
|
||||
ret
|
||||
.size poly1305_blocks,.-poly1305_blocks
|
||||
|
||||
.type poly1305_emit,%function
|
||||
.align 5
|
||||
poly1305_emit:
|
||||
ldp $h0,$h1,[$ctx] // load hash base 2^64
|
||||
ldr $h2,[$ctx,#16]
|
||||
ldp $t0,$t1,[$nonce] // load nonce
|
||||
|
||||
adds $d0,$h0,#5 // compare to modulus
|
||||
adcs $d1,$h1,xzr
|
||||
adc $d2,$h2,xzr
|
||||
|
||||
tst $d2,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel $h0,$h0,$d0,eq
|
||||
csel $h1,$h1,$d1,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror $t0,$t0,#32 // flip nonce words
|
||||
ror $t1,$t1,#32
|
||||
#endif
|
||||
adds $h0,$h0,$t0 // accumulate nonce
|
||||
adc $h1,$h1,$t1
|
||||
#ifdef __AARCH64EB__
|
||||
rev $h0,$h0 // flip output bytes
|
||||
rev $h1,$h1
|
||||
#endif
|
||||
stp $h0,$h1,[$mac] // write result
|
||||
|
||||
ret
|
||||
.size poly1305_emit,.-poly1305_emit
|
||||
___
|
||||
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
|
||||
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
|
||||
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
|
||||
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
|
||||
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
|
||||
my ($T0,$T1,$MASK) = map("v$_",(29..31));
|
||||
|
||||
my ($in2,$zeros)=("x16","x17");
|
||||
my $is_base2_26 = $zeros; # borrow
|
||||
|
||||
$code.=<<___;
|
||||
.type __poly1305_mult,%function
|
||||
.align 5
|
||||
__poly1305_mult:
|
||||
mul $d0,$h0,$r0 // h0*r0
|
||||
umulh $d1,$h0,$r0
|
||||
|
||||
mul $t0,$h1,$s1 // h1*5*r1
|
||||
umulh $t1,$h1,$s1
|
||||
|
||||
adds $d0,$d0,$t0
|
||||
mul $t0,$h0,$r1 // h0*r1
|
||||
adc $d1,$d1,$t1
|
||||
umulh $d2,$h0,$r1
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h1,$r0 // h1*r0
|
||||
adc $d2,$d2,xzr
|
||||
umulh $t1,$h1,$r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h2,$s1 // h2*5*r1
|
||||
adc $d2,$d2,$t1
|
||||
mul $t1,$h2,$r0 // h2*r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
adc $d2,$d2,$t1
|
||||
|
||||
and $t0,$d2,#-4 // final reduction
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$d0,$t0
|
||||
adcs $h1,$d1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
ret
|
||||
.size __poly1305_mult,.-__poly1305_mult
|
||||
|
||||
.type __poly1305_splat,%function
|
||||
.align 5
|
||||
__poly1305_splat:
|
||||
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x13,$h0,#26,#26
|
||||
extr x14,$h1,$h0,#52
|
||||
and x14,x14,#0x03ffffff
|
||||
ubfx x15,$h1,#14,#26
|
||||
extr x16,$h2,$h1,#40
|
||||
|
||||
str w12,[$ctx,#16*0] // r0
|
||||
add w12,w13,w13,lsl#2 // r1*5
|
||||
str w13,[$ctx,#16*1] // r1
|
||||
add w13,w14,w14,lsl#2 // r2*5
|
||||
str w12,[$ctx,#16*2] // s1
|
||||
str w14,[$ctx,#16*3] // r2
|
||||
add w14,w15,w15,lsl#2 // r3*5
|
||||
str w13,[$ctx,#16*4] // s2
|
||||
str w15,[$ctx,#16*5] // r3
|
||||
add w15,w16,w16,lsl#2 // r4*5
|
||||
str w14,[$ctx,#16*6] // s3
|
||||
str w16,[$ctx,#16*7] // r4
|
||||
str w15,[$ctx,#16*8] // s4
|
||||
|
||||
ret
|
||||
.size __poly1305_splat,.-__poly1305_splat
|
||||
|
||||
.type poly1305_blocks_neon,%function
|
||||
.align 5
|
||||
poly1305_blocks_neon:
|
||||
ldr $is_base2_26,[$ctx,#24]
|
||||
cmp $len,#128
|
||||
b.hs .Lblocks_neon
|
||||
cbz $is_base2_26,poly1305_blocks
|
||||
|
||||
.Lblocks_neon:
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
|
||||
ands $len,$len,#-16
|
||||
b.eq .Lno_data_neon
|
||||
|
||||
cbz $is_base2_26,.Lbase2_64_neon
|
||||
|
||||
ldp w10,w11,[$ctx] // load hash value base 2^26
|
||||
ldp w12,w13,[$ctx,#8]
|
||||
ldr w14,[$ctx,#16]
|
||||
|
||||
tst $len,#31
|
||||
b.eq .Leven_neon
|
||||
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
|
||||
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr $h1,x12,#12
|
||||
adds $h0,$h0,x12,lsl#52
|
||||
add $h1,$h1,x13,lsl#14
|
||||
adc $h1,$h1,xzr
|
||||
lsr $h2,x14,#24
|
||||
adds $h1,$h1,x14,lsl#40
|
||||
adc $d2,$h2,xzr // can be partially reduced...
|
||||
|
||||
ldp $d0,$d1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
|
||||
and $t0,$d2,#-4 // ... so reduce
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$h0,$t0
|
||||
adcs $h1,$h1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev $d0,$d0
|
||||
rev $d1,$d1
|
||||
#endif
|
||||
adds $h0,$h0,$d0 // accumulate input
|
||||
adcs $h1,$h1,$d1
|
||||
adc $h2,$h2,$padbit
|
||||
|
||||
bl __poly1305_mult
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
cbz $padbit,.Lstore_base2_64_neon
|
||||
|
||||
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,$h0,#26,#26
|
||||
extr x12,$h1,$h0,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,$h1,#14,#26
|
||||
extr x14,$h2,$h1,#40
|
||||
|
||||
cbnz $len,.Leven_neon
|
||||
|
||||
stp w10,w11,[$ctx] // store hash value base 2^26
|
||||
stp w12,w13,[$ctx,#8]
|
||||
str w14,[$ctx,#16]
|
||||
b .Lno_data_neon
|
||||
|
||||
.align 4
|
||||
.Lstore_base2_64_neon:
|
||||
stp $h0,$h1,[$ctx] // store hash value base 2^64
|
||||
stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
|
||||
b .Lno_data_neon
|
||||
|
||||
.align 4
|
||||
.Lbase2_64_neon:
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
|
||||
ldp $h0,$h1,[$ctx] // load hash value base 2^64
|
||||
ldr $h2,[$ctx,#16]
|
||||
|
||||
tst $len,#31
|
||||
b.eq .Linit_neon
|
||||
|
||||
ldp $d0,$d1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
#ifdef __AARCH64EB__
|
||||
rev $d0,$d0
|
||||
rev $d1,$d1
|
||||
#endif
|
||||
adds $h0,$h0,$d0 // accumulate input
|
||||
adcs $h1,$h1,$d1
|
||||
adc $h2,$h2,$padbit
|
||||
|
||||
bl __poly1305_mult
|
||||
|
||||
.Linit_neon:
|
||||
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,$h0,#26,#26
|
||||
extr x12,$h1,$h0,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,$h1,#14,#26
|
||||
extr x14,$h2,$h1,#40
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
fmov ${H0},x10
|
||||
fmov ${H1},x11
|
||||
fmov ${H2},x12
|
||||
fmov ${H3},x13
|
||||
fmov ${H4},x14
|
||||
|
||||
////////////////////////////////// initialize r^n table
|
||||
mov $h0,$r0 // r^1
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
mov $h1,$r1
|
||||
mov $h2,xzr
|
||||
add $ctx,$ctx,#48+12
|
||||
bl __poly1305_splat
|
||||
|
||||
bl __poly1305_mult // r^2
|
||||
sub $ctx,$ctx,#4
|
||||
bl __poly1305_splat
|
||||
|
||||
bl __poly1305_mult // r^3
|
||||
sub $ctx,$ctx,#4
|
||||
bl __poly1305_splat
|
||||
|
||||
bl __poly1305_mult // r^4
|
||||
sub $ctx,$ctx,#4
|
||||
bl __poly1305_splat
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
add $in2,$inp,#32
|
||||
adr $zeros,.Lzeros
|
||||
subs $len,$len,#64
|
||||
csel $in2,$zeros,$in2,lo
|
||||
|
||||
mov x4,#1
|
||||
str x4,[$ctx,#-24] // set is_base2_26
|
||||
sub $ctx,$ctx,#48 // restore original $ctx
|
||||
b .Ldo_neon
|
||||
|
||||
.align 4
|
||||
.Leven_neon:
|
||||
add $in2,$inp,#32
|
||||
adr $zeros,.Lzeros
|
||||
subs $len,$len,#64
|
||||
csel $in2,$zeros,$in2,lo
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
fmov ${H0},x10
|
||||
fmov ${H1},x11
|
||||
fmov ${H2},x12
|
||||
fmov ${H3},x13
|
||||
fmov ${H4},x14
|
||||
|
||||
.Ldo_neon:
|
||||
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
||||
ldp x9,x13,[$in2],#48
|
||||
|
||||
lsl $padbit,$padbit,#24
|
||||
add x15,$ctx,#48
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov $IN23_0,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,$padbit,x12,lsr#40
|
||||
add x13,$padbit,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov $IN23_1,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
fmov $IN23_2,x8
|
||||
fmov $IN23_3,x10
|
||||
fmov $IN23_4,x12
|
||||
|
||||
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||
ldp x9,x13,[$inp],#48
|
||||
|
||||
ld1 {$R0,$R1,$S1,$R2},[x15],#64
|
||||
ld1 {$S2,$R3,$S3,$R4},[x15],#64
|
||||
ld1 {$S4},[x15]
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov $IN01_0,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,$padbit,x12,lsr#40
|
||||
add x13,$padbit,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov $IN01_1,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
movi $MASK.2d,#-1
|
||||
fmov $IN01_2,x8
|
||||
fmov $IN01_3,x10
|
||||
fmov $IN01_4,x12
|
||||
ushr $MASK.2d,$MASK.2d,#38
|
||||
|
||||
b.ls .Lskip_loop
|
||||
|
||||
.align 4
|
||||
.Loop_neon:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||
// \___________________/
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||
// \___________________/ \____________________/
|
||||
//
|
||||
// Note that we start with inp[2:3]*r^2. This is because it
|
||||
// doesn't depend on reduction in previous iteration.
|
||||
////////////////////////////////////////////////////////////////
|
||||
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||
|
||||
subs $len,$len,#64
|
||||
umull $ACC4,$IN23_0,${R4}[2]
|
||||
csel $in2,$zeros,$in2,lo
|
||||
umull $ACC3,$IN23_0,${R3}[2]
|
||||
umull $ACC2,$IN23_0,${R2}[2]
|
||||
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
||||
umull $ACC1,$IN23_0,${R1}[2]
|
||||
ldp x9,x13,[$in2],#48
|
||||
umull $ACC0,$IN23_0,${R0}[2]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
umlal $ACC4,$IN23_1,${R3}[2]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal $ACC3,$IN23_1,${R2}[2]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal $ACC2,$IN23_1,${R1}[2]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal $ACC1,$IN23_1,${R0}[2]
|
||||
ubfx x7,x9,#26,#26
|
||||
umlal $ACC0,$IN23_1,${S4}[2]
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
|
||||
umlal $ACC4,$IN23_2,${R2}[2]
|
||||
extr x8,x12,x8,#52
|
||||
umlal $ACC3,$IN23_2,${R1}[2]
|
||||
extr x9,x13,x9,#52
|
||||
umlal $ACC2,$IN23_2,${R0}[2]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal $ACC1,$IN23_2,${S4}[2]
|
||||
fmov $IN23_0,x4
|
||||
umlal $ACC0,$IN23_2,${S3}[2]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
umlal $ACC4,$IN23_3,${R1}[2]
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal $ACC3,$IN23_3,${R0}[2]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal $ACC2,$IN23_3,${S4}[2]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal $ACC1,$IN23_3,${S3}[2]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal $ACC0,$IN23_3,${S2}[2]
|
||||
fmov $IN23_1,x6
|
||||
|
||||
add $IN01_2,$IN01_2,$H2
|
||||
add x12,$padbit,x12,lsr#40
|
||||
umlal $ACC4,$IN23_4,${R0}[2]
|
||||
add x13,$padbit,x13,lsr#40
|
||||
umlal $ACC3,$IN23_4,${S4}[2]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal $ACC2,$IN23_4,${S3}[2]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal $ACC1,$IN23_4,${S2}[2]
|
||||
fmov $IN23_2,x8
|
||||
umlal $ACC0,$IN23_4,${S1}[2]
|
||||
fmov $IN23_3,x10
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4 and accumulate
|
||||
|
||||
add $IN01_0,$IN01_0,$H0
|
||||
fmov $IN23_4,x12
|
||||
umlal $ACC3,$IN01_2,${R1}[0]
|
||||
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||
umlal $ACC0,$IN01_2,${S3}[0]
|
||||
ldp x9,x13,[$inp],#48
|
||||
umlal $ACC4,$IN01_2,${R2}[0]
|
||||
umlal $ACC1,$IN01_2,${S4}[0]
|
||||
umlal $ACC2,$IN01_2,${R0}[0]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
add $IN01_1,$IN01_1,$H1
|
||||
umlal $ACC3,$IN01_0,${R3}[0]
|
||||
umlal $ACC4,$IN01_0,${R4}[0]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal $ACC2,$IN01_0,${R2}[0]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal $ACC0,$IN01_0,${R0}[0]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal $ACC1,$IN01_0,${R1}[0]
|
||||
ubfx x7,x9,#26,#26
|
||||
|
||||
add $IN01_3,$IN01_3,$H3
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
umlal $ACC3,$IN01_1,${R2}[0]
|
||||
extr x8,x12,x8,#52
|
||||
umlal $ACC4,$IN01_1,${R3}[0]
|
||||
extr x9,x13,x9,#52
|
||||
umlal $ACC0,$IN01_1,${S4}[0]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal $ACC2,$IN01_1,${R1}[0]
|
||||
fmov $IN01_0,x4
|
||||
umlal $ACC1,$IN01_1,${R0}[0]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
add $IN01_4,$IN01_4,$H4
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal $ACC3,$IN01_3,${R0}[0]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal $ACC0,$IN01_3,${S2}[0]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal $ACC4,$IN01_3,${R1}[0]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal $ACC1,$IN01_3,${S3}[0]
|
||||
fmov $IN01_1,x6
|
||||
umlal $ACC2,$IN01_3,${S4}[0]
|
||||
add x12,$padbit,x12,lsr#40
|
||||
|
||||
umlal $ACC3,$IN01_4,${S4}[0]
|
||||
add x13,$padbit,x13,lsr#40
|
||||
umlal $ACC0,$IN01_4,${S1}[0]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal $ACC4,$IN01_4,${R0}[0]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal $ACC1,$IN01_4,${S2}[0]
|
||||
fmov $IN01_2,x8
|
||||
umlal $ACC2,$IN01_4,${S3}[0]
|
||||
fmov $IN01_3,x10
|
||||
fmov $IN01_4,x12
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||
// and P. Schwabe
|
||||
//
|
||||
// [see discussion in poly1305-armv4 module]
|
||||
|
||||
ushr $T0.2d,$ACC3,#26
|
||||
xtn $H3,$ACC3
|
||||
ushr $T1.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
|
||||
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||
|
||||
ushr $T0.2d,$ACC4,#26
|
||||
xtn $H4,$ACC4
|
||||
ushr $T1.2d,$ACC1,#26
|
||||
xtn $H1,$ACC1
|
||||
bic $H4,#0xfc,lsl#24
|
||||
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||
|
||||
add $ACC0,$ACC0,$T0.2d
|
||||
shl $T0.2d,$T0.2d,#2
|
||||
shrn $T1.2s,$ACC2,#26
|
||||
xtn $H2,$ACC2
|
||||
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||
bic $H1,#0xfc,lsl#24
|
||||
add $H3,$H3,$T1.2s // h2 -> h3
|
||||
bic $H2,#0xfc,lsl#24
|
||||
|
||||
shrn $T0.2s,$ACC0,#26
|
||||
xtn $H0,$ACC0
|
||||
ushr $T1.2s,$H3,#26
|
||||
bic $H3,#0xfc,lsl#24
|
||||
bic $H0,#0xfc,lsl#24
|
||||
add $H1,$H1,$T0.2s // h0 -> h1
|
||||
add $H4,$H4,$T1.2s // h3 -> h4
|
||||
|
||||
b.hi .Loop_neon
|
||||
|
||||
.Lskip_loop:
|
||||
dup $IN23_2,${IN23_2}[0]
|
||||
add $IN01_2,$IN01_2,$H2
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||
|
||||
adds $len,$len,#32
|
||||
b.ne .Long_tail
|
||||
|
||||
dup $IN23_2,${IN01_2}[0]
|
||||
add $IN23_0,$IN01_0,$H0
|
||||
add $IN23_3,$IN01_3,$H3
|
||||
add $IN23_1,$IN01_1,$H1
|
||||
add $IN23_4,$IN01_4,$H4
|
||||
|
||||
.Long_tail:
|
||||
dup $IN23_0,${IN23_0}[0]
|
||||
umull2 $ACC0,$IN23_2,${S3}
|
||||
umull2 $ACC3,$IN23_2,${R1}
|
||||
umull2 $ACC4,$IN23_2,${R2}
|
||||
umull2 $ACC2,$IN23_2,${R0}
|
||||
umull2 $ACC1,$IN23_2,${S4}
|
||||
|
||||
dup $IN23_1,${IN23_1}[0]
|
||||
umlal2 $ACC0,$IN23_0,${R0}
|
||||
umlal2 $ACC2,$IN23_0,${R2}
|
||||
umlal2 $ACC3,$IN23_0,${R3}
|
||||
umlal2 $ACC4,$IN23_0,${R4}
|
||||
umlal2 $ACC1,$IN23_0,${R1}
|
||||
|
||||
dup $IN23_3,${IN23_3}[0]
|
||||
umlal2 $ACC0,$IN23_1,${S4}
|
||||
umlal2 $ACC3,$IN23_1,${R2}
|
||||
umlal2 $ACC2,$IN23_1,${R1}
|
||||
umlal2 $ACC4,$IN23_1,${R3}
|
||||
umlal2 $ACC1,$IN23_1,${R0}
|
||||
|
||||
dup $IN23_4,${IN23_4}[0]
|
||||
umlal2 $ACC3,$IN23_3,${R0}
|
||||
umlal2 $ACC4,$IN23_3,${R1}
|
||||
umlal2 $ACC0,$IN23_3,${S2}
|
||||
umlal2 $ACC1,$IN23_3,${S3}
|
||||
umlal2 $ACC2,$IN23_3,${S4}
|
||||
|
||||
umlal2 $ACC3,$IN23_4,${S4}
|
||||
umlal2 $ACC0,$IN23_4,${S1}
|
||||
umlal2 $ACC4,$IN23_4,${R0}
|
||||
umlal2 $ACC1,$IN23_4,${S2}
|
||||
umlal2 $ACC2,$IN23_4,${S3}
|
||||
|
||||
b.eq .Lshort_tail
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||
|
||||
add $IN01_0,$IN01_0,$H0
|
||||
umlal $ACC3,$IN01_2,${R1}
|
||||
umlal $ACC0,$IN01_2,${S3}
|
||||
umlal $ACC4,$IN01_2,${R2}
|
||||
umlal $ACC1,$IN01_2,${S4}
|
||||
umlal $ACC2,$IN01_2,${R0}
|
||||
|
||||
add $IN01_1,$IN01_1,$H1
|
||||
umlal $ACC3,$IN01_0,${R3}
|
||||
umlal $ACC0,$IN01_0,${R0}
|
||||
umlal $ACC4,$IN01_0,${R4}
|
||||
umlal $ACC1,$IN01_0,${R1}
|
||||
umlal $ACC2,$IN01_0,${R2}
|
||||
|
||||
add $IN01_3,$IN01_3,$H3
|
||||
umlal $ACC3,$IN01_1,${R2}
|
||||
umlal $ACC0,$IN01_1,${S4}
|
||||
umlal $ACC4,$IN01_1,${R3}
|
||||
umlal $ACC1,$IN01_1,${R0}
|
||||
umlal $ACC2,$IN01_1,${R1}
|
||||
|
||||
add $IN01_4,$IN01_4,$H4
|
||||
umlal $ACC3,$IN01_3,${R0}
|
||||
umlal $ACC0,$IN01_3,${S2}
|
||||
umlal $ACC4,$IN01_3,${R1}
|
||||
umlal $ACC1,$IN01_3,${S3}
|
||||
umlal $ACC2,$IN01_3,${S4}
|
||||
|
||||
umlal $ACC3,$IN01_4,${S4}
|
||||
umlal $ACC0,$IN01_4,${S1}
|
||||
umlal $ACC4,$IN01_4,${R0}
|
||||
umlal $ACC1,$IN01_4,${S2}
|
||||
umlal $ACC2,$IN01_4,${S3}
|
||||
|
||||
.Lshort_tail:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// horizontal add
|
||||
|
||||
addp $ACC3,$ACC3,$ACC3
|
||||
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||
addp $ACC0,$ACC0,$ACC0
|
||||
ldp d10,d11,[sp,#32]
|
||||
addp $ACC4,$ACC4,$ACC4
|
||||
ldp d12,d13,[sp,#48]
|
||||
addp $ACC1,$ACC1,$ACC1
|
||||
ldp d14,d15,[sp,#64]
|
||||
addp $ACC2,$ACC2,$ACC2
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// lazy reduction, but without narrowing
|
||||
|
||||
ushr $T0.2d,$ACC3,#26
|
||||
and $ACC3,$ACC3,$MASK.2d
|
||||
ushr $T1.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
|
||||
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||
|
||||
ushr $T0.2d,$ACC4,#26
|
||||
and $ACC4,$ACC4,$MASK.2d
|
||||
ushr $T1.2d,$ACC1,#26
|
||||
and $ACC1,$ACC1,$MASK.2d
|
||||
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||
|
||||
add $ACC0,$ACC0,$T0.2d
|
||||
shl $T0.2d,$T0.2d,#2
|
||||
ushr $T1.2d,$ACC2,#26
|
||||
and $ACC2,$ACC2,$MASK.2d
|
||||
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||
add $ACC3,$ACC3,$T1.2d // h2 -> h3
|
||||
|
||||
ushr $T0.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
ushr $T1.2d,$ACC3,#26
|
||||
and $ACC3,$ACC3,$MASK.2d
|
||||
add $ACC1,$ACC1,$T0.2d // h0 -> h1
|
||||
add $ACC4,$ACC4,$T1.2d // h3 -> h4
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// write the result, can be partially reduced
|
||||
|
||||
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
|
||||
st1 {$ACC4}[0],[$ctx]
|
||||
|
||||
.Lno_data_neon:
|
||||
ldr x29,[sp],#80
|
||||
ret
|
||||
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||
|
||||
.type poly1305_emit_neon,%function
|
||||
.align 5
|
||||
poly1305_emit_neon:
|
||||
ldr $is_base2_26,[$ctx,#24]
|
||||
cbz $is_base2_26,poly1305_emit
|
||||
|
||||
ldp w10,w11,[$ctx] // load hash value base 2^26
|
||||
ldp w12,w13,[$ctx,#8]
|
||||
ldr w14,[$ctx,#16]
|
||||
|
||||
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr $h1,x12,#12
|
||||
adds $h0,$h0,x12,lsl#52
|
||||
add $h1,$h1,x13,lsl#14
|
||||
adc $h1,$h1,xzr
|
||||
lsr $h2,x14,#24
|
||||
adds $h1,$h1,x14,lsl#40
|
||||
adc $h2,$h2,xzr // can be partially reduced...
|
||||
|
||||
ldp $t0,$t1,[$nonce] // load nonce
|
||||
|
||||
and $d0,$h2,#-4 // ... so reduce
|
||||
add $d0,$d0,$h2,lsr#2
|
||||
and $h2,$h2,#3
|
||||
adds $h0,$h0,$d0
|
||||
adcs $h1,$h1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
adds $d0,$h0,#5 // compare to modulus
|
||||
adcs $d1,$h1,xzr
|
||||
adc $d2,$h2,xzr
|
||||
|
||||
tst $d2,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel $h0,$h0,$d0,eq
|
||||
csel $h1,$h1,$d1,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror $t0,$t0,#32 // flip nonce words
|
||||
ror $t1,$t1,#32
|
||||
#endif
|
||||
adds $h0,$h0,$t0 // accumulate nonce
|
||||
adc $h1,$h1,$t1
|
||||
#ifdef __AARCH64EB__
|
||||
rev $h0,$h0 // flip output bytes
|
||||
rev $h1,$h1
|
||||
#endif
|
||||
stp $h0,$h1,[$mac] // write result
|
||||
|
||||
ret
|
||||
.size poly1305_emit_neon,.-poly1305_emit_neon
|
||||
|
||||
.align 5
|
||||
.Lzeros:
|
||||
.long 0,0,0,0,0,0,0,0
|
||||
#ifndef __KERNEL__
|
||||
.LOPENSSL_armcap_P:
|
||||
#ifdef __ILP32__
|
||||
.long OPENSSL_armcap_P-.
|
||||
#else
|
||||
.quad OPENSSL_armcap_P-.
|
||||
#endif
|
||||
#endif
|
||||
.align 2
|
||||
___
|
||||
|
||||
open SELF,$0;
|
||||
while(<SELF>) {
|
||||
next if (/^#!/);
|
||||
last if (!s/^#/\/\// and !/^$/);
|
||||
print;
|
||||
}
|
||||
close SELF;
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
|
||||
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
|
||||
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
|
||||
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
|
||||
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
|
||||
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
|
||||
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
|
||||
|
||||
s/\.[124]([sd])\[/.$1\[/;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT;
|
||||
@@ -1,360 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
|
||||
/*
|
||||
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
|
||||
*
|
||||
* This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
|
||||
*/
|
||||
|
||||
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \
|
||||
defined(_MIPS_ARCH_MIPS64R6)) && !defined(_MIPS_ARCH_MIPS64R2)
|
||||
#define _MIPS_ARCH_MIPS64R2
|
||||
#endif
|
||||
|
||||
#ifdef __MIPSEB__
|
||||
#define MSB 0
|
||||
#define LSB 7
|
||||
#else
|
||||
#define MSB 7
|
||||
#define LSB 0
|
||||
#endif
|
||||
|
||||
#if defined(_MIPS_ARCH_MIPS64R6)
|
||||
#define dmultu(rs,rt)
|
||||
#define mflo(rd,rs,rt) dmulu rd,rs,rt
|
||||
#define mfhi(rd,rs,rt) dmuhu rd,rs,rt
|
||||
#else
|
||||
#define dmultu(rs,rt) dmultu rs,rt
|
||||
#define multu(rs,rt) multu rs,rt
|
||||
#define mflo(rd,rs,rt) mflo rd
|
||||
#define mfhi(rd,rs,rt) mfhi rd
|
||||
#endif
|
||||
|
||||
.text
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
/* While most of the assembly in the kernel prefers ENTRY() and ENDPROC(),
|
||||
* there is no existing MIPS assembly that uses it, and MIPS assembler seems
|
||||
* to like its own .ent/.end notation, which the MIPS include files don't
|
||||
* provide in a MIPS-specific ENTRY/ENDPROC definition. So, we skip these
|
||||
* for now, until somebody complains. */
|
||||
|
||||
.align 5
|
||||
.globl poly1305_init_mips
|
||||
.ent poly1305_init_mips
|
||||
poly1305_init_mips:
|
||||
.frame $29,0,$31
|
||||
.set reorder
|
||||
|
||||
sd $0,0($4)
|
||||
sd $0,8($4)
|
||||
sd $0,16($4)
|
||||
|
||||
beqz $5,.Lno_key
|
||||
|
||||
#if defined(_MIPS_ARCH_MIPS64R6)
|
||||
ld $8,0($5)
|
||||
ld $9,8($5)
|
||||
#else
|
||||
ldl $8,0+MSB($5)
|
||||
ldl $9,8+MSB($5)
|
||||
ldr $8,0+LSB($5)
|
||||
ldr $9,8+LSB($5)
|
||||
#endif
|
||||
#ifdef __MIPSEB__
|
||||
#if defined(_MIPS_ARCH_MIPS64R2)
|
||||
dsbh $8,$8 # byte swap
|
||||
dsbh $9,$9
|
||||
dshd $8,$8
|
||||
dshd $9,$9
|
||||
#else
|
||||
ori $10,$0,0xFF
|
||||
dsll $1,$10,32
|
||||
or $10,$1 # 0x000000FF000000FF
|
||||
|
||||
and $11,$8,$10 # byte swap
|
||||
and $2,$9,$10
|
||||
dsrl $1,$8,24
|
||||
dsrl $24,$9,24
|
||||
dsll $11,24
|
||||
dsll $2,24
|
||||
and $1,$10
|
||||
and $24,$10
|
||||
dsll $10,8 # 0x0000FF000000FF00
|
||||
or $11,$1
|
||||
or $2,$24
|
||||
and $1,$8,$10
|
||||
and $24,$9,$10
|
||||
dsrl $8,8
|
||||
dsrl $9,8
|
||||
dsll $1,8
|
||||
dsll $24,8
|
||||
and $8,$10
|
||||
and $9,$10
|
||||
or $11,$1
|
||||
or $2,$24
|
||||
or $8,$11
|
||||
or $9,$2
|
||||
dsrl $11,$8,32
|
||||
dsrl $2,$9,32
|
||||
dsll $8,32
|
||||
dsll $9,32
|
||||
or $8,$11
|
||||
or $9,$2
|
||||
#endif
|
||||
#endif
|
||||
li $10,1
|
||||
dsll $10,32
|
||||
daddiu $10,-63
|
||||
dsll $10,28
|
||||
daddiu $10,-1 # 0ffffffc0fffffff
|
||||
|
||||
and $8,$10
|
||||
daddiu $10,-3 # 0ffffffc0ffffffc
|
||||
and $9,$10
|
||||
|
||||
sd $8,24($4)
|
||||
dsrl $10,$9,2
|
||||
sd $9,32($4)
|
||||
daddu $10,$9 # s1 = r1 + (r1 >> 2)
|
||||
sd $10,40($4)
|
||||
|
||||
.Lno_key:
|
||||
li $2,0 # return 0
|
||||
jr $31
|
||||
.end poly1305_init_mips
|
||||
|
||||
.align 5
|
||||
.globl poly1305_blocks_mips
|
||||
.ent poly1305_blocks_mips
|
||||
poly1305_blocks_mips:
|
||||
.set noreorder
|
||||
dsrl $6,4 # number of complete blocks
|
||||
bnez $6,poly1305_blocks_internal
|
||||
nop
|
||||
jr $31
|
||||
nop
|
||||
.end poly1305_blocks_mips
|
||||
|
||||
.align 5
|
||||
.ent poly1305_blocks_internal
|
||||
poly1305_blocks_internal:
|
||||
.frame $29,6*8,$31
|
||||
.mask 0x00030000,-8
|
||||
.set noreorder
|
||||
dsubu $29,6*8
|
||||
sd $17,40($29)
|
||||
sd $16,32($29)
|
||||
.set reorder
|
||||
|
||||
ld $12,0($4) # load hash value
|
||||
ld $13,8($4)
|
||||
ld $14,16($4)
|
||||
|
||||
ld $15,24($4) # load key
|
||||
ld $16,32($4)
|
||||
ld $17,40($4)
|
||||
|
||||
.Loop:
|
||||
#if defined(_MIPS_ARCH_MIPS64R6)
|
||||
ld $8,0($5) # load input
|
||||
ld $9,8($5)
|
||||
#else
|
||||
ldl $8,0+MSB($5) # load input
|
||||
ldl $9,8+MSB($5)
|
||||
ldr $8,0+LSB($5)
|
||||
ldr $9,8+LSB($5)
|
||||
#endif
|
||||
daddiu $6,-1
|
||||
daddiu $5,16
|
||||
#ifdef __MIPSEB__
|
||||
#if defined(_MIPS_ARCH_MIPS64R2)
|
||||
dsbh $8,$8 # byte swap
|
||||
dsbh $9,$9
|
||||
dshd $8,$8
|
||||
dshd $9,$9
|
||||
#else
|
||||
ori $10,$0,0xFF
|
||||
dsll $1,$10,32
|
||||
or $10,$1 # 0x000000FF000000FF
|
||||
|
||||
and $11,$8,$10 # byte swap
|
||||
and $2,$9,$10
|
||||
dsrl $1,$8,24
|
||||
dsrl $24,$9,24
|
||||
dsll $11,24
|
||||
dsll $2,24
|
||||
and $1,$10
|
||||
and $24,$10
|
||||
dsll $10,8 # 0x0000FF000000FF00
|
||||
or $11,$1
|
||||
or $2,$24
|
||||
and $1,$8,$10
|
||||
and $24,$9,$10
|
||||
dsrl $8,8
|
||||
dsrl $9,8
|
||||
dsll $1,8
|
||||
dsll $24,8
|
||||
and $8,$10
|
||||
and $9,$10
|
||||
or $11,$1
|
||||
or $2,$24
|
||||
or $8,$11
|
||||
or $9,$2
|
||||
dsrl $11,$8,32
|
||||
dsrl $2,$9,32
|
||||
dsll $8,32
|
||||
dsll $9,32
|
||||
or $8,$11
|
||||
or $9,$2
|
||||
#endif
|
||||
#endif
|
||||
daddu $12,$8 # accumulate input
|
||||
daddu $13,$9
|
||||
sltu $10,$12,$8
|
||||
sltu $11,$13,$9
|
||||
daddu $13,$10
|
||||
|
||||
dmultu ($15,$12) # h0*r0
|
||||
daddu $14,$7
|
||||
sltu $10,$13,$10
|
||||
mflo ($8,$15,$12)
|
||||
mfhi ($9,$15,$12)
|
||||
|
||||
dmultu ($17,$13) # h1*5*r1
|
||||
daddu $10,$11
|
||||
daddu $14,$10
|
||||
mflo ($10,$17,$13)
|
||||
mfhi ($11,$17,$13)
|
||||
|
||||
dmultu ($16,$12) # h0*r1
|
||||
daddu $8,$10
|
||||
daddu $9,$11
|
||||
mflo ($1,$16,$12)
|
||||
mfhi ($25,$16,$12)
|
||||
sltu $10,$8,$10
|
||||
daddu $9,$10
|
||||
|
||||
dmultu ($15,$13) # h1*r0
|
||||
daddu $9,$1
|
||||
sltu $1,$9,$1
|
||||
mflo ($10,$15,$13)
|
||||
mfhi ($11,$15,$13)
|
||||
daddu $25,$1
|
||||
|
||||
dmultu ($17,$14) # h2*5*r1
|
||||
daddu $9,$10
|
||||
daddu $25,$11
|
||||
mflo ($1,$17,$14)
|
||||
|
||||
dmultu ($15,$14) # h2*r0
|
||||
sltu $10,$9,$10
|
||||
daddu $25,$10
|
||||
mflo ($2,$15,$14)
|
||||
|
||||
daddu $9,$1
|
||||
daddu $25,$2
|
||||
sltu $1,$9,$1
|
||||
daddu $25,$1
|
||||
|
||||
li $10,-4 # final reduction
|
||||
and $10,$25
|
||||
dsrl $11,$25,2
|
||||
andi $14,$25,3
|
||||
daddu $10,$11
|
||||
daddu $12,$8,$10
|
||||
sltu $10,$12,$10
|
||||
daddu $13,$9,$10
|
||||
sltu $10,$13,$10
|
||||
daddu $14,$14,$10
|
||||
|
||||
bnez $6,.Loop
|
||||
|
||||
sd $12,0($4) # store hash value
|
||||
sd $13,8($4)
|
||||
sd $14,16($4)
|
||||
|
||||
.set noreorder
|
||||
ld $17,40($29) # epilogue
|
||||
ld $16,32($29)
|
||||
jr $31
|
||||
daddu $29,6*8
|
||||
.end poly1305_blocks_internal
|
||||
|
||||
.align 5
|
||||
.globl poly1305_emit_mips
|
||||
.ent poly1305_emit_mips
|
||||
poly1305_emit_mips:
|
||||
.frame $29,0,$31
|
||||
.set reorder
|
||||
|
||||
ld $10,0($4)
|
||||
ld $11,8($4)
|
||||
ld $1,16($4)
|
||||
|
||||
daddiu $8,$10,5 # compare to modulus
|
||||
sltiu $2,$8,5
|
||||
daddu $9,$11,$2
|
||||
sltu $2,$9,$2
|
||||
daddu $1,$1,$2
|
||||
|
||||
dsrl $1,2 # see if it carried/borrowed
|
||||
dsubu $1,$0,$1
|
||||
nor $2,$0,$1
|
||||
|
||||
and $8,$1
|
||||
and $10,$2
|
||||
and $9,$1
|
||||
and $11,$2
|
||||
or $8,$10
|
||||
or $9,$11
|
||||
|
||||
lwu $10,0($6) # load nonce
|
||||
lwu $11,4($6)
|
||||
lwu $1,8($6)
|
||||
lwu $2,12($6)
|
||||
dsll $11,32
|
||||
dsll $2,32
|
||||
or $10,$11
|
||||
or $1,$2
|
||||
|
||||
daddu $8,$10 # accumulate nonce
|
||||
daddu $9,$1
|
||||
sltu $10,$8,$10
|
||||
daddu $9,$10
|
||||
|
||||
dsrl $10,$8,8 # write mac value
|
||||
dsrl $11,$8,16
|
||||
dsrl $1,$8,24
|
||||
sb $8,0($5)
|
||||
dsrl $2,$8,32
|
||||
sb $10,1($5)
|
||||
dsrl $10,$8,40
|
||||
sb $11,2($5)
|
||||
dsrl $11,$8,48
|
||||
sb $1,3($5)
|
||||
dsrl $1,$8,56
|
||||
sb $2,4($5)
|
||||
dsrl $2,$9,8
|
||||
sb $10,5($5)
|
||||
dsrl $10,$9,16
|
||||
sb $11,6($5)
|
||||
dsrl $11,$9,24
|
||||
sb $1,7($5)
|
||||
|
||||
sb $9,8($5)
|
||||
dsrl $1,$9,32
|
||||
sb $2,9($5)
|
||||
dsrl $2,$9,40
|
||||
sb $10,10($5)
|
||||
dsrl $10,$9,48
|
||||
sb $11,11($5)
|
||||
dsrl $11,$9,56
|
||||
sb $1,12($5)
|
||||
sb $2,13($5)
|
||||
sb $10,14($5)
|
||||
sb $11,15($5)
|
||||
|
||||
jr $31
|
||||
.end poly1305_emit_mips
|
||||
467
src/crypto/zinc/poly1305/poly1305-mips64.pl
Normal file
467
src/crypto/zinc/poly1305/poly1305-mips64.pl
Normal file
@@ -0,0 +1,467 @@
|
||||
#!/usr/bin/env perl
|
||||
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
#
|
||||
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
|
||||
# has relicensed it under the licenses specified in the SPDX header above.
|
||||
# The original headers, including the original license headers, are
|
||||
# included below for completeness.
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# Poly1305 hash for MIPS64.
|
||||
#
|
||||
# May 2016
|
||||
#
|
||||
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
||||
#
|
||||
# IALU/gcc
|
||||
# R1x000 5.64/+120% (big-endian)
|
||||
# Octeon II 3.80/+280% (little-endian)
|
||||
|
||||
######################################################################
|
||||
# There is a number of MIPS ABI in use, O32 and N32/64 are most
|
||||
# widely used. Then there is a new contender: NUBI. It appears that if
|
||||
# one picks the latter, it's possible to arrange code in ABI neutral
|
||||
# manner. Therefore let's stick to NUBI register layout:
|
||||
#
|
||||
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
|
||||
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
||||
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
|
||||
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
|
||||
#
|
||||
# The return value is placed in $a0. Following coding rules facilitate
|
||||
# interoperability:
|
||||
#
|
||||
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
|
||||
# excluded from the rule, because it's specified volatile];
|
||||
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
|
||||
# old code];
|
||||
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
|
||||
#
|
||||
# For reference here is register layout for N32/64 MIPS ABIs:
|
||||
#
|
||||
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
|
||||
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
||||
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
|
||||
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
|
||||
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
|
||||
#
|
||||
# <appro@openssl.org>
|
||||
#
|
||||
######################################################################
|
||||
|
||||
$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
|
||||
|
||||
die "MIPS64 only" unless ($flavour =~ /64|n32/i);
|
||||
|
||||
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
|
||||
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
|
||||
|
||||
($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
|
||||
($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
|
||||
|
||||
$code.=<<___;
|
||||
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
|
||||
defined(_MIPS_ARCH_MIPS64R6)) \\
|
||||
&& !defined(_MIPS_ARCH_MIPS64R2)
|
||||
# define _MIPS_ARCH_MIPS64R2
|
||||
#endif
|
||||
|
||||
#if defined(_MIPS_ARCH_MIPS64R6)
|
||||
# define dmultu(rs,rt)
|
||||
# define mflo(rd,rs,rt) dmulu rd,rs,rt
|
||||
# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
|
||||
#else
|
||||
# define dmultu(rs,rt) dmultu rs,rt
|
||||
# define mflo(rd,rs,rt) mflo rd
|
||||
# define mfhi(rd,rs,rt) mfhi rd
|
||||
#endif
|
||||
|
||||
#ifdef __KERNEL__
|
||||
# define poly1305_init poly1305_init_mips
|
||||
# define poly1305_blocks poly1305_blocks_mips
|
||||
# define poly1305_emit poly1305_emit_mips
|
||||
#endif
|
||||
|
||||
#if defined(__MIPSEB__) && !defined(MIPSEB)
|
||||
# define MIPSEB
|
||||
#endif
|
||||
|
||||
#ifdef MIPSEB
|
||||
# define MSB 0
|
||||
# define LSB 7
|
||||
#else
|
||||
# define MSB 7
|
||||
# define LSB 0
|
||||
#endif
|
||||
|
||||
.text
|
||||
.set noat
|
||||
.set noreorder
|
||||
|
||||
.align 5
|
||||
.globl poly1305_init
|
||||
.ent poly1305_init
|
||||
poly1305_init:
|
||||
.frame $sp,0,$ra
|
||||
.set reorder
|
||||
|
||||
sd $zero,0($ctx)
|
||||
sd $zero,8($ctx)
|
||||
sd $zero,16($ctx)
|
||||
|
||||
beqz $inp,.Lno_key
|
||||
|
||||
#if defined(_MIPS_ARCH_MIPS64R6)
|
||||
ld $in0,0($inp)
|
||||
ld $in1,8($inp)
|
||||
#else
|
||||
ldl $in0,0+MSB($inp)
|
||||
ldl $in1,8+MSB($inp)
|
||||
ldr $in0,0+LSB($inp)
|
||||
ldr $in1,8+LSB($inp)
|
||||
#endif
|
||||
#ifdef MIPSEB
|
||||
# if defined(_MIPS_ARCH_MIPS64R2)
|
||||
dsbh $in0,$in0 # byte swap
|
||||
dsbh $in1,$in1
|
||||
dshd $in0,$in0
|
||||
dshd $in1,$in1
|
||||
# else
|
||||
ori $tmp0,$zero,0xFF
|
||||
dsll $tmp2,$tmp0,32
|
||||
or $tmp0,$tmp2 # 0x000000FF000000FF
|
||||
|
||||
and $tmp1,$in0,$tmp0 # byte swap
|
||||
and $tmp3,$in1,$tmp0
|
||||
dsrl $tmp2,$in0,24
|
||||
dsrl $tmp4,$in1,24
|
||||
dsll $tmp1,24
|
||||
dsll $tmp3,24
|
||||
and $tmp2,$tmp0
|
||||
and $tmp4,$tmp0
|
||||
dsll $tmp0,8 # 0x0000FF000000FF00
|
||||
or $tmp1,$tmp2
|
||||
or $tmp3,$tmp4
|
||||
and $tmp2,$in0,$tmp0
|
||||
and $tmp4,$in1,$tmp0
|
||||
dsrl $in0,8
|
||||
dsrl $in1,8
|
||||
dsll $tmp2,8
|
||||
dsll $tmp4,8
|
||||
and $in0,$tmp0
|
||||
and $in1,$tmp0
|
||||
or $tmp1,$tmp2
|
||||
or $tmp3,$tmp4
|
||||
or $in0,$tmp1
|
||||
or $in1,$tmp3
|
||||
dsrl $tmp1,$in0,32
|
||||
dsrl $tmp3,$in1,32
|
||||
dsll $in0,32
|
||||
dsll $in1,32
|
||||
or $in0,$tmp1
|
||||
or $in1,$tmp3
|
||||
# endif
|
||||
#endif
|
||||
li $tmp0,1
|
||||
dsll $tmp0,32
|
||||
daddiu $tmp0,-63
|
||||
dsll $tmp0,28
|
||||
daddiu $tmp0,-1 # 0ffffffc0fffffff
|
||||
|
||||
and $in0,$tmp0
|
||||
daddiu $tmp0,-3 # 0ffffffc0ffffffc
|
||||
and $in1,$tmp0
|
||||
|
||||
sd $in0,24($ctx)
|
||||
dsrl $tmp0,$in1,2
|
||||
sd $in1,32($ctx)
|
||||
daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
|
||||
sd $tmp0,40($ctx)
|
||||
|
||||
.Lno_key:
|
||||
li $v0,0 # return 0
|
||||
jr $ra
|
||||
.end poly1305_init
|
||||
___
|
||||
{
|
||||
my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
|
||||
($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
|
||||
|
||||
$code.=<<___;
|
||||
.align 5
|
||||
.globl poly1305_blocks
|
||||
.ent poly1305_blocks
|
||||
poly1305_blocks:
|
||||
.set noreorder
|
||||
dsrl $len,4 # number of complete blocks
|
||||
bnez $len,poly1305_blocks_internal
|
||||
nop
|
||||
jr $ra
|
||||
nop
|
||||
.end poly1305_blocks
|
||||
|
||||
.align 5
|
||||
.ent poly1305_blocks_internal
|
||||
poly1305_blocks_internal:
|
||||
.frame $sp,6*8,$ra
|
||||
.mask $SAVED_REGS_MASK,-8
|
||||
.set noreorder
|
||||
dsubu $sp,6*8
|
||||
sd $s5,40($sp)
|
||||
sd $s4,32($sp)
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
|
||||
sd $s3,24($sp)
|
||||
sd $s2,16($sp)
|
||||
sd $s1,8($sp)
|
||||
sd $s0,0($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
.set reorder
|
||||
|
||||
ld $h0,0($ctx) # load hash value
|
||||
ld $h1,8($ctx)
|
||||
ld $h2,16($ctx)
|
||||
|
||||
ld $r0,24($ctx) # load key
|
||||
ld $r1,32($ctx)
|
||||
ld $s1,40($ctx)
|
||||
|
||||
.Loop:
|
||||
#if defined(_MIPS_ARCH_MIPS64R6)
|
||||
ld $in0,0($inp) # load input
|
||||
ld $in1,8($inp)
|
||||
#else
|
||||
ldl $in0,0+MSB($inp) # load input
|
||||
ldl $in1,8+MSB($inp)
|
||||
ldr $in0,0+LSB($inp)
|
||||
ldr $in1,8+LSB($inp)
|
||||
#endif
|
||||
daddiu $len,-1
|
||||
daddiu $inp,16
|
||||
#ifdef MIPSEB
|
||||
# if defined(_MIPS_ARCH_MIPS64R2)
|
||||
dsbh $in0,$in0 # byte swap
|
||||
dsbh $in1,$in1
|
||||
dshd $in0,$in0
|
||||
dshd $in1,$in1
|
||||
# else
|
||||
ori $tmp0,$zero,0xFF
|
||||
dsll $tmp2,$tmp0,32
|
||||
or $tmp0,$tmp2 # 0x000000FF000000FF
|
||||
|
||||
and $tmp1,$in0,$tmp0 # byte swap
|
||||
and $tmp3,$in1,$tmp0
|
||||
dsrl $tmp2,$in0,24
|
||||
dsrl $tmp4,$in1,24
|
||||
dsll $tmp1,24
|
||||
dsll $tmp3,24
|
||||
and $tmp2,$tmp0
|
||||
and $tmp4,$tmp0
|
||||
dsll $tmp0,8 # 0x0000FF000000FF00
|
||||
or $tmp1,$tmp2
|
||||
or $tmp3,$tmp4
|
||||
and $tmp2,$in0,$tmp0
|
||||
and $tmp4,$in1,$tmp0
|
||||
dsrl $in0,8
|
||||
dsrl $in1,8
|
||||
dsll $tmp2,8
|
||||
dsll $tmp4,8
|
||||
and $in0,$tmp0
|
||||
and $in1,$tmp0
|
||||
or $tmp1,$tmp2
|
||||
or $tmp3,$tmp4
|
||||
or $in0,$tmp1
|
||||
or $in1,$tmp3
|
||||
dsrl $tmp1,$in0,32
|
||||
dsrl $tmp3,$in1,32
|
||||
dsll $in0,32
|
||||
dsll $in1,32
|
||||
or $in0,$tmp1
|
||||
or $in1,$tmp3
|
||||
# endif
|
||||
#endif
|
||||
daddu $h0,$in0 # accumulate input
|
||||
daddu $h1,$in1
|
||||
sltu $tmp0,$h0,$in0
|
||||
sltu $tmp1,$h1,$in1
|
||||
daddu $h1,$tmp0
|
||||
|
||||
dmultu ($r0,$h0) # h0*r0
|
||||
daddu $h2,$padbit
|
||||
sltu $tmp0,$h1,$tmp0
|
||||
mflo ($d0,$r0,$h0)
|
||||
mfhi ($d1,$r0,$h0)
|
||||
|
||||
dmultu ($s1,$h1) # h1*5*r1
|
||||
daddu $tmp0,$tmp1
|
||||
daddu $h2,$tmp0
|
||||
mflo ($tmp0,$s1,$h1)
|
||||
mfhi ($tmp1,$s1,$h1)
|
||||
|
||||
dmultu ($r1,$h0) # h0*r1
|
||||
daddu $d0,$tmp0
|
||||
daddu $d1,$tmp1
|
||||
mflo ($tmp2,$r1,$h0)
|
||||
mfhi ($d2,$r1,$h0)
|
||||
sltu $tmp0,$d0,$tmp0
|
||||
daddu $d1,$tmp0
|
||||
|
||||
dmultu ($r0,$h1) # h1*r0
|
||||
daddu $d1,$tmp2
|
||||
sltu $tmp2,$d1,$tmp2
|
||||
mflo ($tmp0,$r0,$h1)
|
||||
mfhi ($tmp1,$r0,$h1)
|
||||
daddu $d2,$tmp2
|
||||
|
||||
dmultu ($s1,$h2) # h2*5*r1
|
||||
daddu $d1,$tmp0
|
||||
daddu $d2,$tmp1
|
||||
mflo ($tmp2,$s1,$h2)
|
||||
|
||||
dmultu ($r0,$h2) # h2*r0
|
||||
sltu $tmp0,$d1,$tmp0
|
||||
daddu $d2,$tmp0
|
||||
mflo ($tmp3,$r0,$h2)
|
||||
|
||||
daddu $d1,$tmp2
|
||||
daddu $d2,$tmp3
|
||||
sltu $tmp2,$d1,$tmp2
|
||||
daddu $d2,$tmp2
|
||||
|
||||
li $tmp0,-4 # final reduction
|
||||
and $tmp0,$d2
|
||||
dsrl $tmp1,$d2,2
|
||||
andi $h2,$d2,3
|
||||
daddu $tmp0,$tmp1
|
||||
daddu $h0,$d0,$tmp0
|
||||
sltu $tmp0,$h0,$tmp0
|
||||
daddu $h1,$d1,$tmp0
|
||||
sltu $tmp0,$h1,$tmp0
|
||||
daddu $h2,$h2,$tmp0
|
||||
|
||||
bnez $len,.Loop
|
||||
|
||||
sd $h0,0($ctx) # store hash value
|
||||
sd $h1,8($ctx)
|
||||
sd $h2,16($ctx)
|
||||
|
||||
.set noreorder
|
||||
ld $s5,40($sp) # epilogue
|
||||
ld $s4,32($sp)
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
|
||||
ld $s3,24($sp)
|
||||
ld $s2,16($sp)
|
||||
ld $s1,8($sp)
|
||||
ld $s0,0($sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
jr $ra
|
||||
daddu $sp,6*8
|
||||
.end poly1305_blocks_internal
|
||||
___
|
||||
}
|
||||
{
|
||||
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
|
||||
|
||||
$code.=<<___;
|
||||
.align 5
|
||||
.globl poly1305_emit
|
||||
.ent poly1305_emit
|
||||
poly1305_emit:
|
||||
.frame $sp,0,$ra
|
||||
.set reorder
|
||||
|
||||
ld $tmp0,0($ctx)
|
||||
ld $tmp1,8($ctx)
|
||||
ld $tmp2,16($ctx)
|
||||
|
||||
daddiu $in0,$tmp0,5 # compare to modulus
|
||||
sltiu $tmp3,$in0,5
|
||||
daddu $in1,$tmp1,$tmp3
|
||||
sltu $tmp3,$in1,$tmp3
|
||||
daddu $tmp2,$tmp2,$tmp3
|
||||
|
||||
dsrl $tmp2,2 # see if it carried/borrowed
|
||||
dsubu $tmp2,$zero,$tmp2
|
||||
nor $tmp3,$zero,$tmp2
|
||||
|
||||
and $in0,$tmp2
|
||||
and $tmp0,$tmp3
|
||||
and $in1,$tmp2
|
||||
and $tmp1,$tmp3
|
||||
or $in0,$tmp0
|
||||
or $in1,$tmp1
|
||||
|
||||
lwu $tmp0,0($nonce) # load nonce
|
||||
lwu $tmp1,4($nonce)
|
||||
lwu $tmp2,8($nonce)
|
||||
lwu $tmp3,12($nonce)
|
||||
dsll $tmp1,32
|
||||
dsll $tmp3,32
|
||||
or $tmp0,$tmp1
|
||||
or $tmp2,$tmp3
|
||||
|
||||
daddu $in0,$tmp0 # accumulate nonce
|
||||
daddu $in1,$tmp2
|
||||
sltu $tmp0,$in0,$tmp0
|
||||
daddu $in1,$tmp0
|
||||
|
||||
dsrl $tmp0,$in0,8 # write mac value
|
||||
dsrl $tmp1,$in0,16
|
||||
dsrl $tmp2,$in0,24
|
||||
sb $in0,0($mac)
|
||||
dsrl $tmp3,$in0,32
|
||||
sb $tmp0,1($mac)
|
||||
dsrl $tmp0,$in0,40
|
||||
sb $tmp1,2($mac)
|
||||
dsrl $tmp1,$in0,48
|
||||
sb $tmp2,3($mac)
|
||||
dsrl $tmp2,$in0,56
|
||||
sb $tmp3,4($mac)
|
||||
dsrl $tmp3,$in1,8
|
||||
sb $tmp0,5($mac)
|
||||
dsrl $tmp0,$in1,16
|
||||
sb $tmp1,6($mac)
|
||||
dsrl $tmp1,$in1,24
|
||||
sb $tmp2,7($mac)
|
||||
|
||||
sb $in1,8($mac)
|
||||
dsrl $tmp2,$in1,32
|
||||
sb $tmp3,9($mac)
|
||||
dsrl $tmp3,$in1,40
|
||||
sb $tmp0,10($mac)
|
||||
dsrl $tmp0,$in1,48
|
||||
sb $tmp1,11($mac)
|
||||
dsrl $tmp1,$in1,56
|
||||
sb $tmp2,12($mac)
|
||||
sb $tmp3,13($mac)
|
||||
sb $tmp0,14($mac)
|
||||
sb $tmp1,15($mac)
|
||||
|
||||
jr $ra
|
||||
.end poly1305_emit
|
||||
.rdata
|
||||
.align 2
|
||||
___
|
||||
}
|
||||
|
||||
open SELF,$0;
|
||||
while(<SELF>) {
|
||||
next if (/^#!/);
|
||||
last if (!s/^#/\/\// and !/^$/);
|
||||
print;
|
||||
}
|
||||
close SELF;
|
||||
|
||||
$output=pop and open STDOUT,">$output";
|
||||
print $code;
|
||||
close STDOUT;
|
||||
|
||||
@@ -23,7 +23,7 @@ NR_CPUS ?= 4
|
||||
MIRROR := https://download.wireguard.com/qemu-test/distfiles/
|
||||
|
||||
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
|
||||
WIREGUARD_SOURCES := ../../Kbuild ../../Kconfig $(filter-out ../../tools/% ../../tests/%,$(call rwildcard,../../,*.c *.h *.S *.include))
|
||||
WIREGUARD_SOURCES := ../../Kbuild ../../Kconfig $(filter-out ../../tools/% ../../tests/%,$(call rwildcard,../../,*.c *.h *.S *.pl *.include))
|
||||
TOOLS_SOURCES := $(wildcard ../../tools/*.c ../../tools/*.h ../../uapi/*.h ../../crypto/zinc/curve25519/curve25519-hacl64.c ../../crypto/zinc/curve25519/curve25519-fiat32.c)
|
||||
|
||||
default: qemu
|
||||
|
||||
Reference in New Issue
Block a user