// gmsm/sm4/ecb_arm64.s

//go:build !purego

#include "textflag.h"
// Working and temporary vector registers.
#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13
#define ZERO V16

// Constant vector registers used by the SM4 AES-NI macros.
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26

#include "aesni_macros_arm64.s"
// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
// General-purpose register aliases for the function arguments.
#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5

LOAD_SM4_AESNI_CONSTS()
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD xk+0(FP), rk
MOVD dst+8(FP), dstPtr
MOVD src+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD rk, rkSave
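
// Main loop: as long as at least 128 bytes remain, encrypt 8 blocks per
// iteration. rk is reset from rkSave at the start of every chunk.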
ecbSm4Octets:
CMP $128, srcPtrLen
BLT ecbSm4Nibbles
SUB $128, srcPtrLen
MOVD rkSave, rk
VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
VLD1.P 64(srcPtr), [t4.S4, t5.S4, t6.S4, t7.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
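
// The VREV32 above convert each 32-bit word to big-endian, the byte order
// SM4 is defined on. The transposes below rearrange each group of four
// blocks so that word i of every block lands in one vector register,
// letting the round macros process 8 blocks in parallel.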
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
EOR R0, R0
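
// 4 rounds per iteration; the loop runs until R0 reaches 128
// (8 iterations), covering the full 32 SM4 rounds.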
ecb8BlocksLoop:
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
ADD $16, R0
CMP $128, R0
BNE ecb8BlocksLoop
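
// Transpose back to the per-block layout, restore byte order and store
// the 8 ciphertext blocks.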
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(dstPtr)
B ecbSm4Octets
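
// Handle a remaining group of 4 blocks (64 bytes) with the 4-lane round macro.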
ecbSm4Nibbles:
CMP $64, srcPtrLen
BLT ecbSm4Single
SUB $64, srcPtrLen
MOVD rkSave, rk
VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
EOR R0, R0
ecb4BlocksLoop:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE ecb4BlocksLoop
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
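
// Up to 3 blocks (0, 16, 32 or 48 bytes) remain; return if none,
// otherwise dispatch on the exact length.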
ecbSm4Single:
CBZ srcPtrLen, ecbSm4Done
MOVD rkSave, rk
EOR R0, R0
CMP $32, srcPtrLen
BEQ ecbSm4Single32
CMP $48, srcPtrLen
BEQ ecbSm4Single48
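
// One block (16 bytes) left: place its four words in lane 0 of t0-t3,
// run the 32 rounds, then gather the result words back into t3 in
// reverse order (the final word reversal of SM4).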
ecbSm4Single16:
VLD1.P 16(srcPtr), [t0.S4]
VREV32 t0.B16, t0.B16
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
VMOV t0.S[3], t3.S[0]
encryptBlocksLoop1:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop1
VMOV t2.S[0], t3.S[1]
VMOV t1.S[0], t3.S[2]
VMOV t0.S[0], t3.S[3]
VREV32 t3.B16, t3.B16
VST1.P [t3.S4], 16(dstPtr)
B ecbSm4Done
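
// Two blocks (32 bytes) left.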
ecbSm4Single32:
VLD1.P 32(srcPtr), [t0.S4, t1.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
encryptBlocksLoop2:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop2
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VST1.P [t0.S4, t1.S4], 32(dstPtr)
B ecbSm4Done
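
// Three blocks (48 bytes) left.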
ecbSm4Single48:
VLD1.P 48(srcPtr), [t0.S4, t1.S4, t2.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
encryptBlocksLoop3:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop3
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VST1.P [t0.S4, t1.S4, t2.S4], 48(dstPtr)
ecbSm4Done:
RET