align format

emmansun 2022-05-01 18:06:47 +08:00
parent 488c0db854
commit 57d899613d
3 changed files with 358 additions and 342 deletions


@@ -66,179 +66,179 @@ DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
#define SM4_SBOX(x, y) \
; \ //############################# inner affine ############################//
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M1L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \
VEOR y.B16, XTMP7.B16, x.B16; \
VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M2L.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \
VEOR y.B16, XTMP7.B16, x.B16
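The SM4_SBOX macro evaluates the SM4 S-box on all 16 byte lanes at once by wrapping an AESE instruction (the AES S-box) between two nibble-wise affine transforms driven by the M1L/M1H and M2L/M2H tables; the INVERSE_SHIFT_ROWS shuffle cancels the ShiftRows step that AESE performs. As a scalar cross-check, a plain table-lookup version of the same byte substitution could look like the sketch below; the helper name tau is mine, and sbox is assumed to be the standard 256-entry SM4 S-box table from the package's generic Go path, not repeated here.

```go
// tau applies the SM4 S-box to each byte of a 32-bit word.
// sbox is assumed to be the standard 256-entry SM4 S-box table
// defined elsewhere in the package (pure-Go fallback).
func tau(a uint32) uint32 {
	return uint32(sbox[a>>24])<<24 |
		uint32(sbox[a>>16&0xff])<<16 |
		uint32(sbox[a>>8&0xff])<<8 |
		uint32(sbox[a&0xff])
}
```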
#define SM4_TAO_L1(x, y) \
SM4_SBOX(x, y); \
; \ //#################### 4 parallel L1 linear transforms ##################//
VTBL R08_MASK.B16, [x.B16], y.B16; \
VEOR y.B16, x.B16, y.B16; \
VTBL R16_MASK.B16, [x.B16], XTMP7.B16; \
VEOR XTMP7.B16, y.B16, y.B16; \
VSHL $2, y.S4, XTMP7.S4; \
VUSHR $30, y.S4, y.S4; \
VORR y.B16, XTMP7.B16, y.B16; \
VTBL R24_MASK.B16, [x.B16], XTMP7.B16; \
VEOR XTMP7.B16, x.B16, x.B16; \
VEOR y.B16, x.B16, x.B16
#define SM4_TAO_L2(x, y) \
SM4_SBOX(x, y); \
; \ //#################### 4 parallel L2 linear transforms ##################//
VSHL $13, x.S4, XTMP6.S4; \
VUSHR $19, x.S4, y.S4; \
VORR XTMP6.B16, y.B16, y.B16; \
VSHL $23, x.S4, XTMP6.S4; \
VUSHR $9, x.S4, XTMP7.S4; \
VORR XTMP6.B16, XTMP7.B16, XTMP7.B16; \
VEOR XTMP7.B16, y.B16, y.B16; \
VEOR x.B16, y.B16, x.B16
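SM4_TAO_L1 and SM4_TAO_L2 follow the S-box with SM4's two linear transforms, realized above with byte-rotation table lookups (R08/R16/R24_MASK) and shift/or pairs. The per-word math is the standard one; a minimal Go reference for cross-checking, with function names of my choosing:

```go
import "math/bits"

// l1 is the round-function linear transform:
// L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24).
func l1(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// l2 is the key-schedule linear transform:
// L'(B) = B ^ (B<<<13) ^ (B<<<23).
func l2(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
}
```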
#define SM4_ROUND(RK, x, y, t0, t1, t2, t3) \
MOVW.P 4(RK), R19; \
VMOV R19, x.S4; \
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y); \
VEOR x.B16, t0.B16, t0.B16
#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
MOVW.P 4(R9), R19; \
VMOV R19, x.S[0]; \
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L2(x, y); \
VEOR x.B16, t0.B16, t0.B16; \
VMOV t0.S[0], R2; \
MOVW.P R2, 4(R10); \
MOVW.P R2, -4(R11)
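SM4_ROUND and SM4_EXPANDKEY_ROUND fold one word of round key (or CK constant) into the state; the vector version runs four blocks in parallel, but per lane the arithmetic is the usual SM4 round. A scalar sketch in terms of the tau/l1/l2 helpers above (illustrative names, not the package's):

```go
// round computes the new t0 for one SM4 encryption round,
// mirroring SM4_ROUND: t0 ^= L1(tau(t1 ^ t2 ^ t3 ^ rk)).
func round(rk, t0, t1, t2, t3 uint32) uint32 {
	return t0 ^ l1(tau(t1^t2^t3^rk))
}

// expandRound derives one round key, mirroring SM4_EXPANDKEY_ROUND;
// the assembly also stores the result into enc (forward, via R10)
// and dec (reversed, via R11).
func expandRound(ck, k0, k1, k2, k3 uint32) uint32 {
	return k0 ^ l2(tau(k1^k2^k3^ck))
}
```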
#define load_global_data_1() \
LDP nibble_mask<>(SB), (R0, R1) \
VMOV R0, NIBBLE_MASK.D[0] \
VMOV R1, NIBBLE_MASK.D[1] \
LDP m1_low<>(SB), (R0, R1) \
VMOV R0, M1L.D[0] \
VMOV R1, M1L.D[1] \
LDP m1_high<>(SB), (R0, R1) \
VMOV R0, M1H.D[0] \
VMOV R1, M1H.D[1] \
LDP m2_low<>(SB), (R0, R1) \
VMOV R0, M2L.D[0] \
VMOV R1, M2L.D[1] \
LDP m2_high<>(SB), (R0, R1) \
VMOV R0, M2H.D[0] \
VMOV R1, M2H.D[1] \
LDP fk_mask<>(SB), (R0, R1) \
VMOV R0, FK_MASK.D[0] \
VMOV R1, FK_MASK.D[1] \
LDP inverse_shift_rows<>(SB), (R0, R1) \
VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
VMOV R1, INVERSE_SHIFT_ROWS.D[1]
#define load_global_data_2() \
load_global_data_1() \
LDP r08_mask<>(SB), (R0, R1) \
VMOV R0, R08_MASK.D[0] \
VMOV R1, R08_MASK.D[1] \
LDP r16_mask<>(SB), (R0, R1) \
VMOV R0, R16_MASK.D[0] \
VMOV R1, R16_MASK.D[1] \
LDP r24_mask<>(SB), (R0, R1) \
VMOV R0, R24_MASK.D[0] \
VMOV R1, R24_MASK.D[1]
#define SM4EKEY_EXPORT_KEYS() \
VMOV V9.S[3], V10.S[0] \
VMOV V9.S[2], V10.S[1] \
VMOV V9.S[1], V10.S[2] \
VMOV V9.S[0], V10.S[3] \
VMOV V8.S[3], V11.S[0] \
VMOV V8.S[2], V11.S[1] \
VMOV V8.S[1], V11.S[2] \
VMOV V8.S[0], V11.S[3] \
VST1.P [V8.S4, V9.S4], 32(R10) \
VST1 [V10.S4, V11.S4], (R11) \
SUB $32, R11, R11
#define SM4E_ROUND() \
VLD1.P 16(R10), [V8.B16] \
VREV32 V8.B16, V8.B16 \
WORD $0x0884c0ce \
WORD $0x2884c0ce \
WORD $0x4884c0ce \
WORD $0x6884c0ce \
WORD $0x8884c0ce \
WORD $0xa884c0ce \
WORD $0xc884c0ce \
WORD $0xe884c0ce \
VREV32 V8.B16, V8.B16 \
VST1.P [V8.B16], 16(R9)
// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVD key+0(FP), R8
MOVD ck+8(FP), R9
MOVD enc+16(FP), R10
MOVD dec+24(FP), R11
MOVD inst+32(FP), R12
CMP $1, R12
BEQ sm4ekey
load_global_data_1()
VLD1 (R8), [t0.B16]
VREV32 t0.B16, t0.B16
VEOR t0.B16, FK_MASK.B16, t0.B16
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
VMOV t0.S[3], t3.S[0]
EOR R0, R0
ADD $124, R11
VEOR ZERO.B16, ZERO.B16, ZERO.B16
ksLoop:
SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE ksLoop
RET
sm4ekey:
LDP fk_mask<>(SB), (R0, R1)
VMOV R0, FK_MASK.D[0]
VMOV R1, FK_MASK.D[1]
VLD1 (R8), [V9.B16]
VREV32 V9.B16, V9.B16
VEOR FK_MASK.B16, V9.B16, V9.B16
ADD $96, R11
VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
SM4EKEY_EXPORT_KEYS()
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
@@ -252,141 +252,141 @@ sm4ekey:
WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
SM4EKEY_EXPORT_KEYS()
RET
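expandKeyAsm takes the NEON path above when inst is 0 and branches to sm4ekey when inst is 1, where the SM4EKEY instructions are emitted as raw WORDs because the Go assembler has no mnemonic for them. A hedged sketch of the Go-side declaration and dispatch follows; it assumes the feature flag comes from golang.org/x/sys/cpu, and the wrapper name and ck placeholder are illustrative rather than the package's actual API.

```go
import "golang.org/x/sys/cpu"

//go:noescape
func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)

// ck stands in for the 32-word SM4 CK constant table the package
// defines elsewhere; the values are omitted in this sketch.
var ck [32]uint32

// expandKey is an illustrative wrapper: enc and dec receive the 32
// round keys (dec in reverse order, as the -4(R11) stores suggest).
func expandKey(key []byte, enc, dec *[32]uint32) {
	inst := 0
	if cpu.ARM64.HasSM4 { // hardware SM4EKEY path (sm4ekey label above)
		inst = 1
	}
	expandKeyAsm(&key[0], &ck[0], &enc[0], &dec[0], inst)
}
```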
// func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+32(FP), R10
MOVD src_len+40(FP), R12
MOVD inst+56(FP), R11
CMP $1, R11
BEQ sm4niblocks
VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
VMOV V5.S[0], t0.S[0]
VMOV V5.S[1], t1.S[0]
VMOV V5.S[2], t2.S[0]
VMOV V5.S[3], t3.S[0]
VMOV V6.S[0], t0.S[1]
VMOV V6.S[1], t1.S[1]
VMOV V6.S[2], t2.S[1]
VMOV V6.S[3], t3.S[1]
VMOV V7.S[0], t0.S[2]
VMOV V7.S[1], t1.S[2]
VMOV V7.S[2], t2.S[2]
VMOV V7.S[3], t3.S[2]
VMOV V8.S[0], t0.S[3]
VMOV V8.S[1], t1.S[3]
VMOV V8.S[2], t2.S[3]
VMOV V8.S[3], t3.S[3]
load_global_data_2()
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VEOR ZERO.B16, ZERO.B16, ZERO.B16
EOR R0, R0
encryptBlocksLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VMOV t3.S[0], V8.S[0]
VMOV t2.S[0], V8.S[1]
VMOV t1.S[0], V8.S[2]
VMOV t0.S[0], V8.S[3]
VST1.P [V8.B16], 16(R9)
VMOV t3.S[1], V8.S[0]
VMOV t2.S[1], V8.S[1]
VMOV t1.S[1], V8.S[2]
VMOV t0.S[1], V8.S[3]
VST1.P [V8.B16], 16(R9)
VMOV t3.S[2], V8.S[0]
VMOV t2.S[2], V8.S[1]
VMOV t1.S[2], V8.S[2]
VMOV t0.S[2], V8.S[3]
VST1.P [V8.B16], 16(R9)
VMOV t3.S[3], V8.S[0]
VMOV t2.S[3], V8.S[1]
VMOV t1.S[3], V8.S[2]
VMOV t0.S[3], V8.S[3]
VST1 [V8.B16], (R9)
RET
sm4niblocks:
VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
sm4niblockloop:
SM4E_ROUND()
SUB $16, R12, R12 // consumed 16 bytes of input; loop while any bytes remain
CBNZ R12, sm4niblockloop
RET
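Note that the two paths differ in granularity: the NEON path above loads exactly four 16-byte blocks per call (V5 through V8), while sm4niblocks walks src in 16-byte steps until src_len is exhausted. A hypothetical caller sketch, with names of my choosing:

```go
// encryptBlocks is a hypothetical wrapper, not the package's API: it
// assumes the caller has already grouped input into the unit the
// selected path expects (exactly 64 bytes for the NEON path, any
// multiple of 16 for the SM4E path).
func encryptBlocks(xk []uint32, dst, src []byte, inst int) {
	if len(src) == 0 || len(src)%16 != 0 || len(dst) < len(src) {
		panic("sm4: misaligned block buffer")
	}
	encryptBlocksAsm(&xk[0], dst, src, inst)
}
```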
// func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+16(FP), R10
MOVD inst+24(FP), R11
CMP $1, R11
BEQ sm4niblock
VLD1 (R10), [t0.S4]
VREV32 t0.B16, t0.B16
VMOV t0.S[1], t1.S[0]
VMOV t0.S[2], t2.S[0]
VMOV t0.S[3], t3.S[0]
load_global_data_2()
VEOR ZERO.B16, ZERO.B16, ZERO.B16
EOR R0, R0
encryptBlockLoop:
SM4_ROUND(R8, x, y, t0, t1, t2, t3)
SM4_ROUND(R8, x, y, t1, t2, t3, t0)
SM4_ROUND(R8, x, y, t2, t3, t0, t1)
SM4_ROUND(R8, x, y, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlockLoop
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VMOV t3.S[0], V8.S[0]
VMOV t2.S[0], V8.S[1]
VMOV t1.S[0], V8.S[2]
VMOV t0.S[0], V8.S[3]
VST1 [V8.B16], (R9)
RET
sm4niblock:
VLD1 (R10), [V8.B16]
@@ -403,4 +403,4 @@ sm4niblock:
WORD $0x6884c0ce //SM4E V8.4S, V3.4S
VREV32 V8.B16, V8.B16
VST1 [V8.B16], (R9)
RET


@@ -60,73 +60,73 @@
VEOR T3.B16, ACCM.B16, ACCM.B16
#define sm4eEnc1block() \
WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S
WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S
WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S
WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S
WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S
WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S
WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S
WORD $0x4087c0ce //SM4E V0.4S, V26.4S
#define sm4eEnc8blocks() \
sm4eEnc1block() \
WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S
WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S
WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S
WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S
WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S
WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S
WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S
WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S
WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S
WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S
WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S
WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S
WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S
WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S
WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S
WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S
WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S
WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S
WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S
WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S
WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S
WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S
WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S
WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S
WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S
WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S
WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S
WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S
WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S
WORD $0x0487c0ce \ //SM4E V4.4S, V24.4S
WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S
WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S
WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S
WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S
WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S
WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S
WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S
WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S
WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S
WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S
WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S
WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S
WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S
WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S
WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S
WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S
WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S
WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S
WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S
WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S
WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S
WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S
WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S
WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S
WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S
WORD $0x4787c0ce //SM4E V7.4S, V26.4S
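sm4eEnc1block and sm4eEnc8blocks encrypt counter blocks in place with the round keys preloaded in V19 through V26; gcmSm4niEnc then XORs that keystream into the plaintext and feeds the ciphertext to GHASH. A scalar sketch of the CTR half of one octetsLoop iteration is shown below; encrypt stands in for the sm4eEnc8blocks() macro, and the helper name is mine. GCM increments only the low 32 bits of the counter, big-endian.

```go
import "encoding/binary"

// ctrEncrypt8 sketches one octetsLoop iteration: generate eight
// consecutive counter blocks, encrypt each, and XOR the resulting
// keystream into src to produce dst (len(src) >= 128 assumed).
func ctrEncrypt8(encrypt func(dst, src *[16]byte), ctr *[16]byte, dst, src []byte) {
	for i := 0; i < 8; i++ {
		var ks [16]byte
		encrypt(&ks, ctr)
		n := binary.BigEndian.Uint32(ctr[12:])
		binary.BigEndian.PutUint32(ctr[12:], n+1)
		for j := 0; j < 16; j++ {
			dst[16*i+j] = src[16*i+j] ^ ks[j]
		}
	}
}
```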
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
@@ -193,15 +193,15 @@ octetsLoop:
VADD B6.S4, INC.S4, B7.S4
VADD B7.S4, INC.S4, CTR.S4
sm4eEnc8blocks()
VREV32 B0.B16, B0.B16
VREV32 B1.B16, B1.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
// XOR plaintext and store ciphertext
VLD1.P 32(srcPtr), [T1.B16, T2.B16]
@@ -254,14 +254,14 @@ singlesLoop:
BLT tail
SUB $16, srcPtrLen
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
singlesLast:
VLD1.P 16(srcPtr), [T0.B16]
VEOR T0.B16, B0.B16, B0.B16
encReduce:
VST1.P [B0.B16], 16(dstPtr)
@@ -315,9 +315,9 @@ ld1:
VMOV H1, T3.B[0]
ld0:
MOVD ZR, srcPtrLen
VMOV CTR.B16, B0.B16
sm4eEnc1block()
VREV32 B0.B16, B0.B16
tailLast:
VEOR T0.B16, B0.B16, B0.B16
@@ -326,7 +326,7 @@ tailLast:
done:
VST1 [ACC0.B16], (tPtr)
RET
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
@@ -381,15 +381,15 @@ octetsLoop:
VADD B6.S4, INC.S4, B7.S4
VADD B7.S4, INC.S4, CTR.S4
sm4eEnc8blocks()
VREV32 B0.B16, T1.B16
VREV32 B1.B16, T2.B16
VREV32 B2.B16, B2.B16
VREV32 B3.B16, B3.B16
VREV32 B4.B16, B4.B16
VREV32 B5.B16, B5.B16
VREV32 B6.B16, B6.B16
VREV32 B7.B16, B7.B16
VLD1.P 32(srcPtr), [B0.B16, B1.B16]
VEOR B0.B16, T1.B16, T1.B16
@@ -443,17 +443,17 @@ singlesLoop:
CMP $16, srcPtrLen
BLT tail
SUB $16, srcPtrLen
VLD1.P 16(srcPtr), [T0.B16]
VREV64 T0.B16, B5.B16
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
singlesLast:
VEOR T0.B16, B0.B16, B0.B16
VST1.P [B0.B16], 16(dstPtr)
VEOR ACC0.B16, B5.B16, B5.B16
@@ -467,13 +467,13 @@ singlesLast:
B singlesLoop
tail:
CBZ srcPtrLen, done
VMOV CTR.B16, B0.B16
VADD CTR.S4, INC.S4, CTR.S4
sm4eEnc1block()
VREV32 B0.B16, B0.B16
tailLast:
// Assuming it is safe to load past dstPtr due to the presence of the tag
// B5 holds the last ciphertext block
VLD1 (srcPtr), [B5.B16]
VEOR B5.B16, B0.B16, B0.B16
@@ -522,4 +522,4 @@ ld0:
done:
VST1 [ACC0.B16], (tPtr)
RET


@@ -64,6 +64,22 @@ var cbcSM4Tests = []struct {
0x62, 0xb5, 0xe7, 0x50, 0x44, 0xea, 0x24, 0xcc, 0x9b, 0x5e, 0x07, 0x48, 0x04, 0x89, 0xa2, 0x74,
},
},
{
"7 blocks",
[]byte("0123456789ABCDEF"),
[]byte("0123456789ABCDEF"),
[]byte("Hello World Hello World Hello World Hello World Hello World Hello World Hello World Hello World Hello World Hell"),
[]byte{
0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
0xe0, 0x02, 0xd6, 0xe4, 0xf5, 0x66, 0x87, 0xc4, 0xcc, 0x54, 0x1d, 0x1f, 0x1c, 0xc4, 0x2f, 0xe6,
0xe5, 0x1d, 0xea, 0x52, 0xb8, 0x0c, 0xc8, 0xbe, 0xae, 0xcc, 0x44, 0xa8, 0x51, 0x81, 0x08, 0x60,
0xb6, 0x09, 0x7b, 0xb8, 0x7e, 0xdb, 0x53, 0x4b, 0xea, 0x2a, 0xc6, 0xa1, 0xe5, 0xa0, 0x2a, 0xe9,
0x22, 0x65, 0x5b, 0xa3, 0xb9, 0xcc, 0x63, 0x92, 0x16, 0x0e, 0x2f, 0xf4, 0x3b, 0x93, 0x06, 0x82,
0xb3, 0x8c, 0x26, 0x2e, 0x06, 0x51, 0x34, 0x2c, 0xe4, 0x3d, 0xd0, 0xc7, 0x2b, 0x8f, 0x31, 0x15,
0xb7, 0x8f, 0xd0, 0x47, 0x45, 0x40, 0xec, 0x02, 0x1b, 0xef, 0xc1, 0xd2, 0xe5, 0xa2, 0x35, 0xd2,
},
},
{
"9 blocks",
[]byte("0123456789ABCDEF"),