gmsm/sm4/gcm_arm64.s

374 lines
8.8 KiB
ArmAsm
Raw Normal View History

#include "textflag.h"
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7
#define ACC0 V8
#define ACC1 V9
#define ACCM V10
#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14
#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18
#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31
#define reduce() \
VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
VEOR T0.B16, ACC1.B16, ACC1.B16 \
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
VEOR T0.B16, ACC0.B16, ACC0.B16 \
VPMULL POLY.D1, ACC0.D1, T0.Q1 \
VEOR T0.B16, ACC1.B16, ACC1.B16 \
VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
VEOR ACC1.B16, ACC0.B16, ACC0.B16 \
// func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4
MOVD $0xC2, R1
LSL $56, R1
MOVD $1, R0
VMOV R1, POLY.D[0]
VMOV R0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD productTable+0(FP), pTbl
MOVD tagMask+8(FP), tMsk
MOVD T+16(FP), tPtr
MOVD pLen+24(FP), plen
MOVD dLen+32(FP), dlen
VLD1 (tPtr), [ACC0.B16]
VLD1 (tMsk), [B1.B16]
LSL $3, plen
LSL $3, dlen
VMOV dlen, B0.D[0]
VMOV plen, B0.D[1]
ADD $14*16, pTbl
VLD1.P (pTbl), [T1.B16, T2.B16]
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
VREV64 ACC0.B16, ACC0.B16
VEOR B1.B16, ACC0.B16, ACC0.B16
VST1 [ACC0.B16], (tPtr)
RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen
2022-01-12 16:06:39 +08:00
#define SM4_SBOX(x, y, z, z1, z2) \
2022-01-13 08:40:32 +08:00
VMOVQ $0x0F0F0F0F0F0F0F0F, $0x0F0F0F0F0F0F0F0F, z1; \ // nibble mask
VAND x.B16, z1.B16, z2.B16; \
2022-01-13 10:51:02 +08:00
VMOVQ $0x9197E2E474720701, $0xC7C1B4B222245157, z; \
2022-01-13 08:40:32 +08:00
VTBL z2.B16, [z.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, z1.B16, z2.B16; \
2022-01-13 10:51:02 +08:00
VMOVQ $0xE240AB09EB49A200, $0xF052B91BF95BB012, z; \
2022-01-13 08:40:32 +08:00
VTBL z2.B16, [z.B16], z2.B16; \
VEOR y.B16, z2.B16, x.B16; \
2022-01-13 10:51:02 +08:00
VMOVQ $0x0B0E0104070A0D00, $0x0306090C0F020508, z; \
2022-01-13 08:40:32 +08:00
VTBL z.B16, [x.B16], x.B16; \
AESE ZERO.B16, x.B16; \
VAND x.B16, z1.B16, z2.B16; \
2022-01-13 10:51:02 +08:00
VMOVQ $0x5B67F2CEA19D0834, $0xEDD14478172BBE82, z; \
2022-01-13 08:40:32 +08:00
VTBL z2.B16, [z.B16], y.B16; \
VUSHR $4, x.D2, x.D2; \
VAND x.B16, z1.B16, z2.B16; \
2022-01-13 10:51:02 +08:00
VMOVQ $0xAE7201DD73AFDC00, $0x11CDBE62CC1063BF, z; \
2022-01-13 08:40:32 +08:00
VTBL z2.B16, [z.B16], z2.B16; \
2022-01-12 16:40:19 +08:00
VEOR y.B16, z2.B16, x.B16
2022-01-12 16:06:39 +08:00
#define SM4_TAO_L1(x, y, z, z1, z2) \
2022-01-13 08:40:32 +08:00
SM4_SBOX(x, y, z, z1, z2); \
2022-01-13 10:51:02 +08:00
VMOVQ $0x0605040702010003, $0x0E0D0C0F0A09080B, z; \
2022-01-13 08:40:32 +08:00
VTBL z.B16, [x.B16], y.B16; \
VEOR y.B16, x.B16, y.B16; \
2022-01-13 10:51:02 +08:00
VMOVQ $0x0504070601000302, $0x0D0C0F0E09080B0A, z; \
2022-01-13 08:40:32 +08:00
VTBL z.B16, [x.B16], z.B16; \
VEOR z.B16, y.B16, y.B16; \
VSHL $2, y.S4, z.S4; \
VUSHR $30, y.S4, y.S4; \
VORR y.B16, z.B16, y.B16; \
2022-01-13 10:51:02 +08:00
VMOVQ $0x0407060500030201, $0x0C0F0E0D080B0A09, z; \
2022-01-13 08:40:32 +08:00
VTBL z.B16, [x.B16], z.B16; \
VEOR z.B16, x.B16, x.B16; \
2022-01-12 16:40:19 +08:00
VEOR y.B16, x.B16, x.B16
2022-01-12 16:06:39 +08:00
2022-01-13 08:40:32 +08:00
#define SM4_ROUND(index, RK, x, y, z, z1, z2, t0, t1, t2, t3) \
MOVW (index*4)(RK), R19; \
2022-01-12 16:40:19 +08:00
VMOV R19, x.S4; \
VEOR t1.B16, x.B16, x.B16; \
VEOR t2.B16, x.B16, x.B16; \
VEOR t3.B16, x.B16, x.B16; \
SM4_TAO_L1(x, y, z, z1, z2); \
VEOR x.B16, t0.B16, t0.B16
2022-01-12 16:06:39 +08:00
2022-01-13 09:00:08 +08:00
// func precomputeTableAsm(productTable *[256]byte, src *[16]byte)
TEXT ·precomputeTableAsm(SB),NOSPLIT,$0
#define pTbl R0
2022-01-13 09:00:08 +08:00
#define SRC R1
2022-01-12 16:06:39 +08:00
#define I R2
MOVD productTable+0(FP), pTbl
2022-01-13 09:00:08 +08:00
MOVD src+8(FP), SRC
MOVD $0xC2, I
LSL $56, I
VMOV I, POLY.D[0]
MOVD $1, I
VMOV I, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
2022-01-13 09:00:08 +08:00
VLD1 (SRC), [B0.B16]
VREV64 B0.B16, B0.B16
// Multiply by 2 modulo P
VMOV B0.D[0], I
ASR $63, I
VMOV I, T1.D[0]
VMOV I, T1.D[1]
VAND POLY.B16, T1.B16, T1.B16
VUSHR $63, B0.D2, T2.D2
VEXT $8, ZERO.B16, T2.B16, T2.B16
VSHL $1, B0.D2, B0.D2
VEOR T1.B16, B0.B16, B0.B16
VEOR T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
// Karatsuba pre-computation
VEXT $8, B0.B16, B0.B16, B1.B16
VEOR B0.B16, B1.B16, B1.B16
ADD $14*16, pTbl
2022-01-02 17:11:42 +08:00
VST1 [B0.B16, B1.B16], (pTbl)
SUB $2*16, pTbl
VMOV B0.B16, B2.B16
VMOV B1.B16, B3.B16
MOVD $7, I
initLoop:
// Compute powers of H
SUBS $1, I
VPMULL B0.D1, B2.D1, T1.Q1
VPMULL2 B0.D2, B2.D2, T0.Q1
VPMULL B1.D1, B3.D1, T2.Q1
VEOR T0.B16, T2.B16, T2.B16
VEOR T1.B16, T2.B16, T2.B16
VEXT $8, ZERO.B16, T2.B16, T3.B16
VEXT $8, T2.B16, ZERO.B16, T2.B16
VEOR T2.B16, T0.B16, T0.B16
VEOR T3.B16, T1.B16, T1.B16
VPMULL POLY.D1, T0.D1, T2.Q1
VEXT $8, T0.B16, T0.B16, T0.B16
VEOR T2.B16, T0.B16, T0.B16
VPMULL POLY.D1, T0.D1, T2.Q1
VEXT $8, T0.B16, T0.B16, T0.B16
VEOR T2.B16, T0.B16, T0.B16
VEOR T1.B16, T0.B16, B2.B16
VMOV B2.B16, B3.B16
VEXT $8, B2.B16, B2.B16, B2.B16
VEOR B2.B16, B3.B16, B3.B16
2022-01-02 17:11:42 +08:00
VST1 [B2.B16, B3.B16], (pTbl)
SUB $2*16, pTbl
BNE initLoop
RET
#undef I
2022-01-13 09:00:08 +08:00
#undef SRC
#undef pTbl
// func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmSm4Data(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5
#define mulRound(X) \
VLD1.P 32(pTbl), [T1.B16, T2.B16] \
VREV64 X.B16, X.B16 \
VEXT $8, X.B16, X.B16, T0.B16 \
VEOR X.B16, T0.B16, T0.B16 \
VPMULL X.D1, T1.D1, T3.Q1 \
VEOR T3.B16, ACC1.B16, ACC1.B16 \
VPMULL2 X.D2, T1.D2, T3.Q1 \
VEOR T3.B16, ACC0.B16, ACC0.B16 \
VPMULL T0.D1, T2.D1, T3.Q1 \
VEOR T3.B16, ACCM.B16, ACCM.B16
MOVD productTable+0(FP), pTbl
MOVD data_base+8(FP), aut
MOVD data_len+16(FP), autLen
MOVD T+32(FP), tPtr
2022-01-02 21:24:51 +08:00
//VEOR ACC0.B16, ACC0.B16, ACC0.B16
VLD1 (tPtr), [ACC0.B16]
CBZ autLen, dataBail
MOVD $0xC2, H0
LSL $56, H0
VMOV H0, POLY.D[0]
MOVD $1, H0
VMOV H0, POLY.D[1]
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD pTbl, pTblSave
CMP $13, autLen
BEQ dataTLS
CMP $128, autLen
BLT startSinglesLoop
B octetsLoop
dataTLS:
ADD $14*16, pTbl
VLD1.P (pTbl), [T1.B16, T2.B16]
VEOR B0.B16, B0.B16, B0.B16
MOVD (aut), H0
VMOV H0, B0.D[0]
MOVW 8(aut), H0
VMOV H0, B0.S[2]
MOVB 12(aut), H0
VMOV H0, B0.B[12]
MOVD $0, autLen
B dataMul
octetsLoop:
CMP $128, autLen
BLT startSinglesLoop
SUB $128, autLen
VLD1.P 32(aut), [B0.B16, B1.B16]
VLD1.P 32(pTbl), [T1.B16, T2.B16]
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
mulRound(B1)
VLD1.P 32(aut), [B2.B16, B3.B16]
mulRound(B2)
mulRound(B3)
VLD1.P 32(aut), [B4.B16, B5.B16]
mulRound(B4)
mulRound(B5)
VLD1.P 32(aut), [B6.B16, B7.B16]
mulRound(B6)
mulRound(B7)
MOVD pTblSave, pTbl
reduce()
B octetsLoop
startSinglesLoop:
ADD $14*16, pTbl
VLD1.P (pTbl), [T1.B16, T2.B16]
singlesLoop:
CMP $16, autLen
BLT dataEnd
SUB $16, autLen
VLD1.P 16(aut), [B0.B16]
dataMul:
VREV64 B0.B16, B0.B16
VEOR ACC0.B16, B0.B16, B0.B16
VEXT $8, B0.B16, B0.B16, T0.B16
VEOR B0.B16, T0.B16, T0.B16
VPMULL B0.D1, T1.D1, ACC1.Q1
VPMULL2 B0.D2, T1.D2, ACC0.Q1
VPMULL T0.D1, T2.D1, ACCM.Q1
reduce()
B singlesLoop
dataEnd:
CBZ autLen, dataBail
VEOR B0.B16, B0.B16, B0.B16
ADD autLen, aut
dataLoadLoop:
MOVB.W -1(aut), H0
VEXT $15, B0.B16, ZERO.B16, B0.B16
VMOV H0, B0.B[0]
SUBS $1, autLen
BNE dataLoadLoop
B dataMul
dataBail:
VST1 [ACC0.B16], (tPtr)
RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave