emmansun/gmsm (mirror of https://github.com/emmansun/gmsm.git)

Commit 5744b64c56 "sm4ni gcm" (parent 0450200249)
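The commit adds an SM4E-based (Arm SM4 cryptographic extension) GCM implementation for arm64 and selects it at run time through a supportSM4 flag. How that flag is populated is outside this diff; a minimal sketch of one way to derive it, assuming golang.org/x/sys/cpu rather than whatever detection the package actually uses:

    package main

    import (
        "fmt"

        "golang.org/x/sys/cpu"
    )

    // supportSM4 would gate the SM4E fast path; HasSM4 reflects the SM4 field
    // of ID_AA64ISAR0_EL1 on arm64 (always false on other architectures).
    var supportSM4 = cpu.ARM64.HasSM4

    func main() {
        fmt.Println("SM4 instructions available:", supportSM4)
    }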
@@ -252,7 +252,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    SM4_TAO_L1(x, y, z); \
    VEOR x.B16, t0.B16, t0.B16

-// func gcmSm4Init(productTable *[256]byte, rk []uint32)
+// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define pTbl R0
#define RK R1
@@ -260,6 +260,7 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0

    MOVD productTable+0(FP), pTbl
    MOVD rk+8(FP), RK
    MOVD inst+16(FP), R5

    MOVD $0xC2, I
    LSL $56, I
@@ -269,6 +270,9 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    // Encrypt block 0 with the SM4 keys to generate the hash key H
    CMP $1, R5
    BEQ sm4InitSM4E

    LOAD_SM4_AESNI_CONSTS()
    VEOR B0.B16, B0.B16, B0.B16
    VEOR B1.B16, B1.B16, B1.B16
@@ -290,7 +294,22 @@ sm4InitEncLoop:
    VMOV B1.S[0], B0.S[3]
    VMOV B2.S[0], B0.S[0]
    VMOV B3.S[0], B0.S[1]

    B sm4InitEncDone
sm4InitSM4E:
    VEOR B0.B16, B0.B16, B0.B16
    VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
    WORD $0x6085c0ce //SM4E V0.4S, V11.4S
    WORD $0x8085c0ce //SM4E V0.4S, V12.4S
    WORD $0xa085c0ce //SM4E V0.4S, V13.4S
    WORD $0xc085c0ce //SM4E V0.4S, V14.4S
    VLD1.P 64(RK), [T0.S4, T1.S4, T2.S4, T3.S4]
    WORD $0x6085c0ce //SM4E V0.4S, V11.4S
    WORD $0x8085c0ce //SM4E V0.4S, V12.4S
    WORD $0xa085c0ce //SM4E V0.4S, V13.4S
    WORD $0xc085c0ce //SM4E V0.4S, V14.4S
    VREV32 B0.B16, B0.B16
    VREV64 B0.B16, B0.B16
sm4InitEncDone:
    // Multiply by 2 modulo P
    VMOV B0.D[0], I
    ASR $63, I
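The Go assembler has no SM4E mnemonic, so the instruction is emitted with WORD. The constants line up with the A64 SM4E encoding (base 0xcec08400, Rn in bits 9:5, Rd in bits 4:0) with the four bytes reversed; a small sketch (helper name is mine) that reproduces the constant used for SM4E V0.4S, V11.4S above:

    package main

    import (
        "fmt"
        "math/bits"
    )

    // sm4eEncoding returns the architectural encoding of SM4E V<d>.4S, V<n>.4S.
    func sm4eEncoding(d, n uint32) uint32 {
        return 0xcec08400 | n<<5 | d
    }

    func main() {
        enc := sm4eEncoding(0, 11) // SM4E V0.4S, V11.4S
        // prints cec08560 6085c0ce; the second value is the constant used above
        fmt.Printf("%08x %08x\n", enc, bits.ReverseBytes32(enc))
    }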
@@ -547,6 +566,7 @@ TEXT ·gcmSm4Enc(SB),NOSPLIT,$0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4

    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

@@ -587,7 +607,7 @@ encOctetsEnc4Blocks1:
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
-   // encryption first 4 blocks
+   // encryption second 4 blocks
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
    MOVD rkSave, rk

@@ -880,7 +900,7 @@ decOctetsEnc4Blocks1:
    VREV32 B3.B16, B3.B16
    TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)

-   // encryption first 4 blocks
+   // encryption second 4 blocks
    PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
    MOVD rkSave, rk

sm4/gcm_sm4ni_arm64.s (new file, 610 lines)
@@ -0,0 +1,610 @@
#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26

#define reduce() \
    VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
    VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
    VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
    VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
    VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
    VEOR T0.B16, ACC1.B16, ACC1.B16 \
    VPMULL POLY.D1, ACC0.D1, T0.Q1 \
    VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
    VEOR T0.B16, ACC0.B16, ACC0.B16 \
    VPMULL POLY.D1, ACC0.D1, T0.Q1 \
    VEOR T0.B16, ACC1.B16, ACC1.B16 \
    VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
    VEOR ACC1.B16, ACC0.B16, ACC0.B16 \

#define mulRound(X) \
    VLD1.P 32(pTbl), [T1.B16, T2.B16] \
    VREV64 X.B16, X.B16 \
    VEXT $8, X.B16, X.B16, T0.B16 \
    VEOR X.B16, T0.B16, T0.B16 \
    VPMULL X.D1, T1.D1, T3.Q1 \
    VEOR T3.B16, ACC1.B16, ACC1.B16 \
    VPMULL2 X.D2, T1.D2, T3.Q1 \
    VEOR T3.B16, ACC0.B16, ACC0.B16 \
    VPMULL T0.D1, T2.D1, T3.Q1 \
    VEOR T3.B16, ACCM.B16, ACCM.B16

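mulRound() accumulates one block into ACC0/ACC1/ACCM with the Karatsuba trick: three 64x64 carry-less VPMULL products instead of four, and reduce() later folds the middle term and reduces modulo the GHASH polynomial. The sketch below (all names are mine) checks only the three-product combination against a schoolbook carry-less product; it does not model the final POLY reduction:

    package main

    import (
        "fmt"
        "math/rand"
    )

    // clmul64 is a bit-by-bit carry-less (GF(2)[x]) 64x64 -> 128 multiply.
    func clmul64(a, b uint64) (hi, lo uint64) {
        for i := uint(0); i < 64; i++ {
            if b&(1<<i) != 0 {
                lo ^= a << i
                if i != 0 {
                    hi ^= a >> (64 - i)
                }
            }
        }
        return
    }

    // karatsuba128 combines three 64-bit products the way mulRound/reduce do:
    // ACC1 ~ x0*h0, ACC0 ~ x1*h1, ACCM ~ (x0^x1)*(h0^h1); the middle term is
    // then folded with ACC0 and ACC1 and split across the 256-bit result.
    func karatsuba128(x1, x0, h1, h0 uint64) [4]uint64 {
        loHi, loLo := clmul64(x0, h0)
        hiHi, hiLo := clmul64(x1, h1)
        midHi, midLo := clmul64(x0^x1, h0^h1)
        midHi ^= loHi ^ hiHi
        midLo ^= loLo ^ hiLo
        return [4]uint64{loLo, loHi ^ midLo, hiLo ^ midHi, hiHi}
    }

    // schoolbook128 is the reference: four products, no Karatsuba.
    func schoolbook128(x1, x0, h1, h0 uint64) [4]uint64 {
        var w [4]uint64
        add := func(pos int, a, b uint64) {
            hi, lo := clmul64(a, b)
            w[pos] ^= lo
            w[pos+1] ^= hi
        }
        add(0, x0, h0)
        add(1, x0, h1)
        add(1, x1, h0)
        add(2, x1, h1)
        return w
    }

    func main() {
        for i := 0; i < 1000; i++ {
            x1, x0 := rand.Uint64(), rand.Uint64()
            h1, h0 := rand.Uint64(), rand.Uint64()
            if karatsuba128(x1, x0, h1, h0) != schoolbook128(x1, x0, h1, h0) {
                panic("mismatch")
            }
        }
        fmt.Println("Karatsuba combination matches schoolbook product")
    }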
// func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define rk R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define H0 R9
#define H1 R10
#define pTblSave R11
    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4

    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    MOVD rk, H0
    // SM4 round keys are stored in K0 .. K7
    VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
    VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

    BLT startSingles
octetsLoop:
    SUB $128, srcPtrLen
    // Prepare 8 counters
    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, B4.S4
    VADD B4.S4, INC.S4, B5.S4
    VADD B5.S4, INC.S4, B6.S4
    VADD B6.S4, INC.S4, B7.S4
    VADD B7.S4, INC.S4, CTR.S4

    WORD $0x6086c0ce //SM4E V0.4S, V19.4S
    WORD $0x8086c0ce //SM4E V0.4S, V20.4S
    WORD $0xa086c0ce //SM4E V0.4S, V21.4S
    WORD $0xc086c0ce //SM4E V0.4S, V22.4S
    WORD $0xe086c0ce //SM4E V0.4S, V23.4S
    WORD $0x0087c0ce //SM4E V0.4S, V24.4S
    WORD $0x2087c0ce //SM4E V0.4S, V25.4S
    WORD $0x4087c0ce //SM4E V0.4S, V26.4S
    WORD $0x6186c0ce //SM4E V1.4S, V19.4S
    WORD $0x8186c0ce //SM4E V1.4S, V20.4S
    WORD $0xa186c0ce //SM4E V1.4S, V21.4S
    WORD $0xc186c0ce //SM4E V1.4S, V22.4S
    WORD $0xe186c0ce //SM4E V1.4S, V23.4S
    WORD $0x0187c0ce //SM4E V1.4S, V24.4S
    WORD $0x2187c0ce //SM4E V1.4S, V25.4S
    WORD $0x4187c0ce //SM4E V1.4S, V26.4S
    WORD $0x6286c0ce //SM4E V2.4S, V19.4S
    WORD $0x8286c0ce //SM4E V2.4S, V20.4S
    WORD $0xa286c0ce //SM4E V2.4S, V21.4S
    WORD $0xc286c0ce //SM4E V2.4S, V22.4S
    WORD $0xe286c0ce //SM4E V2.4S, V23.4S
    WORD $0x0287c0ce //SM4E V2.4S, V24.4S
    WORD $0x2287c0ce //SM4E V2.4S, V25.4S
    WORD $0x4287c0ce //SM4E V2.4S, V26.4S
    WORD $0x6386c0ce //SM4E V3.4S, V19.4S
    WORD $0x8386c0ce //SM4E V3.4S, V20.4S
    WORD $0xa386c0ce //SM4E V3.4S, V21.4S
    WORD $0xc386c0ce //SM4E V3.4S, V22.4S
    WORD $0xe386c0ce //SM4E V3.4S, V23.4S
    WORD $0x0387c0ce //SM4E V3.4S, V24.4S
    WORD $0x2387c0ce //SM4E V3.4S, V25.4S
    WORD $0x4387c0ce //SM4E V3.4S, V26.4S
    WORD $0x6486c0ce //SM4E V4.4S, V19.4S
    WORD $0x8486c0ce //SM4E V4.4S, V20.4S
    WORD $0xa486c0ce //SM4E V4.4S, V21.4S
    WORD $0xc486c0ce //SM4E V4.4S, V22.4S
    WORD $0xe486c0ce //SM4E V4.4S, V23.4S
    WORD $0x0487c0ce //SM4E V4.4S, V24.4S
    WORD $0x2487c0ce //SM4E V4.4S, V25.4S
    WORD $0x4487c0ce //SM4E V4.4S, V26.4S
    WORD $0x6586c0ce //SM4E V5.4S, V19.4S
    WORD $0x8586c0ce //SM4E V5.4S, V20.4S
    WORD $0xa586c0ce //SM4E V5.4S, V21.4S
    WORD $0xc586c0ce //SM4E V5.4S, V22.4S
    WORD $0xe586c0ce //SM4E V5.4S, V23.4S
    WORD $0x0587c0ce //SM4E V5.4S, V24.4S
    WORD $0x2587c0ce //SM4E V5.4S, V25.4S
    WORD $0x4587c0ce //SM4E V5.4S, V26.4S
    WORD $0x6686c0ce //SM4E V6.4S, V19.4S
    WORD $0x8686c0ce //SM4E V6.4S, V20.4S
    WORD $0xa686c0ce //SM4E V6.4S, V21.4S
    WORD $0xc686c0ce //SM4E V6.4S, V22.4S
    WORD $0xe686c0ce //SM4E V6.4S, V23.4S
    WORD $0x0687c0ce //SM4E V6.4S, V24.4S
    WORD $0x2687c0ce //SM4E V6.4S, V25.4S
    WORD $0x4687c0ce //SM4E V6.4S, V26.4S
    WORD $0x6786c0ce //SM4E V7.4S, V19.4S
    WORD $0x8786c0ce //SM4E V7.4S, V20.4S
    WORD $0xa786c0ce //SM4E V7.4S, V21.4S
    WORD $0xc786c0ce //SM4E V7.4S, V22.4S
    WORD $0xe786c0ce //SM4E V7.4S, V23.4S
    WORD $0x0787c0ce //SM4E V7.4S, V24.4S
    WORD $0x2787c0ce //SM4E V7.4S, V25.4S
    WORD $0x4787c0ce //SM4E V7.4S, V26.4S
    VREV32 B0.B16, B0.B16
    VREV32 B1.B16, B1.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16

    // XOR plaintext and store ciphertext
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B0.B16, T1.B16, B0.B16
    VEOR B1.B16, T2.B16, B1.B16
    VST1.P [B0.B16, B1.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B2.B16, T1.B16, B2.B16
    VEOR B3.B16, T2.B16, B3.B16
    VST1.P [B2.B16, B3.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B4.B16, T1.B16, B4.B16
    VEOR B5.B16, T2.B16, B5.B16
    VST1.P [B4.B16, B5.B16], 32(dstPtr)
    VLD1.P 32(srcPtr), [T1.B16, T2.B16]
    VEOR B6.B16, T1.B16, B6.B16
    VEOR B7.B16, T2.B16, B7.B16
    VST1.P [B6.B16, B7.B16], 32(dstPtr)

    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    mulRound(B1)
    mulRound(B2)
    mulRound(B3)
    mulRound(B4)
    mulRound(B5)
    mulRound(B6)
    mulRound(B7)
    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE octetsLoop

startSingles:
    CBZ srcPtrLen, done
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]

singlesLoop:
    CMP $16, srcPtrLen
    BLT tail
    SUB $16, srcPtrLen

    VMOV CTR.B16, B0.B16
    VADD CTR.S4, INC.S4, CTR.S4
    // SM4 8 rounds
    WORD $0x6086c0ce //SM4E V0.4S, V19.4S
    WORD $0x8086c0ce //SM4E V0.4S, V20.4S
    WORD $0xa086c0ce //SM4E V0.4S, V21.4S
    WORD $0xc086c0ce //SM4E V0.4S, V22.4S
    WORD $0xe086c0ce //SM4E V0.4S, V23.4S
    WORD $0x0087c0ce //SM4E V0.4S, V24.4S
    WORD $0x2087c0ce //SM4E V0.4S, V25.4S
    WORD $0x4087c0ce //SM4E V0.4S, V26.4S
    VREV32 B0.B16, B0.B16

singlesLast:
    VLD1.P 16(srcPtr), [T0.B16]
    VEOR T0.B16, B0.B16, B0.B16

encReduce:
    VST1.P [B0.B16], 16(dstPtr)

    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16

    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1

    reduce()

    B singlesLoop
tail:
    CBZ srcPtrLen, done

    VEOR T0.B16, T0.B16, T0.B16
    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1
    ADD srcPtrLen, srcPtr

    TBZ $3, srcPtrLen, ld4
    MOVD.W -8(srcPtr), H0
    VMOV H0, T0.D[0]
    VMOV H1, T3.D[0]

ld4:
    TBZ $2, srcPtrLen, ld2
    MOVW.W -4(srcPtr), H0
    VEXT $12, T0.B16, ZERO.B16, T0.B16
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.S[0]
    VMOV H1, T3.S[0]
ld2:
    TBZ $1, srcPtrLen, ld1
    MOVH.W -2(srcPtr), H0
    VEXT $14, T0.B16, ZERO.B16, T0.B16
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.H[0]
    VMOV H1, T3.H[0]
ld1:
    TBZ $0, srcPtrLen, ld0
    MOVB.W -1(srcPtr), H0
    VEXT $15, T0.B16, ZERO.B16, T0.B16
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H0, T0.B[0]
    VMOV H1, T3.B[0]
ld0:
    MOVD ZR, srcPtrLen
    VMOV CTR.B16, B0.B16
    // SM4 8 rounds
    WORD $0x6086c0ce //SM4E V0.4S, V19.4S
    WORD $0x8086c0ce //SM4E V0.4S, V20.4S
    WORD $0xa086c0ce //SM4E V0.4S, V21.4S
    WORD $0xc086c0ce //SM4E V0.4S, V22.4S
    WORD $0xe086c0ce //SM4E V0.4S, V23.4S
    WORD $0x0087c0ce //SM4E V0.4S, V24.4S
    WORD $0x2087c0ce //SM4E V0.4S, V25.4S
    WORD $0x4087c0ce //SM4E V0.4S, V26.4S
    VREV32 B0.B16, B0.B16

tailLast:
    VEOR T0.B16, B0.B16, B0.B16
    VAND T3.B16, B0.B16, B0.B16
    B encReduce

done:
    VST1 [ACC0.B16], (tPtr)
    RET

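Counter handling in both functions is the standard GCM increment: the counter block is kept 32-bit byte-swapped in CTR (VREV32) so that VADD with INC, which holds 1 in lane S[3], bumps the last word as a big-endian integer. A minimal Go model of that step (helper name mine):

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    // gcmInc32 increments the final 32-bit word of the counter block,
    // big-endian, which is what VREV32 plus VADD with INC.S[3]=1 achieve.
    func gcmInc32(ctr *[16]byte) {
        n := binary.BigEndian.Uint32(ctr[12:])
        binary.BigEndian.PutUint32(ctr[12:], n+1)
    }

    func main() {
        var ctr [16]byte
        ctr[15] = 0xff
        gcmInc32(&ctr)
        fmt.Printf("%x\n", ctr[12:]) // 00000100
    }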
// func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
    MOVD productTable+0(FP), pTbl
    MOVD dst+8(FP), dstPtr
    MOVD src_base+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD ctr+56(FP), ctrPtr
    MOVD T+64(FP), tPtr
    MOVD rk_base+72(FP), rk

    MOVD $0xC2, H1
    LSL $56, H1
    MOVD $1, H0
    VMOV H1, POLY.D[0]
    VMOV H0, POLY.D[1]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD pTbl, pTblSave
    MOVD rk, rkSave
    // Current tag, after AAD
    VLD1 (tPtr), [ACC0.B16]
    VEOR ACC1.B16, ACC1.B16, ACC1.B16
    VEOR ACCM.B16, ACCM.B16, ACCM.B16
    // Prepare initial counter, and the increment vector
    VLD1 (ctrPtr), [CTR.B16]
    VEOR INC.B16, INC.B16, INC.B16
    MOVD $1, H0
    VMOV H0, INC.S[3]
    VREV32 CTR.B16, CTR.B16
    VADD CTR.S4, INC.S4, CTR.S4

    // Skip to <8 blocks loop
    CMP $128, srcPtrLen

    MOVD rk, H0
    // SM4 round keys are stored in K0 .. K7
    VLD1.P 64(H0), [K0.S4, K1.S4, K2.S4, K3.S4]
    VLD1.P 64(H0), [K4.S4, K5.S4, K6.S4, K7.S4]

    BLT startSingles
octetsLoop:
    SUB $128, srcPtrLen

    VMOV CTR.B16, B0.B16
    VADD B0.S4, INC.S4, B1.S4
    VADD B1.S4, INC.S4, B2.S4
    VADD B2.S4, INC.S4, B3.S4
    VADD B3.S4, INC.S4, B4.S4
    VADD B4.S4, INC.S4, B5.S4
    VADD B5.S4, INC.S4, B6.S4
    VADD B6.S4, INC.S4, B7.S4
    VADD B7.S4, INC.S4, CTR.S4

    WORD $0x6086c0ce //SM4E V0.4S, V19.4S
    WORD $0x8086c0ce //SM4E V0.4S, V20.4S
    WORD $0xa086c0ce //SM4E V0.4S, V21.4S
    WORD $0xc086c0ce //SM4E V0.4S, V22.4S
    WORD $0xe086c0ce //SM4E V0.4S, V23.4S
    WORD $0x0087c0ce //SM4E V0.4S, V24.4S
    WORD $0x2087c0ce //SM4E V0.4S, V25.4S
    WORD $0x4087c0ce //SM4E V0.4S, V26.4S
    WORD $0x6186c0ce //SM4E V1.4S, V19.4S
    WORD $0x8186c0ce //SM4E V1.4S, V20.4S
    WORD $0xa186c0ce //SM4E V1.4S, V21.4S
    WORD $0xc186c0ce //SM4E V1.4S, V22.4S
    WORD $0xe186c0ce //SM4E V1.4S, V23.4S
    WORD $0x0187c0ce //SM4E V1.4S, V24.4S
    WORD $0x2187c0ce //SM4E V1.4S, V25.4S
    WORD $0x4187c0ce //SM4E V1.4S, V26.4S
    WORD $0x6286c0ce //SM4E V2.4S, V19.4S
    WORD $0x8286c0ce //SM4E V2.4S, V20.4S
    WORD $0xa286c0ce //SM4E V2.4S, V21.4S
    WORD $0xc286c0ce //SM4E V2.4S, V22.4S
    WORD $0xe286c0ce //SM4E V2.4S, V23.4S
    WORD $0x0287c0ce //SM4E V2.4S, V24.4S
    WORD $0x2287c0ce //SM4E V2.4S, V25.4S
    WORD $0x4287c0ce //SM4E V2.4S, V26.4S
    WORD $0x6386c0ce //SM4E V3.4S, V19.4S
    WORD $0x8386c0ce //SM4E V3.4S, V20.4S
    WORD $0xa386c0ce //SM4E V3.4S, V21.4S
    WORD $0xc386c0ce //SM4E V3.4S, V22.4S
    WORD $0xe386c0ce //SM4E V3.4S, V23.4S
    WORD $0x0387c0ce //SM4E V3.4S, V24.4S
    WORD $0x2387c0ce //SM4E V3.4S, V25.4S
    WORD $0x4387c0ce //SM4E V3.4S, V26.4S
    WORD $0x6486c0ce //SM4E V4.4S, V19.4S
    WORD $0x8486c0ce //SM4E V4.4S, V20.4S
    WORD $0xa486c0ce //SM4E V4.4S, V21.4S
    WORD $0xc486c0ce //SM4E V4.4S, V22.4S
    WORD $0xe486c0ce //SM4E V4.4S, V23.4S
    WORD $0x0487c0ce //SM4E V4.4S, V24.4S
    WORD $0x2487c0ce //SM4E V4.4S, V25.4S
    WORD $0x4487c0ce //SM4E V4.4S, V26.4S
    WORD $0x6586c0ce //SM4E V5.4S, V19.4S
    WORD $0x8586c0ce //SM4E V5.4S, V20.4S
    WORD $0xa586c0ce //SM4E V5.4S, V21.4S
    WORD $0xc586c0ce //SM4E V5.4S, V22.4S
    WORD $0xe586c0ce //SM4E V5.4S, V23.4S
    WORD $0x0587c0ce //SM4E V5.4S, V24.4S
    WORD $0x2587c0ce //SM4E V5.4S, V25.4S
    WORD $0x4587c0ce //SM4E V5.4S, V26.4S
    WORD $0x6686c0ce //SM4E V6.4S, V19.4S
    WORD $0x8686c0ce //SM4E V6.4S, V20.4S
    WORD $0xa686c0ce //SM4E V6.4S, V21.4S
    WORD $0xc686c0ce //SM4E V6.4S, V22.4S
    WORD $0xe686c0ce //SM4E V6.4S, V23.4S
    WORD $0x0687c0ce //SM4E V6.4S, V24.4S
    WORD $0x2687c0ce //SM4E V6.4S, V25.4S
    WORD $0x4687c0ce //SM4E V6.4S, V26.4S
    WORD $0x6786c0ce //SM4E V7.4S, V19.4S
    WORD $0x8786c0ce //SM4E V7.4S, V20.4S
    WORD $0xa786c0ce //SM4E V7.4S, V21.4S
    WORD $0xc786c0ce //SM4E V7.4S, V22.4S
    WORD $0xe786c0ce //SM4E V7.4S, V23.4S
    WORD $0x0787c0ce //SM4E V7.4S, V24.4S
    WORD $0x2787c0ce //SM4E V7.4S, V25.4S
    WORD $0x4787c0ce //SM4E V7.4S, V26.4S
    VREV32 B0.B16, T1.B16
    VREV32 B1.B16, T2.B16
    VREV32 B2.B16, B2.B16
    VREV32 B3.B16, B3.B16
    VREV32 B4.B16, B4.B16
    VREV32 B5.B16, B5.B16
    VREV32 B6.B16, B6.B16
    VREV32 B7.B16, B7.B16

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B0.B16, T1.B16, T1.B16
    VEOR B1.B16, T2.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)

    VLD1.P 32(pTbl), [T1.B16, T2.B16]
    VREV64 B0.B16, B0.B16
    VEOR ACC0.B16, B0.B16, B0.B16
    VEXT $8, B0.B16, B0.B16, T0.B16
    VEOR B0.B16, T0.B16, T0.B16
    VPMULL B0.D1, T1.D1, ACC1.Q1
    VPMULL2 B0.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B2.B16, B0.B16, T1.B16
    VEOR B3.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B4.B16, B0.B16, T1.B16
    VEOR B5.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    VLD1.P 32(srcPtr), [B0.B16, B1.B16]
    VEOR B6.B16, B0.B16, T1.B16
    VEOR B7.B16, B1.B16, T2.B16
    VST1.P [T1.B16, T2.B16], 32(dstPtr)
    mulRound(B0)
    mulRound(B1)

    MOVD pTblSave, pTbl
    reduce()

    CMP $128, srcPtrLen
    BGE octetsLoop

startSingles:
    CBZ srcPtrLen, done
    ADD $14*16, pTbl
    // Preload H and its Karatsuba precomp
    VLD1.P (pTbl), [T1.B16, T2.B16]

singlesLoop:
    CMP $16, srcPtrLen
    BLT tail
    SUB $16, srcPtrLen

    VMOV CTR.B16, B0.B16
    VADD CTR.S4, INC.S4, CTR.S4
    // SM4 8 rounds
    WORD $0x6086c0ce //SM4E V0.4S, V19.4S
    WORD $0x8086c0ce //SM4E V0.4S, V20.4S
    WORD $0xa086c0ce //SM4E V0.4S, V21.4S
    WORD $0xc086c0ce //SM4E V0.4S, V22.4S
    WORD $0xe086c0ce //SM4E V0.4S, V23.4S
    WORD $0x0087c0ce //SM4E V0.4S, V24.4S
    WORD $0x2087c0ce //SM4E V0.4S, V25.4S
    WORD $0x4087c0ce //SM4E V0.4S, V26.4S
    VREV32 B0.B16, B0.B16

singlesLast:
    VLD1.P 16(srcPtr), [T0.B16]
    VEOR T0.B16, B0.B16, B0.B16
    VST1.P [B0.B16], 16(dstPtr)

    VEOR ACC0.B16, B5.B16, B5.B16
    VEXT $8, B5.B16, B5.B16, T0.B16
    VEOR B5.B16, T0.B16, T0.B16
    VPMULL B5.D1, T1.D1, ACC1.Q1
    VPMULL2 B5.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    reduce()

    B singlesLoop
tail:
    CBZ srcPtrLen, done
    VMOV CTR.B16, B0.B16
    VADD CTR.S4, INC.S4, CTR.S4
    // SM4 8 rounds
    WORD $0x6086c0ce //SM4E V0.4S, V19.4S
    WORD $0x8086c0ce //SM4E V0.4S, V20.4S
    WORD $0xa086c0ce //SM4E V0.4S, V21.4S
    WORD $0xc086c0ce //SM4E V0.4S, V22.4S
    WORD $0xe086c0ce //SM4E V0.4S, V23.4S
    WORD $0x0087c0ce //SM4E V0.4S, V24.4S
    WORD $0x2087c0ce //SM4E V0.4S, V25.4S
    WORD $0x4087c0ce //SM4E V0.4S, V26.4S
    VREV32 B0.B16, B0.B16
tailLast:
    // Assuming it is safe to load past dstPtr due to the presence of the tag
    VLD1 (srcPtr), [B5.B16]

    VEOR B5.B16, B0.B16, B0.B16

    VEOR T3.B16, T3.B16, T3.B16
    MOVD $0, H1
    SUB $1, H1

    TBZ $3, srcPtrLen, ld4
    VMOV B0.D[0], H0
    MOVD.P H0, 8(dstPtr)
    VMOV H1, T3.D[0]
    VEXT $8, ZERO.B16, B0.B16, B0.B16
ld4:
    TBZ $2, srcPtrLen, ld2
    VMOV B0.S[0], H0
    MOVW.P H0, 4(dstPtr)
    VEXT $12, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.S[0]
    VEXT $4, ZERO.B16, B0.B16, B0.B16
ld2:
    TBZ $1, srcPtrLen, ld1
    VMOV B0.H[0], H0
    MOVH.P H0, 2(dstPtr)
    VEXT $14, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.H[0]
    VEXT $2, ZERO.B16, B0.B16, B0.B16
ld1:
    TBZ $0, srcPtrLen, ld0
    VMOV B0.B[0], H0
    MOVB.P H0, 1(dstPtr)
    VEXT $15, T3.B16, ZERO.B16, T3.B16
    VMOV H1, T3.B[0]
ld0:

    VAND T3.B16, B5.B16, B5.B16
    VREV64 B5.B16, B5.B16

    VEOR ACC0.B16, B5.B16, B5.B16
    VEXT $8, B5.B16, B5.B16, T0.B16
    VEOR B5.B16, T0.B16, T0.B16
    VPMULL B5.D1, T1.D1, ACC1.Q1
    VPMULL2 B5.D2, T1.D2, ACC0.Q1
    VPMULL T0.D1, T2.D1, ACCM.Q1
    reduce()
done:
    VST1 [ACC0.B16], (tPtr)

    RET
@@ -21,7 +21,7 @@ type sm4CipherGCM struct {
var _ gcmAble = (*sm4CipherGCM)(nil)

//go:noescape
-func gcmSm4Init(productTable *[256]byte, rk []uint32)
+func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)

//go:noescape
func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
@@ -29,6 +29,12 @@ func gcmSm4Enc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []
//go:noescape
func gcmSm4Dec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)

//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)

@@ -40,6 +46,30 @@ type gcmAsm struct {
    bytesProductTable [256]byte
}

func gcmSm4InitInst(productTable *[256]byte, rk []uint32) {
    if supportSM4 {
        gcmSm4Init(productTable, rk, 1)
    } else {
        gcmSm4Init(productTable, rk, 0)
    }
}

func gcmSm4EncInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
    if supportSM4 {
        gcmSm4niEnc(productTable, dst, src, ctr, T, rk)
    } else {
        gcmSm4Enc(productTable, dst, src, ctr, T, rk)
    }
}

func gcmSm4DecInst(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32) {
    if supportSM4 {
        gcmSm4niDec(productTable, dst, src, ctr, T, rk)
    } else {
        gcmSm4Dec(productTable, dst, src, ctr, T, rk)
    }
}

// NewGCM returns the SM4 cipher wrapped in Galois Counter Mode. This is only
// called by crypto/cipher.NewGCM via the gcmAble interface.
func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
@@ -47,7 +77,7 @@ func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
    g.cipher = &c.sm4CipherAsm
    g.nonceSize = nonceSize
    g.tagSize = tagSize
-   gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+   gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
    return g, nil
}

@@ -92,7 +122,7 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
    }

    if len(plaintext) > 0 {
-       gcmSm4Enc(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc)
+       gcmSm4EncInst(&g.bytesProductTable, out, plaintext, &counter, &tagOut, g.cipher.enc)
    }
    gcmSm4Finish(&g.bytesProductTable, &tagMask, &tagOut, uint64(len(plaintext)), uint64(len(data)))
    copy(out[len(plaintext):], tagOut[:])
@@ -145,7 +175,7 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
        panic("cipher: invalid buffer overlap")
    }
    if len(ciphertext) > 0 {
-       gcmSm4Dec(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc)
+       gcmSm4DecInst(&g.bytesProductTable, out, ciphertext, &counter, &expectedTag, g.cipher.enc)
    }
    gcmSm4Finish(&g.bytesProductTable, &tagMask, &expectedTag, uint64(len(ciphertext)), uint64(len(data)))

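With NewGCM initializing through gcmSm4InitInst and Seal/Open dispatching through gcmSm4EncInst/gcmSm4DecInst, callers reach the SM4E path transparently via crypto/cipher. A rough usage sketch, assuming the package's public sm4.NewCipher constructor and simplified key/nonce handling:

    package main

    import (
        "crypto/cipher"
        "crypto/rand"
        "fmt"

        "github.com/emmansun/gmsm/sm4"
    )

    func main() {
        key := make([]byte, 16)
        nonce := make([]byte, 12)
        rand.Read(key)
        rand.Read(nonce)

        block, err := sm4.NewCipher(key)
        if err != nil {
            panic(err)
        }
        // crypto/cipher.NewGCM detects the gcmAble implementation above and
        // returns the assembly-backed AEAD.
        aead, err := cipher.NewGCM(block)
        if err != nil {
            panic(err)
        }
        ct := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), nil)
        pt, err := aead.Open(nil, nonce, ct, nil)
        fmt.Println(string(pt), err)
    }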
@@ -12,11 +12,11 @@ import (
func genPrecomputeTable() *gcmAsm {
    key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
    c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-   expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+   expandKey(key, c.enc, c.dec)
    c1 := &sm4CipherGCM{c}
    g := &gcmAsm{}
    g.cipher = &c1.sm4CipherAsm
-   gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+   gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
    return g
}

@@ -146,12 +146,12 @@ func TestBothDataPlaintext(t *testing.T) {
func createGcm() *gcmAsm {
    key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
    c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}, 4, 64}
-   expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
+   expandKey(key, c.enc, c.dec)
    c1 := &sm4CipherGCM{c}
    g := &gcmAsm{}
    g.cipher = &c1.sm4CipherAsm
    g.tagSize = 16
-   gcmSm4Init(&g.bytesProductTable, g.cipher.enc)
+   gcmSm4InitInst(&g.bytesProductTable, g.cipher.enc)
    return g
}

@@ -214,7 +214,7 @@ func TestGcmSm4Enc(t *testing.T) {

    out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
    gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-   gcmSm4Enc(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
+   gcmSm4EncInst(&gcm.bytesProductTable, out2, []byte(test.plaintext), &counter2, &tagOut2, gcm.cipher.enc)
    if hex.EncodeToString(out1) != hex.EncodeToString(out2) {
        t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString(out1), hex.EncodeToString(out2))
    }
@@ -244,7 +244,7 @@ func TestGcmSm4Dec(t *testing.T) {

    out2 := make([]byte, len(test.plaintext)+gcm.tagSize)
    gcmSm4Data(&gcm.bytesProductTable, []byte("emmansun"), &tagOut2)
-   gcmSm4Dec(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)
+   gcmSm4DecInst(&gcm.bytesProductTable, out2, out1, &counter2, &tagOut2, gcm.cipher.enc)

    if hex.EncodeToString([]byte(test.plaintext)) != hex.EncodeToString(out2[:len(test.plaintext)]) {
        t.Errorf("#%d: out expected %s, got %s", i, hex.EncodeToString([]byte(test.plaintext)), hex.EncodeToString(out2[:len(test.plaintext)]))