sm4: arm64 transpose matrix use VZIP1 VZIP2
parent 9b364dca8b
commit fd34c2eff2
@@ -82,22 +82,15 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 // t1 = t3.S1, t2.S1, t1.S1, t0.S1
 // t2 = t3.S2, t2.S2, t1.S2, t0.S2
 // t3 = t3.S3, t2.S3, t1.S3, t0.S3
-#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
-	VMOV t0.B16, K.B16    \
-	VMOV t1.S[0], t0.S[1] \
-	VMOV t2.S[0], t0.S[2] \
-	VMOV t3.S[0], t0.S[3] \
-	VMOV K.S[1], t1.S[0]  \
-	VMOV K.S[2], t2.S[0]  \
-	VMOV K.S[3], t3.S[0]  \
-	VMOV t1.D[1], K.D[1]  \
-	VMOV t2.S[1], t1.S[2] \
-	VMOV t3.S[1], t1.S[3] \
-	VMOV K.S[2], t2.S[1]  \
-	VMOV K.S[3], t3.S[1]  \
-	VMOV t2.S[3], K.S[3]  \
-	VMOV t3.S[2], t2.S[3] \
-	VMOV K.S[3], t3.S[2]
+#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
+	VZIP1 t1.S4, t0.S4, RTMP0.S4    \
+	VZIP1 t3.S4, t2.S4, RTMP1.S4    \
+	VZIP2 t1.S4, t0.S4, RTMP2.S4    \
+	VZIP2 t3.S4, t2.S4, RTMP3.S4    \
+	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2 \
+	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2 \
+	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2 \
+	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2

 // input: from high to low
 // t0 = t0.S3, t0.S2, t0.S1, t0.S0
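Net effect of this hunk: fifteen serially dependent VMOV lane moves become eight VZIP1/VZIP2 instructions arranged as two stages of four mutually independent operations, at the cost of three extra scratch registers (RTMP0-RTMP3 in place of the single K).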
@@ -109,25 +102,15 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 // t1 = t0.S1, t1.S1, t2.S1, t3.S1
 // t2 = t0.S2, t1.S2, t2.S2, t3.S2
 // t3 = t0.S3, t1.S3, t2.S3, t3.S3
-#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
-	VMOV t0.B16, K.B16    \
-	VMOV t3.S[0], t0.S[0] \
-	VMOV t2.S[0], t0.S[1] \
-	VMOV t1.S[0], t0.S[2] \
-	VMOV K0.S[0], t0.S[3] \
-	VMOV t3.S[1], t1.S[0] \
-	VMOV t3.S[2], t2.S[0] \
-	VMOV t3.S[3], t3.S[0] \
-	VMOV t2.S[3], t3.S[1] \
-	VMOV t1.S[3], t3.S[2] \
-	VMOV K.S[3], t3.S[3]  \
-	VMOV K.S[2], t2.S[3]  \
-	VMOV K.S[1], t1.S[3]  \
-	VMOV t1.B16, K.B16    \
-	VMOV t2.S[1], t1.S[1] \
-	VMOV K.S[1], t1.S[2]  \
-	VMOV t2.S[2], t2.S[1] \
-	VMOV K.S[2], t2.S[2]
+#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
+	VZIP1 t0.S4, t1.S4, RTMP0.S4    \
+	VZIP2 t0.S4, t1.S4, RTMP1.S4    \
+	VZIP1 t2.S4, t3.S4, RTMP2.S4    \
+	VZIP2 t2.S4, t3.S4, RTMP3.S4    \
+	VZIP1 RTMP0.D2, RTMP2.D2, t0.D2 \
+	VZIP2 RTMP0.D2, RTMP2.D2, t1.D2 \
+	VZIP1 RTMP1.D2, RTMP3.D2, t2.D2 \
+	VZIP2 RTMP1.D2, RTMP3.D2, t3.D2

 // SM4 sbox function
 // parameters:
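Both macros use the same well-known NEON idiom: a 4x4 transpose of 32-bit words in two zip stages. The first stage's .4S zips interleave word pairs within two row pairs; the second stage's .2D zips stitch those 64-bit halves into full columns. The two macros differ only in zip operand order, which reverses the lane order of the gathered columns to match the high-to-low word layouts documented in the comments. Below is a minimal Go sketch of the word-level semantics; the zip helpers model the architectural ZIP1/ZIP2 behavior (note the Go assembler writes operands in reverse, destination last), and all helper names are illustrative rather than anything from this repository:

```go
package main

import "fmt"

// zip1/zip2 model AArch64 ZIP1/ZIP2 on a .4S arrangement:
// interleave the low (zip1) or high (zip2) word pairs of two vectors.
func zip1(a, b [4]uint32) [4]uint32 { return [4]uint32{a[0], b[0], a[1], b[1]} }
func zip2(a, b [4]uint32) [4]uint32 { return [4]uint32{a[2], b[2], a[3], b[3]} }

// zip1D/zip2D model the .2D forms: each 64-bit lane carries a
// pair of adjacent 32-bit words from the first stage.
func zip1D(a, b [4]uint32) [4]uint32 { return [4]uint32{a[0], a[1], b[0], b[1]} }
func zip2D(a, b [4]uint32) [4]uint32 { return [4]uint32{a[2], a[3], b[2], b[3]} }

// transpose4x4 is the two-stage pattern behind TRANSPOSE_MATRIX:
// stage 1 interleaves words within row pairs, stage 2 interleaves
// 64-bit halves to finish the transpose (columns in ascending lane
// order here; the macros pick operand orders for their layouts).
func transpose4x4(t0, t1, t2, t3 [4]uint32) (c0, c1, c2, c3 [4]uint32) {
	lo01 := zip1(t0, t1) // t0.S0 t1.S0 t0.S1 t1.S1
	hi01 := zip2(t0, t1) // t0.S2 t1.S2 t0.S3 t1.S3
	lo23 := zip1(t2, t3)
	hi23 := zip2(t2, t3)
	c0 = zip1D(lo01, lo23) // t0.S0 t1.S0 t2.S0 t3.S0
	c1 = zip2D(lo01, lo23) // t0.S1 t1.S1 t2.S1 t3.S1
	c2 = zip1D(hi01, hi23) // t0.S2 t1.S2 t2.S2 t3.S2
	c3 = zip2D(hi01, hi23) // t0.S3 t1.S3 t2.S3 t3.S3
	return
}

func main() {
	c0, c1, c2, c3 := transpose4x4(
		[4]uint32{0, 1, 2, 3},
		[4]uint32{4, 5, 6, 7},
		[4]uint32{8, 9, 10, 11},
		[4]uint32{12, 13, 14, 15},
	)
	fmt.Println(c0, c1, c2, c3)
	// Output: [0 4 8 12] [1 5 9 13] [2 6 10 14] [3 7 11 15]
}
```

Beyond the instruction-count win, the four zips within each stage have no dependencies among themselves, so an out-of-order core can issue them in parallel, whereas the old VMOV chain moved one lane at a time through the single scratch register K.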
@@ -448,7 +448,7 @@ encOctetsLoop:
 	VADD B7.S4, INC.S4, CTR.S4

 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk

@@ -465,9 +465,9 @@ encOctetsEnc4Blocks1:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	// encryption second 4 blocks
-	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 	MOVD rkSave, rk

 encOctetsEnc4Blocks2:
@@ -483,7 +483,7 @@ encOctetsEnc4Blocks2:
 	VREV32 B5.B16, B5.B16
 	VREV32 B6.B16, B6.B16
 	VREV32 B7.B16, B7.B16
-	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
@@ -543,7 +543,7 @@ encNibblesLoop:
 	VADD B3.S4, INC.S4, CTR.S4

 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk

@@ -560,7 +560,7 @@ encNibblesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
@@ -588,7 +588,7 @@ encStartSingles:
 	VADD B3.S4, INC.S4, CTR.S4

 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk

@@ -605,7 +605,7 @@ encSinglesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

 	VMOV B0.B16, K0.B16
 	CMP $16, srcPtrLen
@@ -740,7 +740,7 @@ decOctetsLoop:
 	VADD B7.S4, INC.S4, CTR.S4

 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk

@@ -757,10 +757,10 @@ decOctetsEnc4Blocks1:
 	VREV32 B1.B16, T2.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)
+	TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)

 	// encryption second 4 blocks
-	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 	MOVD rkSave, rk

 decOctetsEnc4Blocks2:
@@ -776,7 +776,7 @@ decOctetsEnc4Blocks2:
 	VREV32 B5.B16, B5.B16
 	VREV32 B6.B16, B6.B16
 	VREV32 B7.B16, B7.B16
-	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)

 	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
 	VEOR B0.B16, T1.B16, T1.B16
@@ -837,7 +837,7 @@ decNibblesLoop:
 	VADD B3.S4, INC.S4, CTR.S4

 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk

@@ -854,7 +854,7 @@ decNibblesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
@@ -885,7 +885,7 @@ decStartSingles:
 	VADD B3.S4, INC.S4, CTR.S4

 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk

@@ -902,7 +902,7 @@ decSinglesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)

 	VMOV B0.B16, K0.B16
 	CMP $16, srcPtrLen