From fd34c2eff2f69aac36d5176a8a98c275521b9fda Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Mon, 25 Jul 2022 16:37:23 +0800
Subject: [PATCH] sm4: arm64 transpose matrix use VZIP1 VZIP2

---
 sm4/aesni_arm64.h | 53 ++++++++++++++++-------------------------
 sm4/gcm_arm64.s   | 32 ++++++++++++++--------------
 2 files changed, 34 insertions(+), 51 deletions(-)

diff --git a/sm4/aesni_arm64.h b/sm4/aesni_arm64.h
index 363da9c..c1ad80a 100644
--- a/sm4/aesni_arm64.h
+++ b/sm4/aesni_arm64.h
@@ -82,22 +82,15 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 // t1 = t3.S1, t2.S1, t1.S1, t0.S1
 // t2 = t3.S2, t2.S2, t1.S2, t0.S2
 // t3 = t3.S3, t2.S3, t1.S3, t0.S3
-#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
-	VMOV t0.B16, K.B16    \
-	VMOV t1.S[0], t0.S[1] \
-	VMOV t2.S[0], t0.S[2] \
-	VMOV t3.S[0], t0.S[3] \
-	VMOV K.S[1], t1.S[0]  \
-	VMOV K.S[2], t2.S[0]  \
-	VMOV K.S[3], t3.S[0]  \
-	VMOV t1.D[1], K.D[1]  \
-	VMOV t2.S[1], t1.S[2] \
-	VMOV t3.S[1], t1.S[3] \
-	VMOV K.S[2], t2.S[1]  \
-	VMOV K.S[3], t3.S[1]  \
-	VMOV t2.S[3], K.S[3]  \
-	VMOV t3.S[2], t2.S[3] \
-	VMOV K.S[3], t3.S[2]
+#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
+	VZIP1 t1.S4, t0.S4, RTMP0.S4    \
+	VZIP1 t3.S4, t2.S4, RTMP1.S4    \
+	VZIP2 t1.S4, t0.S4, RTMP2.S4    \
+	VZIP2 t3.S4, t2.S4, RTMP3.S4    \
+	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2 \
+	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2 \
+	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2 \
+	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2
 
 // input: from high to low
 // t0 = t0.S3, t0.S2, t0.S1, t0.S0
@@ -109,25 +102,15 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 // t1 = t0.S1, t1.S1, t2.S1, t3.S1
 // t2 = t0.S2, t1.S2, t2.S2, t3.S2
 // t3 = t0.S3, t1.S3, t2.S3, t3.S3
-#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
-	VMOV t0.B16, K.B16    \
-	VMOV t3.S[0], t0.S[0] \
-	VMOV t2.S[0], t0.S[1] \
-	VMOV t1.S[0], t0.S[2] \
-	VMOV K0.S[0], t0.S[3] \
-	VMOV t3.S[1], t1.S[0] \
-	VMOV t3.S[2], t2.S[0] \
-	VMOV t3.S[3], t3.S[0] \
-	VMOV t2.S[3], t3.S[1] \
-	VMOV t1.S[3], t3.S[2] \
-	VMOV K.S[3], t3.S[3]  \
-	VMOV K.S[2], t2.S[3]  \
-	VMOV K.S[1], t1.S[3]  \
-	VMOV t1.B16, K.B16    \
-	VMOV t2.S[1], t1.S[1] \
-	VMOV K.S[1], t1.S[2]  \
-	VMOV t2.S[2], t2.S[1] \
-	VMOV K.S[2], t2.S[2]
+#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
+	VZIP1 t0.S4, t1.S4, RTMP0.S4    \
+	VZIP2 t0.S4, t1.S4, RTMP1.S4    \
+	VZIP1 t2.S4, t3.S4, RTMP2.S4    \
+	VZIP2 t2.S4, t3.S4, RTMP3.S4    \
+	VZIP1 RTMP0.D2, RTMP2.D2, t0.D2 \
+	VZIP2 RTMP0.D2, RTMP2.D2, t1.D2 \
+	VZIP1 RTMP1.D2, RTMP3.D2, t2.D2 \
+	VZIP2 RTMP1.D2, RTMP3.D2, t3.D2
 
 // SM4 sbox function
 // parameters:
diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s
index 8b7356d..9a04ba5 100644
--- a/sm4/gcm_arm64.s
+++ b/sm4/gcm_arm64.s
@@ -448,7 +448,7 @@ encOctetsLoop:
 	VADD B7.S4, INC.S4, CTR.S4
 
 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk
 
@@ -465,9 +465,9 @@ encOctetsEnc4Blocks1:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	// encryption second 4 blocks
-	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 	MOVD rkSave, rk
 
 encOctetsEnc4Blocks2:
@@ -483,7 +483,7 @@ encOctetsEnc4Blocks2:
 	VREV32 B5.B16, B5.B16
 	VREV32 B6.B16, B6.B16
 	VREV32 B7.B16, B7.B16
-	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 
 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
@@ -543,7 +543,7 @@ encNibblesLoop:
 	VADD B3.S4, INC.S4, CTR.S4
 
 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk
 
@@ -560,7 +560,7 @@ encNibblesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 
 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
@@ -588,7 +588,7 @@ encStartSingles:
 	VADD B3.S4, INC.S4, CTR.S4
 
 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk
 
@@ -605,7 +605,7 @@ encSinglesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 
 	VMOV B0.B16, K0.B16
 	CMP $16, srcPtrLen
@@ -740,7 +740,7 @@ decOctetsLoop:
 	VADD B7.S4, INC.S4, CTR.S4
 
 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk
 
@@ -757,10 +757,10 @@ decOctetsEnc4Blocks1:
 	VREV32 B1.B16, T2.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(T1, T2, B2, B3, K0)
+	TRANSPOSE_MATRIX(T1, T2, B2, B3, K0, K1, K2, K3)
 
 	// encryption second 4 blocks
-	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	PRE_TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 	MOVD rkSave, rk
 
 decOctetsEnc4Blocks2:
@@ -776,7 +776,7 @@ decOctetsEnc4Blocks2:
 	VREV32 B5.B16, B5.B16
 	VREV32 B6.B16, B6.B16
 	VREV32 B7.B16, B7.B16
-	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0)
+	TRANSPOSE_MATRIX(B4, B5, B6, B7, K0, K1, K2, K3)
 
 	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
 	VEOR B0.B16, T1.B16, T1.B16
@@ -837,7 +837,7 @@ decNibblesLoop:
 	VADD B3.S4, INC.S4, CTR.S4
 
 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk
 
@@ -854,7 +854,7 @@ decNibblesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 
 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [K1.B16, K2.B16]
@@ -885,7 +885,7 @@ decStartSingles:
 	VADD B3.S4, INC.S4, CTR.S4
 
 	// encryption first 4 blocks
-	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	PRE_TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 	EOR R13, R13
 	MOVD rkSave, rk
 
@@ -902,7 +902,7 @@ decSinglesEnc4Blocks:
 	VREV32 B1.B16, B1.B16
 	VREV32 B2.B16, B2.B16
 	VREV32 B3.B16, B3.B16
-	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0)
+	TRANSPOSE_MATRIX(B0, B1, B2, B3, K0, K1, K2, K3)
 
 	VMOV B0.B16, K0.B16
 	CMP $16, srcPtrLen
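
The eight VZIP1/VZIP2 instructions in the new macros are the standard two-level 4x4 word transpose: first interleave 32-bit lanes of adjacent rows, then interleave 64-bit halves of the intermediate results. The following plain Go sketch (not part of the patch or the repository) models that pattern so the documented output layout of PRE_TRANSPOSE_MATRIX can be checked on paper or by running it. It assumes the Go assembler form "VZIP1 Vm, Vn, Vd" maps to ARM "ZIP1 Vd, Vn, Vm"; the names vec, zip1S, zip2S, zip1D and zip2D are illustrative helpers only.

// zip_transpose_sketch.go - models the zip-based 4x4 32-bit word transpose.
package main

import "fmt"

type vec [4]uint32

// zip1S/zip2S interleave the low/high halves of n and m at 32-bit granularity,
// matching ZIP1/ZIP2 Vd.4S, Vn.4S, Vm.4S.
func zip1S(n, m vec) vec { return vec{n[0], m[0], n[1], m[1]} }
func zip2S(n, m vec) vec { return vec{n[2], m[2], n[3], m[3]} }

// zip1D/zip2D do the same at 64-bit granularity, i.e. on pairs of words,
// matching ZIP1/ZIP2 Vd.2D, Vn.2D, Vm.2D.
func zip1D(n, m vec) vec { return vec{n[0], n[1], m[0], m[1]} }
func zip2D(n, m vec) vec { return vec{n[2], n[3], m[2], m[3]} }

func main() {
	// Four "registers"; element 0xRC marks row R, column (lane) C.
	t0 := vec{0x00, 0x01, 0x02, 0x03}
	t1 := vec{0x10, 0x11, 0x12, 0x13}
	t2 := vec{0x20, 0x21, 0x22, 0x23}
	t3 := vec{0x30, 0x31, 0x32, 0x33}

	// The eight steps of PRE_TRANSPOSE_MATRIX.
	r0 := zip1S(t0, t1) // VZIP1 t1.S4, t0.S4, RTMP0.S4
	r1 := zip1S(t2, t3) // VZIP1 t3.S4, t2.S4, RTMP1.S4
	r2 := zip2S(t0, t1) // VZIP2 t1.S4, t0.S4, RTMP2.S4
	r3 := zip2S(t2, t3) // VZIP2 t3.S4, t2.S4, RTMP3.S4
	t0 = zip1D(r0, r1)  // VZIP1 RTMP1.D2, RTMP0.D2, t0.D2
	t1 = zip2D(r0, r1)  // VZIP2 RTMP1.D2, RTMP0.D2, t1.D2
	t2 = zip1D(r2, r3)  // VZIP1 RTMP3.D2, RTMP2.D2, t2.D2
	t3 = zip2D(r2, r3)  // VZIP2 RTMP3.D2, RTMP2.D2, t3.D2

	fmt.Printf("%02x\n%02x\n%02x\n%02x\n", t0, t1, t2, t3)
}

Running it prints [00 10 20 30], [01 11 21 31], [02 12 22 32], [03 13 23 33]: each output register holds lane i of every input register, which is the layout documented in the comment block above PRE_TRANSPOSE_MATRIX. TRANSPOSE_MATRIX differs only in swapping the operand order of the zips, which yields the lane-reversed layout its own comment block describes.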