sm4: optimize TAO L1 #168

commit 53e121c2b5, parent cc441bed27
@@ -41,14 +41,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
 GLOBL r08_mask<>(SB), 8, $16
 
-DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
-DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL r16_mask<>(SB), 8, $16
-
-DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
-DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL r24_mask<>(SB), 8, $16
-
 DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
 GLOBL fk_mask<>(SB), 8, $16
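The r08_mask table drives the PSHUFB/VPSHUFB/VTBL byte shuffles that rotate each 32-bit word of the state left by 8 bits; applying that same shuffle two or three times yields the 16- and 24-bit rotations, which is why the r16_mask/r24_mask tables (and the 256-bit copies removed in the next hunk) become redundant. Below is a minimal Go sketch of that composition, not code from the repository: the shuffle helper models only the byte-selection part of PSHUFB (the zeroing high bit is never set by these masks) and the test words are made up.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// shuffle models PSHUFB-style byte selection: dst[i] = src[mask[i]].
func shuffle(src, mask [16]byte) (dst [16]byte) {
	for i, m := range mask {
		dst[i] = src[m&0x0f]
	}
	return
}

func main() {
	// r08_mask<> from the diff ($0x0605040702010003, $0x0E0D0C0F0A09080B),
	// expanded to its 16 bytes in little-endian memory order.
	r08 := [16]byte{3, 0, 1, 2, 7, 4, 5, 6, 0x0b, 8, 9, 0x0a, 0x0f, 0x0c, 0x0d, 0x0e}

	var state [16]byte
	words := [4]uint32{0x01234567, 0x89abcdef, 0xdeadbeef, 0x0badf00d}
	for i, w := range words {
		binary.LittleEndian.PutUint32(state[4*i:], w)
	}

	r8 := shuffle(state, r08)  // one application: each word rotated left by 8
	r16 := shuffle(r8, r08)    // two applications: rotated left by 16
	r24 := shuffle(r16, r08)   // three applications: rotated left by 24

	for i, w := range words {
		fmt.Println(
			binary.LittleEndian.Uint32(r8[4*i:]) == bits.RotateLeft32(w, 8),
			binary.LittleEndian.Uint32(r16[4*i:]) == bits.RotateLeft32(w, 16),
			binary.LittleEndian.Uint32(r24[4*i:]) == bits.RotateLeft32(w, 24)) // expect: true true true
	}
}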
@@ -66,18 +58,6 @@ DATA r08_mask256<>+0x10(SB)/8, $0x0605040702010003
 DATA r08_mask256<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
 GLOBL r08_mask256<>(SB), 8, $32
 
-DATA r16_mask256<>+0x00(SB)/8, $0x0504070601000302
-DATA r16_mask256<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA r16_mask256<>+0x10(SB)/8, $0x0504070601000302
-DATA r16_mask256<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL r16_mask256<>(SB), 8, $32
-
-DATA r24_mask256<>+0x00(SB)/8, $0x0407060500030201
-DATA r24_mask256<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-DATA r24_mask256<>+0x10(SB)/8, $0x0407060500030201
-DATA r24_mask256<>+0x18(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL r24_mask256<>(SB), 8, $32
-
 // Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance!
 // input: from high to low
 // r0 = [w3, w2, w1, w0]
@@ -164,19 +144,18 @@ GLOBL r24_mask256<>(SB), 8, $32
 SM4_SBOX(x, y, z); \
 ; \ //#################### 4 parallel L1 linear transforms ##################//
 MOVOU x, y; \
-PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
-PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08)
-MOVOU x, z; \
-PSHUFB r16_mask<>(SB), z; \
-PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16)
+PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8
+MOVOU y, z; \
+PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16
+PXOR x, y; \ //y = x ^ (x <<< 8)
+PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16)
+PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24
+PXOR z, x; \ //x = x ^ (x <<< 24)
 MOVOU y, z; \
 PSLLL $2, z; \
 PSRLL $30, y; \
-POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30);
-MOVOU x, z; \
-PSHUFB r24_mask<>(SB), z; \
-PXOR y, x; \ //x = x xor y
-PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
+POR z, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+PXOR y, x
 
 // SM4 single round function, handle 16 bytes data
 // t0 ^= tao_l1(t1^t2^t3^xk)
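The rewritten sequence still computes SM4's L1 linear transform, L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24), but factors it as (B ^ (B <<< 24)) ^ ((B ^ (B <<< 8) ^ (B <<< 16)) <<< 2), so a single rotate-by-8 shuffle reused three times replaces the separate r08/r16/r24 shuffles; the AVX, AVX2 and NEON variants further down follow the same pattern. The following scalar Go sketch of the equivalence is for illustration only and is not code from the repository; the register-to-variable mapping in the comments is an informal guide.

package main

import (
	"fmt"
	"math/bits"
)

// l1Reference is SM4's L1 linear transform in its usual form.
func l1Reference(b uint32) uint32 {
	return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
		bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

// l1Optimized mirrors the instruction order of the rewritten SM4_TAO_L1:
// only rotations by 8 (the r08 shuffle) and by 2 (shift/or) are used.
func l1Optimized(b uint32) uint32 {
	x := b
	y := bits.RotateLeft32(x, 8) // PSHUFB r08_mask, y        y = x <<< 8
	z := bits.RotateLeft32(y, 8) // MOVOU y,z + PSHUFB r08    z = x <<< 16
	y ^= x                       // PXOR x, y                 y = x ^ (x <<< 8)
	y ^= z                       // PXOR z, y                 y = x ^ (x <<< 8) ^ (x <<< 16)
	z = bits.RotateLeft32(z, 8)  // PSHUFB r08_mask, z        z = x <<< 24
	x ^= z                       // PXOR z, x                 x = x ^ (x <<< 24)
	y = bits.RotateLeft32(y, 2)  // PSLLL/PSRLL/POR           y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
	return x ^ y                 // PXOR y, x
}

func main() {
	for _, b := range []uint32{0, 1, 0x0badf00d, 0xffffffff, 0xa3b1bac6} {
		fmt.Println(l1Reference(b) == l1Optimized(b)) // expect: true
	}
}

Dropping the r16/r24 shuffles also frees the registers and loads that previously held those masks, which is what the ARM64 hunks below clean up.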
@@ -239,6 +218,7 @@ GLOBL r24_mask256<>(SB), 8, $32
 PSHUFD $0xFF, rk128, x; \
 SM4_ONE_ROUND_SSE(x, y, z, t3, t0, t1, t2); \
 
+// Requires: SSSE3
 #define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \
 PSHUFB flip_mask<>(SB), t0; \
 PSHUFD $1, t0, t1; \
@@ -388,16 +368,16 @@ GLOBL r24_mask256<>(SB), 8, $32
 // - tmp: 128 bits temp register
 #define AVX_SM4_TAO_L1(x, y, tmp) \
 AVX_SM4_SBOX(x, y, tmp); \
-VPSHUFB r08_mask<>(SB), x, y; \
-VPXOR x, y, y; \
-VPSHUFB r16_mask<>(SB), x, tmp; \
-VPXOR tmp, y, y; \
+VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
+VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
+VPXOR x, y, y; \ // y = x ^ (x <<< 8)
+VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
+VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24
+VPXOR x, tmp, x; \ // x = x ^ (x <<< 24)
 VPSLLD $2, y, tmp; \
 VPSRLD $30, y, y; \
-VPOR tmp, y, y; \
-VPSHUFB r24_mask<>(SB), x, tmp; \
-VPXOR y, x, x; \
-VPXOR x, tmp, x
+VPOR tmp, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+VPXOR y, x, x
 
 // transpose matrix function, AVX/AVX2 version
 // parameters:
@@ -433,7 +413,7 @@ GLOBL r24_mask256<>(SB), 8, $32
 VPXOR t1, x, x; \
 VPXOR t2, x, x; \
 VPXOR t3, x, x; \
 AVX_SM4_TAO_L1(x, y, tmp); \
 VPXOR x, t0, t0
 
@@ -591,16 +571,16 @@ GLOBL r24_mask256<>(SB), 8, $32
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
-VPSHUFB r08_mask256<>(SB), x, y; \
-VPXOR x, y, y; \
-VPSHUFB r16_mask256<>(SB), x, z; \
-VPXOR z, y, y; \
+VPSHUFB r08_mask256<>(SB), x, y; \ // y = x <<< 8
+VPSHUFB r08_mask256<>(SB), y, z; \ // z = x <<< 16
+VPXOR x, y, y; \ // y = x ^ (x <<< 8)
+VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
+VPSHUFB r08_mask256<>(SB), z, z; \ // z = x <<< 24
+VPXOR x, z, x; \ // x = x ^ (x <<< 24)
 VPSLLD $2, y, z; \
 VPSRLD $30, y, y; \
-VPOR z, y, y; \
-VPSHUFB r24_mask256<>(SB), x, z; \
-VPXOR y, x, x; \
-VPXOR x, z, x
+VPOR z, y, y; \ // y = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+VPXOR y, x, x
 
 // SM4 round function, AVX2 version, handle 256 bits
 // t0 ^= tao_l1(t1^t2^t3^xk)

@@ -31,14 +31,6 @@ DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
 DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
 GLOBL r08_mask<>(SB), (16+8), $16
 
-DATA r16_mask<>+0x00(SB)/8, $0x0504070601000302
-DATA r16_mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL r16_mask<>(SB), (16+8), $16
-
-DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
-DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL r24_mask<>(SB), (16+8), $16
-
 DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
 DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
 GLOBL fk_mask<>(SB), (16+8), $16
@@ -64,13 +56,7 @@ GLOBL fk_mask<>(SB), (16+8), $16
 VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
 LDP r08_mask<>(SB), (R20, R21) \
 VMOV R20, R08_MASK.D[0] \
-VMOV R21, R08_MASK.D[1] \
-LDP r16_mask<>(SB), (R20, R21) \
-VMOV R20, R16_MASK.D[0] \
-VMOV R21, R16_MASK.D[1] \
-LDP r24_mask<>(SB), (R20, R21) \
-VMOV R20, R24_MASK.D[0] \
-VMOV R21, R24_MASK.D[1]
+VMOV R21, R08_MASK.D[1]
 
 // input: from high to low
 // t0 = t0.S3, t0.S2, t0.S1, t0.S0
@@ -141,15 +127,15 @@ GLOBL fk_mask<>(SB), (16+8), $16
 // - z: 128 bits temp register
 #define SM4_TAO_L1(x, y, z) \
 SM4_SBOX(x, y, z); \
-VTBL R08_MASK.B16, [x.B16], y.B16; \
-VEOR y.B16, x.B16, y.B16; \
-VTBL R16_MASK.B16, [x.B16], z.B16; \
-VEOR z.B16, y.B16, z.B16; \
-VSHL $2, z.S4, y.S4; \
-VSRI $30, z.S4, y.S4; \
-VTBL R24_MASK.B16, [x.B16], z.B16; \
-VEOR z.B16, x.B16, x.B16; \
-VEOR y.B16, x.B16, x.B16
+VTBL R08_MASK.B16, [x.B16], y.B16; \ // y = x <<< 8
+VTBL R08_MASK.B16, [y.B16], z.B16; \ // z = x <<< 16
+VEOR x.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8)
+VEOR z.B16, y.B16, y.B16; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
+VTBL R08_MASK.B16, [z.B16], z.B16; \ // z = x <<< 24
+VEOR z.B16, x.B16, x.B16; \ // x = x ^ (x <<< 24)
+VSHL $2, y.S4, z.S4; \
+VSRI $30, y.S4, z.S4; \ // z = (x <<< 2) ^ (x <<< 10) ^ (x <<< 18)
+VEOR z.B16, x.B16, x.B16
 
 // SM4 round function
 // t0 ^= tao_l1(t1^t2^t3^xk)

@@ -322,6 +322,7 @@ avx2_sm4_done:
 RET
 
 // func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
+// Requires: SSSE3
 TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 MOVQ xk+0(FP), AX
 MOVQ dst+8(FP), BX

@@ -21,9 +21,7 @@
 #define M2L V24
 #define M2H V25
 #define R08_MASK V26
-#define R16_MASK V27
-#define R24_MASK V28
-#define FK_MASK V29
+#define FK_MASK V27
 #define XTMP6 V6
 #define XTMP7 V7
 
@@ -78,13 +76,7 @@
 load_global_data_1() \
 LDP r08_mask<>(SB), (R0, R1) \
 VMOV R0, R08_MASK.D[0] \
-VMOV R1, R08_MASK.D[1] \
-LDP r16_mask<>(SB), (R0, R1) \
-VMOV R0, R16_MASK.D[0] \
-VMOV R1, R16_MASK.D[1] \
-LDP r24_mask<>(SB), (R0, R1) \
-VMOV R0, R24_MASK.D[0] \
-VMOV R1, R24_MASK.D[1]
+VMOV R1, R08_MASK.D[1]
 
 #define SM4EKEY_EXPORT_KEYS() \
 VMOV V9.S[3], V10.S[0] \

@@ -17,9 +17,7 @@
 #define M2L V24
 #define M2H V25
 #define R08_MASK V26
-#define R16_MASK V27
-#define R24_MASK V28
-#define FK_MASK V29
+#define FK_MASK V27
 #define XTMP6 V6
 #define IV V7
 

@@ -17,9 +17,7 @@
 #define M2L V24
 #define M2H V25
 #define R08_MASK V26
-#define R16_MASK V27
-#define R24_MASK V28
-#define FK_MASK V29
+#define FK_MASK V27
 #define XTMP6 V6
 #define XTMP7 V7
 #define t4 V10

@@ -37,8 +37,6 @@
 #define M2L V27
 #define M2H V28
 #define R08_MASK V29
-#define R16_MASK V30
-#define R24_MASK V31
 
 #define reduce() \
 VEOR ACC0.B16, ACCM.B16, ACCM.B16 \

@@ -37,8 +37,6 @@
 #define M2L V27
 #define M2H V28
 #define R08_MASK V29
-#define R16_MASK V30
-#define R24_MASK V31
 
 #include "aesni_macros_arm64.s"
 #include "xts_macros_arm64.s"