sm4: reduce duplicated code and add comments

parent 9204f1f4b2
commit 9b364dca8b
@ -53,6 +53,23 @@ DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), RODATA, $16

// Transpose the matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions; noticeably slower!
// input: from high to low
// r0 = [w3, w2, w1, w0]
// r1 = [w7, w6, w5, w4]
// r2 = [w11, w10, w9, w8]
// r3 = [w15, w14, w13, w12]
// r: 32/64-bit temp register
// tmp1: 128-bit temp register
// tmp2: 128-bit temp register
//
// output: from high to low
// r0 = [w12, w8, w4, w0]
// r1 = [w13, w9, w5, w1]
// r2 = [w14, w10, w6, w2]
// r3 = [w15, w11, w7, w3]
//
// SSE2/MMX instructions:
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
@ -122,6 +139,11 @@ GLOBL fk_mask<>(SB), RODATA, $16
    PEXTRQ $0, tmp2, r; \
    PINSRQ $0, r, r2

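For reference, independent of the instruction selection discussed above, the effect of these transpose macros is a plain 4x4 transpose of 32-bit words. A minimal Go sketch (function and variable names are illustrative, not from this repository):

package main

import "fmt"

// transpose4x4 rearranges 16 words so that column i of the input
// becomes row i of the output, matching the macro's documented
// input/output layout (r0=[w3..w0] ... -> r0=[w12,w8,w4,w0] ...).
func transpose4x4(r0, r1, r2, r3 *[4]uint32) {
    var m [4][4]uint32
    in := [4]*[4]uint32{r0, r1, r2, r3}
    for i := 0; i < 4; i++ {
        for j := 0; j < 4; j++ {
            m[j][i] = in[i][j] // element j of row i moves to row j, column i
        }
    }
    *r0, *r1, *r2, *r3 = m[0], m[1], m[2], m[3]
}

func main() {
    r0 := [4]uint32{0, 1, 2, 3} // [w0, w1, w2, w3], listed low to high
    r1 := [4]uint32{4, 5, 6, 7}
    r2 := [4]uint32{8, 9, 10, 11}
    r3 := [4]uint32{12, 13, 14, 15}
    transpose4x4(&r0, &r1, &r2, &r3)
    fmt.Println(r0, r1, r2, r3) // [0 4 8 12] [1 5 9 13] [2 6 10 14] [3 7 11 15]
}
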
// SM4 sbox function
// parameters:
// - x: 128-bit register as sbox input/output data
// - y: 128-bit temp register
// - z: 128-bit temp register
#define SM4_SBOX(x, y, z) \
    ; \ //############################# inner affine ############################//
    MOVOU x, z; \
@ -149,6 +171,11 @@ GLOBL fk_mask<>(SB), RODATA, $16
    MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x)
    PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y;

// SM4 TAO L1 function
// parameters:
// - x: 128-bit register as TAO_L1 input/output data
// - y: 128-bit temp register
// - z: 128-bit temp register
#define SM4_TAO_L1(x, y, z) \
    SM4_SBOX(x, y, z); \
    ; \ //#################### 4 parallel L1 linear transforms ##################//
@ -167,6 +194,56 @@ GLOBL fk_mask<>(SB), RODATA, $16
    PXOR y, x; \ //x = x xor y
    PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24);

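The byte shuffles and the shift/xor pair in SM4_TAO_L1 compute, per 32-bit word, the standard SM4 L1 linear transform: the byte-aligned rotations (8, 16, 24) come from shuffles and the remaining rotate-by-2 from a shift/xor pair. A minimal Go sketch of that per-word transform (names are illustrative):

package main

import (
    "fmt"
    "math/bits"
)

// l1 is the SM4 linear transform L used in the round function:
// L(b) = b ^ (b <<< 2) ^ (b <<< 10) ^ (b <<< 18) ^ (b <<< 24).
func l1(b uint32) uint32 {
    return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
        bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
}

func main() {
    fmt.Printf("%08x\n", l1(0x01234567))
}
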
// SM4 single round function, handles 16 bytes of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// uses R19 as a 32/64-bit temp register
// parameters:
// - index: round key index immediate number
// - RK: round key register
// - IND: round key index base register
// - x: 128-bit temp register
// - y: 128-bit temp register
// - z: 128-bit temp register
// - t0: 128-bit register for data, holds the result
// - t1: 128-bit register for data
// - t2: 128-bit register for data
// - t3: 128-bit register for data
#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(RK)(IND*1), x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0

// SM4 round function, handles 64 bytes of data
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key register
// - IND: round key index base register
// - x: 128-bit temp register
// - y: 128-bit temp register
// - z: 128-bit temp register
// - t0: 128-bit register for data, holds the result
// - t1: 128-bit register for data
// - t2: 128-bit register for data
// - t3: 128-bit register for data
#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(RK)(IND*1), x; \
    PSHUFD $0, x, x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0

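A scalar Go sketch of the round these macros vectorize; the S-box step is injected as a function here because its lookup table is defined elsewhere in the package and not shown in this diff:

package main

import (
    "fmt"
    "math/bits"
)

// round computes t0 ^= L1(tau(t1 ^ t2 ^ t3 ^ rk)), the operation named in
// the SM4_SINGLE_ROUND / SM4_ROUND comments above. tau is the byte-wise
// S-box substitution, passed in because its table is not part of this diff.
func round(tau func(uint32) uint32, rk, t0, t1, t2, t3 uint32) uint32 {
    x := tau(t1 ^ t2 ^ t3 ^ rk)
    x = x ^ bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
        bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24) // L1
    return t0 ^ x
}

func main() {
    identity := func(w uint32) uint32 { return w } // placeholder S-box, illustration only
    fmt.Printf("%08x\n", round(identity, 0xdeadbeef, 1, 2, 3, 4))
}
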
// SM4 sbox function, AVX version
// parameters:
// - x: 128-bit register as sbox input/output data
// - y: 128-bit temp register
// - X_NIBBLE_MASK: 128-bit register storing the nibble mask, should be loaded earlier.
// - tmp: 128-bit temp register
#define AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp) \
    VPAND X_NIBBLE_MASK, x, tmp; \
    VMOVDQU m1_low<>(SB), y; \
@ -188,8 +265,14 @@ GLOBL fk_mask<>(SB), RODATA, $16
    VPSHUFB x, tmp, x; \
    VPXOR y, x, x

#define AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp) \
    AVX_SM4_SBOX(x, y, X_NIBBLE_MASK, tmp); \
// SM4 TAO L1 function, AVX version
// parameters:
// - x: 128-bit register as sbox input/output data
// - y: 128-bit temp register
// - xNibbleMask: 128-bit register storing the nibble mask, should be loaded earlier.
// - tmp: 128-bit temp register
#define AVX_SM4_TAO_L1(x, y, xNibbleMask, tmp) \
    AVX_SM4_SBOX(x, y, xNibbleMask, tmp); \
    VMOVDQU r08_mask<>(SB), tmp; \
    VPSHUFB tmp, x, y; \
    VPXOR x, y, y; \
@ -204,6 +287,14 @@ GLOBL fk_mask<>(SB), RODATA, $16
    VPXOR y, x, x; \
    VPXOR x, tmp, x

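The m1_low/m1_high and m2_low/m2_high lookups above use the usual PSHUFB nibble trick: a GF(2)-linear byte map is evaluated as the XOR of two 16-entry table lookups indexed by the low and high nibble. A Go sketch of that decomposition with placeholder tables (not the real SM4 affine constants):

package main

import "fmt"

// nibbleLookup evaluates f(b) = lo[b&0x0f] ^ hi[b>>4] for each byte,
// i.e. two 16-entry table lookups XORed together, which is how the
// VPSHUFB-based affine steps above split their byte mapping.
func nibbleLookup(lo, hi *[16]byte, in []byte) []byte {
    out := make([]byte, len(in))
    for i, b := range in {
        out[i] = lo[b&0x0f] ^ hi[b>>4]
    }
    return out
}

func main() {
    var lo, hi [16]byte
    for i := range lo {
        lo[i] = byte(i)      // placeholder low-nibble table
        hi[i] = byte(i) << 4 // placeholder high-nibble table
    }
    fmt.Printf("%x\n", nibbleLookup(&lo, &hi, []byte{0x00, 0x3a, 0xff}))
}
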
// transpose matrix function, AVX/AVX2 version
// parameters:
// - r0: 128/256-bit register as input/output data
// - r1: 128/256-bit register as input/output data
// - r2: 128/256-bit register as input/output data
// - r3: 128/256-bit register as input/output data
// - tmp1: 128/256-bit temp register
// - tmp2: 128/256-bit temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
    VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
    VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
@ -214,42 +305,60 @@ GLOBL fk_mask<>(SB), RODATA, $16
    VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
    VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]

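Assuming the two interleave steps hidden by the hunk boundary follow the documented output layout, the 128-bit behaviour of this macro decomposes into 32-bit and 64-bit interleaves, as in this Go sketch (helper names are illustrative, words listed low to high):

package main

import "fmt"

// unpackLo32/unpackHi32 mirror VPUNPCKLDQ/VPUNPCKHDQ on one 128-bit lane:
// they interleave the low (or high) two 32-bit words of a and b.
func unpackLo32(a, b [4]uint32) [4]uint32 { return [4]uint32{a[0], b[0], a[1], b[1]} }
func unpackHi32(a, b [4]uint32) [4]uint32 { return [4]uint32{a[2], b[2], a[3], b[3]} }

// unpackLo64/unpackHi64 mirror VPUNPCKLQDQ/VPUNPCKHQDQ: they interleave
// the low (or high) 64-bit halves, i.e. word pairs.
func unpackLo64(a, b [4]uint32) [4]uint32 { return [4]uint32{a[0], a[1], b[0], b[1]} }
func unpackHi64(a, b [4]uint32) [4]uint32 { return [4]uint32{a[2], a[3], b[2], b[3]} }

func main() {
    r0 := [4]uint32{0, 1, 2, 3} // [w0..w3]
    r1 := [4]uint32{4, 5, 6, 7}
    r2 := [4]uint32{8, 9, 10, 11}
    r3 := [4]uint32{12, 13, 14, 15}

    tmp2 := unpackHi32(r0, r1) // [w2, w6, w3, w7]
    r0 = unpackLo32(r0, r1)    // [w0, w4, w1, w5]
    tmp1 := unpackLo32(r2, r3) // [w8, w12, w9, w13]  (step inferred from the output layout)
    r2 = unpackHi32(r2, r3)    // [w10, w14, w11, w15] (step inferred from the output layout)

    r1 = unpackHi64(r0, tmp1)  // [w1, w5, w9, w13]
    r0 = unpackLo64(r0, tmp1)  // [w0, w4, w8, w12]
    r3 = unpackHi64(tmp2, r2)  // [w3, w7, w11, w15]
    r2 = unpackLo64(tmp2, r2)  // [w2, w6, w10, w14]

    fmt.Println(r0, r1, r2, r3)
}
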
#define AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
    VPAND yNibbleMask, x, tmp; \
    VBROADCASTI128 m1_low<>(SB), y; \
    VPSHUFB tmp, y, y; \
    VPSRLQ $4, x, x; \
    VPAND yNibbleMask, x, x; \
    VBROADCASTI128 m1_high<>(SB), tmp; \
    VPSHUFB x, tmp, x; \
    VPXOR y, x, x; \
    VBROADCASTI128 inverse_shift_rows<>(SB), tmp; \
    VPSHUFB tmp, x, x; \
    VEXTRACTI128 $1, x, yw \
    VAESENCLAST xNibbleMask, xw, xw; \
    VAESENCLAST xNibbleMask, yw, yw; \
    VINSERTI128 $1, yw, x, x; \
    VPANDN yNibbleMask, x, tmp; \
    VBROADCASTI128 m2_low<>(SB), y; \
    VPSHUFB tmp, y, y; \
    VPSRLQ $4, x, x; \
    VPAND yNibbleMask, x, x; \
    VBROADCASTI128 m2_high<>(SB), tmp; \
    VPSHUFB x, tmp, x; \
// SM4 sbox function, AVX2 version
// parameters:
// - x: 256-bit register as sbox input/output data
// - y: 256-bit temp register
// - z: 256-bit temp register
// - xw: 128-bit temp register
// - yw: 128-bit temp register
// - xNibbleMask: 128-bit register storing the nibble mask, should be loaded earlier.
// - yNibbleMask: 256-bit register storing the nibble mask, should be loaded earlier.
#define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
    VPAND yNibbleMask, x, z; \
    VBROADCASTI128 m1_low<>(SB), y; \
    VPSHUFB z, y, y; \
    VPSRLQ $4, x, x; \
    VPAND yNibbleMask, x, x; \
    VBROADCASTI128 m1_high<>(SB), z; \
    VPSHUFB x, z, x; \
    VPXOR y, x, x; \
    VBROADCASTI128 inverse_shift_rows<>(SB), z; \
    VPSHUFB z, x, x; \
    VEXTRACTI128 $1, x, yw \
    VAESENCLAST xNibbleMask, xw, xw; \
    VAESENCLAST xNibbleMask, yw, yw; \
    VINSERTI128 $1, yw, x, x; \
    VPANDN yNibbleMask, x, z; \
    VBROADCASTI128 m2_low<>(SB), y; \
    VPSHUFB z, y, y; \
    VPSRLQ $4, x, x; \
    VPAND yNibbleMask, x, x; \
    VBROADCASTI128 m2_high<>(SB), z; \
    VPSHUFB x, z, x; \
    VPXOR y, x, x

#define AVX2_SM4_TAO_L1(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp) \
    AVX2_SM4_SBOX(x, y, xw, yw, xNibbleMask, yNibbleMask, tmp); \
    VBROADCASTI128 r08_mask<>(SB), tmp; \
    VPSHUFB tmp, x, y; \
    VPXOR x, y, y; \
    VBROADCASTI128 r16_mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR tmp, y, y; \
    VPSLLD $2, y, tmp; \
    VPSRLD $30, y, y; \
    VPXOR tmp, y, y; \
    VBROADCASTI128 r24_mask<>(SB), tmp; \
    VPSHUFB tmp, x, tmp; \
    VPXOR y, x, x; \
    VPXOR x, tmp, x
// SM4 TAO L1 function, AVX2 version
// parameters:
// - x: 256-bit register as sbox input/output data
// - y: 256-bit temp register
// - z: 256-bit temp register
// - xw: 128-bit temp register
// - yw: 128-bit temp register
// - xNibbleMask: 128-bit register storing the nibble mask, should be loaded earlier.
// - yNibbleMask: 256-bit register storing the nibble mask, should be loaded earlier.
#define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
    AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
    VBROADCASTI128 r08_mask<>(SB), z; \
    VPSHUFB z, x, y; \
    VPXOR x, y, y; \
    VBROADCASTI128 r16_mask<>(SB), z; \
    VPSHUFB z, x, z; \
    VPXOR z, y, y; \
    VPSLLD $2, y, z; \
    VPSRLD $30, y, y; \
    VPXOR z, y, y; \
    VBROADCASTI128 r24_mask<>(SB), z; \
    VPSHUFB z, x, z; \
    VPXOR y, x, x; \
    VPXOR x, z, x

@ -43,6 +43,45 @@ DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16

#define LOAD_SM4_AESNI_CONSTS() \
    LDP nibble_mask<>(SB), (R20, R21) \
    VMOV R20, NIBBLE_MASK.D[0] \
    VMOV R21, NIBBLE_MASK.D[1] \
    LDP m1_low<>(SB), (R20, R21) \
    VMOV R20, M1L.D[0] \
    VMOV R21, M1L.D[1] \
    LDP m1_high<>(SB), (R20, R21) \
    VMOV R20, M1H.D[0] \
    VMOV R21, M1H.D[1] \
    LDP m2_low<>(SB), (R20, R21) \
    VMOV R20, M2L.D[0] \
    VMOV R21, M2L.D[1] \
    LDP m2_high<>(SB), (R20, R21) \
    VMOV R20, M2H.D[0] \
    VMOV R21, M2H.D[1] \
    LDP inverse_shift_rows<>(SB), (R20, R21) \
    VMOV R20, INVERSE_SHIFT_ROWS.D[0] \
    VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
    LDP r08_mask<>(SB), (R20, R21) \
    VMOV R20, R08_MASK.D[0] \
    VMOV R21, R08_MASK.D[1] \
    LDP r16_mask<>(SB), (R20, R21) \
    VMOV R20, R16_MASK.D[0] \
    VMOV R21, R16_MASK.D[1] \
    LDP r24_mask<>(SB), (R20, R21) \
    VMOV R20, R24_MASK.D[0] \
    VMOV R21, R24_MASK.D[1]

// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
    VMOV t0.B16, K.B16 \
    VMOV t1.S[0], t0.S[1] \
@ -60,6 +99,16 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
    VMOV t3.S[2], t2.S[3] \
    VMOV K.S[3], t3.S[2]

// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t0.S0, t1.S0, t2.S0, t3.S0
// t1 = t0.S1, t1.S1, t2.S1, t3.S1
// t2 = t0.S2, t1.S2, t2.S2, t3.S2
// t3 = t0.S3, t1.S3, t2.S3, t3.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, K) \
    VMOV t0.B16, K.B16 \
    VMOV t3.S[0], t0.S[0] \
@ -80,6 +129,11 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
    VMOV t2.S[2], t2.S[1] \
    VMOV K.S[2], t2.S[2]

// SM4 sbox function
// parameters:
// - x: 128-bit register as sbox input/output data
// - y: 128-bit temp register
// - z: 128-bit temp register
#define SM4_SBOX(x, y, z) \
    ; \
    VAND x.B16, NIBBLE_MASK.B16, z.B16; \
@ -97,6 +151,11 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
    VTBL z.B16, [M2H.B16], z.B16; \
    VEOR y.B16, z.B16, x.B16

// SM4 TAO L1 function
// parameters:
// - x: 128-bit register as TAO_L1 input/output data
// - y: 128-bit temp register
// - z: 128-bit temp register
#define SM4_TAO_L1(x, y, z) \
    SM4_SBOX(x, y, z); \
    VTBL R08_MASK.B16, [x.B16], y.B16; \
@ -109,3 +168,24 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
    VTBL R24_MASK.B16, [x.B16], z.B16; \
    VEOR z.B16, x.B16, x.B16; \
    VEOR y.B16, x.B16, x.B16

// SM4 round function
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - RK: round key register
// - tmp32: 32/64-bit temp register
// - x: 128-bit temp register
// - y: 128-bit temp register
// - z: 128-bit temp register
// - t0: 128-bit register for data, holds the result
// - t1: 128-bit register for data
// - t2: 128-bit register for data
// - t3: 128-bit register for data
#define SM4_ROUND(RK, tmp32, x, y, z, t0, t1, t2, t3) \
    MOVW.P 4(RK), tmp32; \
    VMOV tmp32, x.S4; \
    VEOR t1.B16, x.B16, x.B16; \
    VEOR t2.B16, x.B16, x.B16; \
    VEOR t3.B16, x.B16, x.B16; \
    SM4_TAO_L1(x, y, z); \
    VEOR x.B16, t0.B16, t0.B16

@ -16,6 +16,12 @@

#include "aesni_amd64.h"

// SM4 TAO L2 function, used for key expansion
// parameters:
// - x: 128-bit register as TAO_L2 input/output data
// - y: 128-bit temp register
// - tmp1: 128-bit temp register
// - tmp2: 128-bit temp register
#define SM4_TAO_L2(x, y, tmp1, tmp2) \
    SM4_SBOX(x, y, tmp1); \
    ; \ //#################### 4 parallel L2 linear transforms ##################//
@ -31,23 +37,16 @@
    PXOR tmp2, y; \
    PXOR y, x

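For reference, the linear part that SM4_TAO_L2 applies after the shared S-box step is the SM4 key-schedule transform L'; a minimal Go sketch:

package main

import (
    "fmt"
    "math/bits"
)

// l2 is the SM4 key-schedule linear transform: L'(b) = b ^ (b <<< 13) ^ (b <<< 23).
// It uses two rotations instead of the four in L1, which is why SM4_TAO_L2
// needs fewer shuffle/shift steps than SM4_TAO_L1.
func l2(b uint32) uint32 {
    return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
}

func main() {
    fmt.Printf("%08x\n", l2(0x01234567))
}
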
#define SM4_ROUND(index, x, y, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(AX)(CX*1), x; \
    PSHUFD $0, x, x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, XTMP6); \
    PXOR x, t0

#define SM4_SINGLE_ROUND(index, x, y, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(AX)(CX*1), x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, XTMP6); \
    PXOR x, t0

// SM4 key expansion round function
// t0 ^= tao_l2(t1^t2^t3^ck) and store t0.S[0] into the enc/dec key schedules
// parameters:
// - index: round key index immediate number
// - x: 128-bit temp register
// - y: 128-bit temp register
// - t0: 128-bit register for data
// - t1: 128-bit register for data
// - t2: 128-bit register for data
// - t3: 128-bit register for data
#define SM4_EXPANDKEY_ROUND(index, x, y, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(BX)(CX*1), x; \
    PXOR t1, x; \

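A scalar Go sketch of the key-expansion step this macro performs; the S-box step and the CK round constant are treated as inputs because their tables live elsewhere in the package:

package main

import (
    "fmt"
    "math/bits"
)

// expandRound mirrors SM4_EXPANDKEY_ROUND: given the sliding key state
// k0..k3 and the round constant ck, it returns the next key word
// k0 ^ L2(tau(k1 ^ k2 ^ k3 ^ ck)), which the assembly stores into both the
// enc key schedule and, in reverse order, the dec key schedule.
// tau (the S-box step) is injected because its table is not in this diff.
func expandRound(tau func(uint32) uint32, ck, k0, k1, k2, k3 uint32) uint32 {
    x := tau(k1 ^ k2 ^ k3 ^ ck)
    x = x ^ bits.RotateLeft32(x, 13) ^ bits.RotateLeft32(x, 23) // L2
    return k0 ^ x
}

func main() {
    identity := func(w uint32) uint32 { return w } // placeholder S-box
    fmt.Printf("%08x\n", expandRound(identity, 0x00070e15, 1, 2, 3, 4)) // example constant
}
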
@ -89,14 +88,34 @@
#define XWORD X8
#define YWORD X9

// SM4 round function, AVX2 version, handles 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 256-bit temp register
// - y: 256-bit temp register
// - t0: 256-bit register for data, holds the result
// - t1: 256-bit register for data
// - t2: 256-bit register for data
// - t3: 256-bit register for data
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
    VPBROADCASTD (index * 4)(AX)(CX*1), x; \
    VPXOR t1, x, x; \
    VPXOR t2, x, x; \
    VPXOR t3, x, x; \
    AVX2_SM4_TAO_L1(x, y, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK, XDWTMP0); \
    AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
    VPXOR x, t0, t0

// SM4 round function, AVX version, handles 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 128-bit temp register
// - y: 128-bit temp register
// - t0: 128-bit register for data, holds the result
// - t1: 128-bit register for data
// - t2: 128-bit register for data
// - t3: 128-bit register for data
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
    VPBROADCASTD (index * 4)(AX)(CX*1), x; \
    VPXOR t1, x, x; \
@ -174,10 +193,10 @@ non_avx2_start:
    XORL CX, CX

loop:
    SM4_ROUND(0, x, y, t0, t1, t2, t3)
    SM4_ROUND(1, x, y, t1, t2, t3, t0)
    SM4_ROUND(2, x, y, t2, t3, t0, t1)
    SM4_ROUND(3, x, y, t3, t0, t1, t2)
    SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)

    ADDL $16, CX
    CMPL CX, $4*32

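The loop above advances four rounds per pass by rotating the roles of t0..t3; a Go sketch of the same control structure with a placeholder round function:

package main

import "fmt"

// rounds32 sketches the loop structure above: the four macro calls rotate
// the roles of t0..t3, so one pass of the loop performs 4 rounds and the
// body runs 8 times (the assembly tracks progress as a round-key byte
// offset, ADDL $16, CX until CX == 4*32). roundFn stands in for
// t0 ^= L1(tau(t1^t2^t3^rk)).
func rounds32(roundFn func(rk, t0, t1, t2, t3 uint32) uint32, rk *[32]uint32, t0, t1, t2, t3 uint32) (uint32, uint32, uint32, uint32) {
    for i := 0; i < 32; i += 4 {
        t0 = roundFn(rk[i+0], t0, t1, t2, t3)
        t1 = roundFn(rk[i+1], t1, t2, t3, t0)
        t2 = roundFn(rk[i+2], t2, t3, t0, t1)
        t3 = roundFn(rk[i+3], t3, t0, t1, t2)
    }
    return t0, t1, t2, t3
}

func main() {
    dummy := func(rk, t0, t1, t2, t3 uint32) uint32 { return t0 ^ rk } // placeholder round
    var rk [32]uint32
    fmt.Println(rounds32(dummy, &rk, 1, 2, 3, 4))
}
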
@ -328,10 +347,10 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
    XORL CX, CX

loop:
    SM4_SINGLE_ROUND(0, x, y, t0, t1, t2, t3)
    SM4_SINGLE_ROUND(1, x, y, t1, t2, t3, t0)
    SM4_SINGLE_ROUND(2, x, y, t2, t3, t0, t1)
    SM4_SINGLE_ROUND(3, x, y, t3, t0, t1, t2)
    SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
    SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
    SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
    SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)

    ADDL $16, CX
    CMPL CX, $4*32

@ -37,15 +37,6 @@
    VEOR XTMP7.B16, y.B16, y.B16; \
    VEOR x.B16, y.B16, x.B16

#define SM4_ROUND(RK, x, y, t0, t1, t2, t3) \
    MOVW.P 4(RK), R19; \
    VMOV R19, x.S4; \
    VEOR t1.B16, x.B16, x.B16; \
    VEOR t2.B16, x.B16, x.B16; \
    VEOR t3.B16, x.B16, x.B16; \
    SM4_TAO_L1(x, y, XTMP6); \
    VEOR x.B16, t0.B16, t0.B16

#define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
    MOVW.P 4(R9), R19; \
    VMOV R19, x.S[0]; \
@ -226,10 +217,10 @@ TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
    EOR R0, R0

encryptBlocksLoop:
    SM4_ROUND(R8, x, y, t0, t1, t2, t3)
    SM4_ROUND(R8, x, y, t1, t2, t3, t0)
    SM4_ROUND(R8, x, y, t2, t3, t0, t1)
    SM4_ROUND(R8, x, y, t3, t0, t1, t2)
    SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0

@ -297,10 +288,10 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
    EOR R0, R0

encryptBlockLoop:
    SM4_ROUND(R8, x, y, t0, t1, t2, t3)
    SM4_ROUND(R8, x, y, t1, t2, t3, t0)
    SM4_ROUND(R8, x, y, t2, t3, t0, t1)
    SM4_ROUND(R8, x, y, t3, t0, t1, t2)
    SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0

@ -15,14 +15,6 @@

#include "aesni_amd64.h"

#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(RK)(IND*1), x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0

// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
    #define ctx BX

@ -25,50 +25,6 @@

#include "aesni_arm64.h"

#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
    MOVW.P 4(RK), R19; \
    VMOV R19, x.S4; \
    VEOR t1.B16, x.B16, x.B16; \
    VEOR t2.B16, x.B16, x.B16; \
    VEOR t3.B16, x.B16, x.B16; \
    SM4_TAO_L1(x, y, z); \
    VEOR x.B16, t0.B16, t0.B16

#define load_global_data_1() \
    LDP nibble_mask<>(SB), (R0, R1) \
    VMOV R0, NIBBLE_MASK.D[0] \
    VMOV R1, NIBBLE_MASK.D[1] \
    LDP m1_low<>(SB), (R0, R1) \
    VMOV R0, M1L.D[0] \
    VMOV R1, M1L.D[1] \
    LDP m1_high<>(SB), (R0, R1) \
    VMOV R0, M1H.D[0] \
    VMOV R1, M1H.D[1] \
    LDP m2_low<>(SB), (R0, R1) \
    VMOV R0, M2L.D[0] \
    VMOV R1, M2L.D[1] \
    LDP m2_high<>(SB), (R0, R1) \
    VMOV R0, M2H.D[0] \
    VMOV R1, M2H.D[1] \
    LDP fk_mask<>(SB), (R0, R1) \
    VMOV R0, FK_MASK.D[0] \
    VMOV R1, FK_MASK.D[1] \
    LDP inverse_shift_rows<>(SB), (R0, R1) \
    VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
    VMOV R1, INVERSE_SHIFT_ROWS.D[1]

#define load_global_data_2() \
    load_global_data_1() \
    LDP r08_mask<>(SB), (R0, R1) \
    VMOV R0, R08_MASK.D[0] \
    VMOV R1, R08_MASK.D[1] \
    LDP r16_mask<>(SB), (R0, R1) \
    VMOV R0, R16_MASK.D[0] \
    VMOV R1, R16_MASK.D[1] \
    LDP r24_mask<>(SB), (R0, R1) \
    VMOV R0, R24_MASK.D[0] \
    VMOV R1, R24_MASK.D[1]

// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
    #define ctx R1
@ -76,55 +32,55 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
    #define ptxLen R4
    #define rkSave R8

    load_global_data_2()
    LOAD_SM4_AESNI_CONSTS()

    MOVD xk+0(FP), rkSave
    MOVD dst+8(FP), ctx
    MOVD src+32(FP), ptx
    MOVD src_len+40(FP), ptxLen
    MOVD iv+56(FP), R5

    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    VLD1 (R5), [IV.B16]
    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    VLD1 (R5), [IV.B16]

loopSrc:
    CMP $16, ptxLen
    BLT done_sm4
    SUB $16, ptxLen
    CMP $16, ptxLen
    BLT done_sm4
    SUB $16, ptxLen

    VLD1.P (ptx), [t0.S4]
    VEOR IV.B16, t0.B16, t0.B16
    VREV32 t0.B16, t0.B16
    VMOV t0.S[1], t1.S[0]
    VMOV t0.S[2], t2.S[0]
    VMOV t0.S[3], t3.S[0]
    VLD1.P (ptx), [t0.S4]
    VEOR IV.B16, t0.B16, t0.B16
    VREV32 t0.B16, t0.B16
    VMOV t0.S[1], t1.S[0]
    VMOV t0.S[2], t2.S[0]
    VMOV t0.S[3], t3.S[0]

    EOR R2, R2
    MOVD rkSave, R0
    EOR R2, R2
    MOVD rkSave, R0

encryptBlockLoop:
    SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2)
    SM4_ROUND(R0, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R0, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R0, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R0, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R2
    CMP $128, R2
    BNE encryptBlockLoop
    ADD $16, R2
    CMP $128, R2
    BNE encryptBlockLoop

    VMOV t2.S[0], t3.S[1]
    VMOV t1.S[0], t3.S[2]
    VMOV t0.S[0], t3.S[3]
    VREV32 t3.B16, t3.B16
    VMOV t2.S[0], t3.S[1]
    VMOV t1.S[0], t3.S[2]
    VMOV t0.S[0], t3.S[3]
    VREV32 t3.B16, t3.B16

    VST1.P [t3.B16], (ctx)
    VMOV t3.B16, IV.B16
    VST1.P [t3.B16], (ctx)
    VMOV t3.B16, IV.B16

    B loopSrc
    B loopSrc

done_sm4:
    VST1 [IV.B16], (R5)
    RET
    VST1 [IV.B16], (R5)
    RET

    #undef ctx
    #undef ptx

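The chaining structure that encryptBlocksChain implements is ordinary CBC encryption; a Go sketch with the block cipher injected as a parameter (the real core is the 32-round SM4 loop above):

package main

import "fmt"

// cbcEncrypt sketches encryptBlocksChain: each 16-byte block is XORed with
// the running IV, passed through the block cipher, written out, and the
// ciphertext becomes the next IV; the final IV is written back, like
// VST1 [IV.B16], (R5) above. encryptBlock stands in for the SM4 core.
func cbcEncrypt(encryptBlock func(dst, src []byte), dst, src, iv []byte) {
    chain := make([]byte, 16)
    copy(chain, iv)
    for len(src) >= 16 {
        var x [16]byte
        for i := 0; i < 16; i++ {
            x[i] = src[i] ^ chain[i] // IV (or previous ciphertext) XOR plaintext
        }
        encryptBlock(dst[:16], x[:])
        copy(chain, dst[:16]) // ciphertext feeds the next block
        src, dst = src[16:], dst[16:]
    }
    copy(iv, chain)
}

func main() {
    xorCipher := func(dst, src []byte) { copy(dst, src) } // placeholder "cipher"
    src := make([]byte, 32)
    dst := make([]byte, 32)
    iv := make([]byte, 16)
    cbcEncrypt(xorCipher, dst, src, iv)
    fmt.Printf("%x\n", dst)
}
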
@ -155,23 +155,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    #undef plen
    #undef dlen

#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(RK)(IND*1), x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0

#define SM4_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
    PINSRD $0, (index * 4)(RK)(IND*1), x; \
    PSHUFD $0, x, x; \
    PXOR t1, x; \
    PXOR t2, x; \
    PXOR t3, x; \
    SM4_TAO_L1(x, y, z); \
    PXOR x, t0

#define SM4_4BLOCKS(RK, IND, x, y, z, t0, t1, t2, t3) \
    PSHUFB flip_mask<>(SB), t0; \
    PSHUFB flip_mask<>(SB), t1; \
@ -229,7 +212,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
    VPXOR t1, x, x; \
    VPXOR t2, x, x; \
    VPXOR t3, x, x; \
    AVX2_SM4_TAO_L1(x, y, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK, tmp); \
    AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
    VPXOR x, t0, t0

#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \

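The PSHUFB flip_mask steps in SM4_4BLOCKS (and VREV32 on arm64) exist because SM4 treats each 16-byte block as four big-endian 32-bit words; a Go sketch of the equivalent scalar load/store (names are illustrative):

package main

import (
    "encoding/binary"
    "fmt"
)

// loadBlock shows what the byte swaps achieve: the block is read as four
// big-endian 32-bit words before the rounds run.
func loadBlock(b []byte) (t0, t1, t2, t3 uint32) {
    return binary.BigEndian.Uint32(b[0:4]),
        binary.BigEndian.Uint32(b[4:8]),
        binary.BigEndian.Uint32(b[8:12]),
        binary.BigEndian.Uint32(b[12:16])
}

// storeBlock writes four words back out in big-endian byte order.
func storeBlock(dst []byte, t0, t1, t2, t3 uint32) {
    binary.BigEndian.PutUint32(dst[0:4], t0)
    binary.BigEndian.PutUint32(dst[4:8], t1)
    binary.BigEndian.PutUint32(dst[8:12], t2)
    binary.BigEndian.PutUint32(dst[12:16], t3)
}

func main() {
    src := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
    t0, t1, t2, t3 := loadBlock(src)
    dst := make([]byte, 16)
    storeBlock(dst, t3, t2, t1, t0) // SM4 also reverses the word order on output
    fmt.Printf("%x\n", dst)
}
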
sm4/gcm_arm64.s
@ -111,44 +111,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0

#include "aesni_arm64.h"

#define LOAD_SM4_AESNI_CONSTS() \
    LDP nibble_mask<>(SB), (R20, R21) \
    VMOV R20, NIBBLE_MASK.D[0] \
    VMOV R21, NIBBLE_MASK.D[1] \
    LDP m1_low<>(SB), (R20, R21) \
    VMOV R20, M1L.D[0] \
    VMOV R21, M1L.D[1] \
    LDP m1_high<>(SB), (R20, R21) \
    VMOV R20, M1H.D[0] \
    VMOV R21, M1H.D[1] \
    LDP m2_low<>(SB), (R20, R21) \
    VMOV R20, M2L.D[0] \
    VMOV R21, M2L.D[1] \
    LDP m2_high<>(SB), (R20, R21) \
    VMOV R20, M2H.D[0] \
    VMOV R21, M2H.D[1] \
    LDP inverse_shift_rows<>(SB), (R20, R21) \
    VMOV R20, INVERSE_SHIFT_ROWS.D[0] \
    VMOV R21, INVERSE_SHIFT_ROWS.D[1] \
    LDP r08_mask<>(SB), (R20, R21) \
    VMOV R20, R08_MASK.D[0] \
    VMOV R21, R08_MASK.D[1] \
    LDP r16_mask<>(SB), (R20, R21) \
    VMOV R20, R16_MASK.D[0] \
    VMOV R21, R16_MASK.D[1] \
    LDP r24_mask<>(SB), (R20, R21) \
    VMOV R20, R24_MASK.D[0] \
    VMOV R21, R24_MASK.D[1]

#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
    MOVW.P 4(RK), R19; \
    VMOV R19, x.S4; \
    VEOR t1.B16, x.B16, x.B16; \
    VEOR t2.B16, x.B16, x.B16; \
    VEOR t3.B16, x.B16, x.B16; \
    SM4_TAO_L1(x, y, z); \
    VEOR x.B16, t0.B16, t0.B16

// func gcmSm4Init(productTable *[256]byte, rk []uint32, inst int)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
    #define pTbl R0
@ -178,10 +140,10 @@ TEXT ·gcmSm4Init(SB),NOSPLIT,$0
    EOR R3, R3

sm4InitEncLoop:
    SM4_ROUND(RK, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(RK, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(RK, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(RK, K0, K1, K2, B3, B0, B1, B2)
    SM4_ROUND(RK, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(RK, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(RK, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(RK, R19, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R3
    CMP $8, R3
@ -491,10 +453,10 @@ encOctetsLoop:
    MOVD rkSave, rk

encOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
@ -509,10 +471,10 @@ encOctetsEnc4Blocks1:
    MOVD rkSave, rk

encOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
    SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
@ -586,10 +548,10 @@ encNibblesLoop:
    MOVD rkSave, rk

encNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
@ -631,10 +593,10 @@ encStartSingles:
    MOVD rkSave, rk

encSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
@ -783,10 +745,10 @@ decOctetsLoop:
    MOVD rkSave, rk

decOctetsEnc4Blocks1:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
@ -802,10 +764,10 @@ decOctetsEnc4Blocks1:
    MOVD rkSave, rk

decOctetsEnc4Blocks2:
    SM4_ROUND(rk, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, K0, K1, K2, B7, B4, B5, B6)
    SM4_ROUND(rk, R19, K0, K1, K2, B4, B5, B6, B7)
    SM4_ROUND(rk, R19, K0, K1, K2, B5, B6, B7, B4)
    SM4_ROUND(rk, R19, K0, K1, K2, B6, B7, B4, B5)
    SM4_ROUND(rk, R19, K0, K1, K2, B7, B4, B5, B6)

    ADD $1, R13
    CMP $16, R13
@ -880,10 +842,10 @@ decNibblesLoop:
    MOVD rkSave, rk

decNibblesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13
@ -928,10 +890,10 @@ decStartSingles:
    MOVD rkSave, rk

decSinglesEnc4Blocks:
    SM4_ROUND(rk, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, K0, K1, K2, B3, B0, B1, B2)
    SM4_ROUND(rk, R19, K0, K1, K2, B0, B1, B2, B3)
    SM4_ROUND(rk, R19, K0, K1, K2, B1, B2, B3, B0)
    SM4_ROUND(rk, R19, K0, K1, K2, B2, B3, B0, B1)
    SM4_ROUND(rk, R19, K0, K1, K2, B3, B0, B1, B2)

    ADD $1, R13
    CMP $8, R13