diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s
index 9e26b04..b14d356 100644
--- a/sm4/asm_arm64.s
+++ b/sm4/asm_arm64.s
@@ -66,179 +66,179 @@ DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
 GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 
 #define SM4_SBOX(x, y) \
-	; \ //############################# inner affine ############################//
-	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
-	VTBL XTMP7.B16, [M1L.B16], y.B16; \
-	VUSHR $4, x.D2, x.D2; \
-	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
-	VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \
-	VEOR y.B16, XTMP7.B16, x.B16; \
-	VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
-	AESE ZERO.B16, x.B16; \
-	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
-	VTBL XTMP7.B16, [M2L.B16], y.B16; \
-	VUSHR $4, x.D2, x.D2; \
-	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
-	VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \
-	VEOR y.B16, XTMP7.B16, x.B16
+	; \ //############################# inner affine ############################//
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M1L.B16], y.B16; \
+	VUSHR $4, x.D2, x.D2; \
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \
+	VEOR y.B16, XTMP7.B16, x.B16; \
+	VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
+	AESE ZERO.B16, x.B16; \
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M2L.B16], y.B16; \
+	VUSHR $4, x.D2, x.D2; \
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \
+	VEOR y.B16, XTMP7.B16, x.B16
 
 #define SM4_TAO_L1(x, y) \
-	SM4_SBOX(x, y); \
-	; \ //#################### 4 parallel L1 linear transforms ##################//
-	VTBL R08_MASK.B16, [x.B16], y.B16; \
-	VEOR y.B16, x.B16, y.B16; \
-	VTBL R16_MASK.B16, [x.B16], XTMP7.B16; \
-	VEOR XTMP7.B16, y.B16, y.B16; \
-	VSHL $2, y.S4, XTMP7.S4; \
-	VUSHR $30, y.S4, y.S4; \
-	VORR y.B16, XTMP7.B16, y.B16; \
-	VTBL R24_MASK.B16, [x.B16], XTMP7.B16; \
-	VEOR XTMP7.B16, x.B16, x.B16; \
-	VEOR y.B16, x.B16, x.B16
+	SM4_SBOX(x, y); \
+	; \ //#################### 4 parallel L1 linear transforms ##################//
+	VTBL R08_MASK.B16, [x.B16], y.B16; \
+	VEOR y.B16, x.B16, y.B16; \
+	VTBL R16_MASK.B16, [x.B16], XTMP7.B16; \
+	VEOR XTMP7.B16, y.B16, y.B16; \
+	VSHL $2, y.S4, XTMP7.S4; \
+	VUSHR $30, y.S4, y.S4; \
+	VORR y.B16, XTMP7.B16, y.B16; \
+	VTBL R24_MASK.B16, [x.B16], XTMP7.B16; \
+	VEOR XTMP7.B16, x.B16, x.B16; \
+	VEOR y.B16, x.B16, x.B16
 
 #define SM4_TAO_L2(x, y) \
-	SM4_SBOX(x, y); \
-	; \ //#################### 4 parallel L2 linear transforms ##################//
-	VSHL $13, x.S4, XTMP6.S4; \
-	VUSHR $19, x.S4, y.S4; \
-	VORR XTMP6.B16, y.B16, y.B16; \
-	VSHL $23, x.S4, XTMP6.S4; \
-	VUSHR $9, x.S4, XTMP7.S4; \
-	VORR XTMP6.B16, XTMP7.B16, XTMP7.B16; \
-	VEOR XTMP7.B16, y.B16, y.B16; \
-	VEOR x.B16, y.B16, x.B16
+	SM4_SBOX(x, y); \
+	; \ //#################### 4 parallel L2 linear transforms ##################//
+	VSHL $13, x.S4, XTMP6.S4; \
+	VUSHR $19, x.S4, y.S4; \
+	VORR XTMP6.B16, y.B16, y.B16; \
+	VSHL $23, x.S4, XTMP6.S4; \
+	VUSHR $9, x.S4, XTMP7.S4; \
+	VORR XTMP6.B16, XTMP7.B16, XTMP7.B16; \
+	VEOR XTMP7.B16, y.B16, y.B16; \
+	VEOR x.B16, y.B16, x.B16
 
 #define SM4_ROUND(RK, x, y, t0, t1, t2, t3) \
-	MOVW.P 4(RK), R19; \
-	VMOV R19, x.S4; \
-	VEOR t1.B16, x.B16, x.B16; \
-	VEOR t2.B16, x.B16, x.B16; \
-	VEOR t3.B16, x.B16, x.B16; \
-	SM4_TAO_L1(x, y); \
-	VEOR x.B16, t0.B16, t0.B16
+	MOVW.P 4(RK), R19; \
+	VMOV R19, x.S4; \
+	VEOR t1.B16, x.B16, x.B16; \
+	VEOR t2.B16, x.B16, x.B16; \
+	VEOR t3.B16, x.B16, x.B16; \
+	SM4_TAO_L1(x, y); \
+	VEOR x.B16, t0.B16, t0.B16
 
 #define SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3) \
-	MOVW.P 4(R9), R19; \
-	VMOV R19, x.S[0]; \
-	VEOR t1.B16, x.B16, x.B16; \
-	VEOR t2.B16, x.B16, x.B16; \
-	VEOR t3.B16, x.B16, x.B16; \
-	SM4_TAO_L2(x, y); \
-	VEOR x.B16, t0.B16, t0.B16; \
-	VMOV t0.S[0], R2; \
-	MOVW.P R2, 4(R10); \
-	MOVW.P R2, -4(R11)
+	MOVW.P 4(R9), R19; \
+	VMOV R19, x.S[0]; \
+	VEOR t1.B16, x.B16, x.B16; \
+	VEOR t2.B16, x.B16, x.B16; \
+	VEOR t3.B16, x.B16, x.B16; \
+	SM4_TAO_L2(x, y); \
+	VEOR x.B16, t0.B16, t0.B16; \
+	VMOV t0.S[0], R2; \
+	MOVW.P R2, 4(R10); \
+	MOVW.P R2, -4(R11)
 
 #define load_global_data_1() \
-	LDP nibble_mask<>(SB), (R0, R1) \
-	VMOV R0, NIBBLE_MASK.D[0] \
-	VMOV R1, NIBBLE_MASK.D[1] \
-	LDP m1_low<>(SB), (R0, R1) \
-	VMOV R0, M1L.D[0] \
-	VMOV R1, M1L.D[1] \
-	LDP m1_high<>(SB), (R0, R1) \
-	VMOV R0, M1H.D[0] \
-	VMOV R1, M1H.D[1] \
-	LDP m2_low<>(SB), (R0, R1) \
-	VMOV R0, M2L.D[0] \
-	VMOV R1, M2L.D[1] \
-	LDP m2_high<>(SB), (R0, R1) \
-	VMOV R0, M2H.D[0] \
-	VMOV R1, M2H.D[1] \
-	LDP fk_mask<>(SB), (R0, R1) \
-	VMOV R0, FK_MASK.D[0] \
-	VMOV R1, FK_MASK.D[1] \
-	LDP inverse_shift_rows<>(SB), (R0, R1) \
-	VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
-	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
+	LDP nibble_mask<>(SB), (R0, R1) \
+	VMOV R0, NIBBLE_MASK.D[0] \
+	VMOV R1, NIBBLE_MASK.D[1] \
+	LDP m1_low<>(SB), (R0, R1) \
+	VMOV R0, M1L.D[0] \
+	VMOV R1, M1L.D[1] \
+	LDP m1_high<>(SB), (R0, R1) \
+	VMOV R0, M1H.D[0] \
+	VMOV R1, M1H.D[1] \
+	LDP m2_low<>(SB), (R0, R1) \
+	VMOV R0, M2L.D[0] \
+	VMOV R1, M2L.D[1] \
+	LDP m2_high<>(SB), (R0, R1) \
+	VMOV R0, M2H.D[0] \
+	VMOV R1, M2H.D[1] \
+	LDP fk_mask<>(SB), (R0, R1) \
+	VMOV R0, FK_MASK.D[0] \
+	VMOV R1, FK_MASK.D[1] \
+	LDP inverse_shift_rows<>(SB), (R0, R1) \
+	VMOV R0, INVERSE_SHIFT_ROWS.D[0] \
+	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
 
 #define load_global_data_2() \
-	load_global_data_1() \
-	LDP r08_mask<>(SB), (R0, R1) \
-	VMOV R0, R08_MASK.D[0] \
-	VMOV R1, R08_MASK.D[1] \
-	LDP r16_mask<>(SB), (R0, R1) \
-	VMOV R0, R16_MASK.D[0] \
-	VMOV R1, R16_MASK.D[1] \
-	LDP r24_mask<>(SB), (R0, R1) \
-	VMOV R0, R24_MASK.D[0] \
-	VMOV R1, R24_MASK.D[1]
+	load_global_data_1() \
+	LDP r08_mask<>(SB), (R0, R1) \
+	VMOV R0, R08_MASK.D[0] \
+	VMOV R1, R08_MASK.D[1] \
+	LDP r16_mask<>(SB), (R0, R1) \
+	VMOV R0, R16_MASK.D[0] \
+	VMOV R1, R16_MASK.D[1] \
+	LDP r24_mask<>(SB), (R0, R1) \
+	VMOV R0, R24_MASK.D[0] \
+	VMOV R1, R24_MASK.D[1]
 
 #define SM4EKEY_EXPORT_KEYS() \
-	VMOV V9.S[3], V10.S[0] \
-	VMOV V9.S[2], V10.S[1] \
-	VMOV V9.S[1], V10.S[2] \
-	VMOV V9.S[0], V10.S[3] \
-	VMOV V8.S[3], V11.S[0] \
-	VMOV V8.S[2], V11.S[1] \
-	VMOV V8.S[1], V11.S[2] \
-	VMOV V8.S[0], V11.S[3] \
-	VST1.P [V8.S4, V9.S4], 32(R10) \
-	VST1 [V10.S4, V11.S4], (R11) \
-	SUB $32, R11, R11
+	VMOV V9.S[3], V10.S[0] \
+	VMOV V9.S[2], V10.S[1] \
+	VMOV V9.S[1], V10.S[2] \
+	VMOV V9.S[0], V10.S[3] \
+	VMOV V8.S[3], V11.S[0] \
+	VMOV V8.S[2], V11.S[1] \
+	VMOV V8.S[1], V11.S[2] \
+	VMOV V8.S[0], V11.S[3] \
+	VST1.P [V8.S4, V9.S4], 32(R10) \
+	VST1 [V10.S4, V11.S4], (R11) \
+	SUB $32, R11, R11
 
 #define SM4E_ROUND() \
-	VLD1.P 16(R10), [V8.B16] \
-	VREV32 V8.B16, V8.B16 \
-	WORD $0x0884c0ce \
-	WORD $0x2884c0ce \
-	WORD $0x4884c0ce \
-	WORD $0x6884c0ce \
-	WORD $0x8884c0ce \
-	WORD $0xa884c0ce \
-	WORD $0xc884c0ce \
-	WORD $0xe884c0ce \
-	VREV32 V8.B16, V8.B16 \
-	VST1.P [V8.B16], 16(R9)
+	VLD1.P 16(R10), [V8.B16] \
+	VREV32 V8.B16, V8.B16 \
+	WORD $0x0884c0ce \
+	WORD $0x2884c0ce \
+	WORD $0x4884c0ce \
+	WORD $0x6884c0ce \
+	WORD $0x8884c0ce \
+	WORD $0xa884c0ce \
+	WORD $0xc884c0ce \
+	WORD $0xe884c0ce \
+	VREV32 V8.B16, V8.B16 \
+	VST1.P [V8.B16], 16(R9)
 
 // func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
 TEXT ·expandKeyAsm(SB),NOSPLIT,$0
-	MOVD key+0(FP), R8
-	MOVD ck+8(FP), R9
-	MOVD enc+16(FP), R10
-	MOVD dec+24(FP), R11
-	MOVD inst+32(FP), R12
+	MOVD key+0(FP), R8
+	MOVD ck+8(FP), R9
+	MOVD enc+16(FP), R10
+	MOVD dec+24(FP), R11
+	MOVD inst+32(FP), R12
 
-	CMP $1, R12
-	BEQ sm4ekey
+	CMP $1, R12
+	BEQ sm4ekey
 
-	load_global_data_1()
+	load_global_data_1()
 
-	VLD1 (R8), [t0.B16]
-	VREV32 t0.B16, t0.B16
-	VEOR t0.B16, FK_MASK.B16, t0.B16
-	VMOV t0.S[1], t1.S[0]
-	VMOV t0.S[2], t2.S[0]
-	VMOV t0.S[3], t3.S[0]
+	VLD1 (R8), [t0.B16]
+	VREV32 t0.B16, t0.B16
+	VEOR t0.B16, FK_MASK.B16, t0.B16
+	VMOV t0.S[1], t1.S[0]
+	VMOV t0.S[2], t2.S[0]
+	VMOV t0.S[3], t3.S[0]
 
-	EOR R0, R0
-	ADD $124, R11
-	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+	EOR R0, R0
+	ADD $124, R11
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
 
 ksLoop:
-	SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
-	SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
-	SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
-	SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)
+	SM4_EXPANDKEY_ROUND(x, y, t0, t1, t2, t3)
+	SM4_EXPANDKEY_ROUND(x, y, t1, t2, t3, t0)
+	SM4_EXPANDKEY_ROUND(x, y, t2, t3, t0, t1)
+	SM4_EXPANDKEY_ROUND(x, y, t3, t0, t1, t2)
 
-	ADD $16, R0
-	CMP $128, R0
-	BNE ksLoop
-	RET
+	ADD $16, R0
+	CMP $128, R0
+	BNE ksLoop
+	RET
 
 sm4ekey:
-	LDP fk_mask<>(SB), (R0, R1)
-	VMOV R0, FK_MASK.D[0]
-	VMOV R1, FK_MASK.D[1]
+	LDP fk_mask<>(SB), (R0, R1)
+	VMOV R0, FK_MASK.D[0]
+	VMOV R1, FK_MASK.D[1]
 	VLD1 (R8), [V9.B16]
 	VREV32 V9.B16, V9.B16
 	VEOR FK_MASK.B16, V9.B16, V9.B16
-	ADD $96, R11
+	ADD $96, R11
 
 	VLD1.P 64(R9), [V0.S4, V1.S4, V2.S4, V3.S4]
 	WORD $0x28c960ce //SM4EKEY V8.4S, V9.4S, V0.4S
 	WORD $0x09c961ce //SM4EKEY V9.4S, V8.4S, V1.4S
-	SM4EKEY_EXPORT_KEYS()
+	SM4EKEY_EXPORT_KEYS()
 
 	WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
 	WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
@@ -252,141 +252,141 @@ sm4ekey:
 	WORD $0x28c962ce //SM4EKEY V8.4S, V9.4S, V2.4S
 	WORD $0x09c963ce //SM4EKEY V9.4S, V8.4S, V3.4S
 	SM4EKEY_EXPORT_KEYS()
-	RET
+	RET
 
 // func encryptBlocksAsm(xk *uint32, dst, src []byte, inst int)
 TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
-	MOVD xk+0(FP), R8
-	MOVD dst+8(FP), R9
-	MOVD src+32(FP), R10
-	MOVD src_len+40(FP), R12
-	MOVD inst+56(FP), R11
+	MOVD xk+0(FP), R8
+	MOVD dst+8(FP), R9
+	MOVD src+32(FP), R10
+	MOVD src_len+40(FP), R12
+	MOVD inst+56(FP), R11
 
-	CMP $1, R11
-	BEQ sm4niblocks
+	CMP $1, R11
+	BEQ sm4niblocks
 
-	VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
-	VMOV V5.S[0], t0.S[0]
-	VMOV V5.S[1], t1.S[0]
-	VMOV V5.S[2], t2.S[0]
-	VMOV V5.S[3], t3.S[0]
+	VLD1 (R10), [V5.S4, V6.S4, V7.S4, V8.S4]
+	VMOV V5.S[0], t0.S[0]
+	VMOV V5.S[1], t1.S[0]
+	VMOV V5.S[2], t2.S[0]
+	VMOV V5.S[3], t3.S[0]
 
-	VMOV V6.S[0], t0.S[1]
-	VMOV V6.S[1], t1.S[1]
-	VMOV V6.S[2], t2.S[1]
-	VMOV V6.S[3], t3.S[1]
+	VMOV V6.S[0], t0.S[1]
+	VMOV V6.S[1], t1.S[1]
+	VMOV V6.S[2], t2.S[1]
+	VMOV V6.S[3], t3.S[1]
 
-	VMOV V7.S[0], t0.S[2]
-	VMOV V7.S[1], t1.S[2]
-	VMOV V7.S[2], t2.S[2]
-	VMOV V7.S[3], t3.S[2]
+	VMOV V7.S[0], t0.S[2]
+	VMOV V7.S[1], t1.S[2]
+	VMOV V7.S[2], t2.S[2]
+	VMOV V7.S[3], t3.S[2]
 
-	VMOV V8.S[0], t0.S[3]
-	VMOV V8.S[1], t1.S[3]
-	VMOV V8.S[2], t2.S[3]
-	VMOV V8.S[3], t3.S[3]
+	VMOV V8.S[0], t0.S[3]
+	VMOV V8.S[1], t1.S[3]
+	VMOV V8.S[2], t2.S[3]
+	VMOV V8.S[3], t3.S[3]
 
-	load_global_data_2()
+	load_global_data_2()
 
-	VREV32 t0.B16, t0.B16
-	VREV32 t1.B16, t1.B16
-	VREV32 t2.B16, t2.B16
-	VREV32 t3.B16, t3.B16
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
 
-	VEOR ZERO.B16, ZERO.B16, ZERO.B16
-	EOR R0, R0
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+	EOR R0, R0
 
 encryptBlocksLoop:
-	SM4_ROUND(R8, x, y, t0, t1, t2, t3)
-	SM4_ROUND(R8, x, y, t1, t2, t3, t0)
-	SM4_ROUND(R8, x, y, t2, t3, t0, t1)
-	SM4_ROUND(R8, x, y, t3, t0, t1, t2)
+	SM4_ROUND(R8, x, y, t0, t1, t2, t3)
+	SM4_ROUND(R8, x, y, t1, t2, t3, t0)
+	SM4_ROUND(R8, x, y, t2, t3, t0, t1)
+	SM4_ROUND(R8, x, y, t3, t0, t1, t2)
 
-	ADD $16, R0
-	CMP $128, R0
-	BNE encryptBlocksLoop
+	ADD $16, R0
+	CMP $128, R0
+	BNE encryptBlocksLoop
 
-	VREV32 t0.B16, t0.B16
-	VREV32 t1.B16, t1.B16
-	VREV32 t2.B16, t2.B16
-	VREV32 t3.B16, t3.B16
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
 
-	VMOV t3.S[0], V8.S[0]
-	VMOV t2.S[0], V8.S[1]
-	VMOV t1.S[0], V8.S[2]
-	VMOV t0.S[0], V8.S[3]
-	VST1.P [V8.B16], 16(R9)
+	VMOV t3.S[0], V8.S[0]
+	VMOV t2.S[0], V8.S[1]
+	VMOV t1.S[0], V8.S[2]
+	VMOV t0.S[0], V8.S[3]
+	VST1.P [V8.B16], 16(R9)
 
-	VMOV t3.S[1], V8.S[0]
-	VMOV t2.S[1], V8.S[1]
-	VMOV t1.S[1], V8.S[2]
-	VMOV t0.S[1], V8.S[3]
-	VST1.P [V8.B16], 16(R9)
+	VMOV t3.S[1], V8.S[0]
+	VMOV t2.S[1], V8.S[1]
+	VMOV t1.S[1], V8.S[2]
+	VMOV t0.S[1], V8.S[3]
+	VST1.P [V8.B16], 16(R9)
 
-	VMOV t3.S[2], V8.S[0]
-	VMOV t2.S[2], V8.S[1]
-	VMOV t1.S[2], V8.S[2]
-	VMOV t0.S[2], V8.S[3]
-	VST1.P [V8.B16], 16(R9)
+	VMOV t3.S[2], V8.S[0]
+	VMOV t2.S[2], V8.S[1]
+	VMOV t1.S[2], V8.S[2]
+	VMOV t0.S[2], V8.S[3]
+	VST1.P [V8.B16], 16(R9)
 
-	VMOV t3.S[3], V8.S[0]
-	VMOV t2.S[3], V8.S[1]
-	VMOV t1.S[3], V8.S[2]
-	VMOV t0.S[3], V8.S[3]
-	VST1 [V8.B16], (R9)
-	RET
+	VMOV t3.S[3], V8.S[0]
+	VMOV t2.S[3], V8.S[1]
+	VMOV t1.S[3], V8.S[2]
+	VMOV t0.S[3], V8.S[3]
+	VST1 [V8.B16], (R9)
+	RET
 
 sm4niblocks:
-	VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
-	VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
+	VLD1.P 64(R8), [V0.S4, V1.S4, V2.S4, V3.S4]
+	VLD1.P 64(R8), [V4.S4, V5.S4, V6.S4, V7.S4]
 
sm4niblockloop:
-	SM4E_ROUND()
+	SM4E_ROUND()
 	SUB $16, R12, R12 // message length - 16bytes, then compare with 16bytes
 	CBNZ R12, sm4niblockloop
-	RET
+	RET
 
 // func encryptBlockAsm(xk *uint32, dst, src *byte, inst int)
 TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
-	MOVD xk+0(FP), R8
-	MOVD dst+8(FP), R9
-	MOVD src+16(FP), R10
-	MOVD inst+24(FP), R11
+	MOVD xk+0(FP), R8
+	MOVD dst+8(FP), R9
+	MOVD src+16(FP), R10
+	MOVD inst+24(FP), R11
 
-	CMP $1, R11
-	BEQ sm4niblock
+	CMP $1, R11
+	BEQ sm4niblock
 
-	VLD1 (R10), [t0.S4]
-	VREV32 t0.B16, t0.B16
-	VMOV t0.S[1], t1.S[0]
-	VMOV t0.S[2], t2.S[0]
-	VMOV t0.S[3], t3.S[0]
+	VLD1 (R10), [t0.S4]
+	VREV32 t0.B16, t0.B16
+	VMOV t0.S[1], t1.S[0]
+	VMOV t0.S[2], t2.S[0]
+	VMOV t0.S[3], t3.S[0]
 
-	load_global_data_2()
+	load_global_data_2()
 
-	VEOR ZERO.B16, ZERO.B16, ZERO.B16
-	EOR R0, R0
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+	EOR R0, R0
 
 encryptBlockLoop:
-	SM4_ROUND(R8, x, y, t0, t1, t2, t3)
-	SM4_ROUND(R8, x, y, t1, t2, t3, t0)
-	SM4_ROUND(R8, x, y, t2, t3, t0, t1)
-	SM4_ROUND(R8, x, y, t3, t0, t1, t2)
+	SM4_ROUND(R8, x, y, t0, t1, t2, t3)
+	SM4_ROUND(R8, x, y, t1, t2, t3, t0)
+	SM4_ROUND(R8, x, y, t2, t3, t0, t1)
+	SM4_ROUND(R8, x, y, t3, t0, t1, t2)
 
-	ADD $16, R0
-	CMP $128, R0
-	BNE encryptBlockLoop
+	ADD $16, R0
+	CMP $128, R0
+	BNE encryptBlockLoop
 
-	VREV32 t0.B16, t0.B16
-	VREV32 t1.B16, t1.B16
-	VREV32 t2.B16, t2.B16
-	VREV32 t3.B16, t3.B16
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
 
-	VMOV t3.S[0], V8.S[0]
-	VMOV t2.S[0], V8.S[1]
-	VMOV t1.S[0], V8.S[2]
-	VMOV t0.S[0], V8.S[3]
-	VST1 [V8.B16], (R9)
-	RET
+	VMOV t3.S[0], V8.S[0]
+	VMOV t2.S[0], V8.S[1]
+	VMOV t1.S[0], V8.S[2]
+	VMOV t0.S[0], V8.S[3]
+	VST1 [V8.B16], (R9)
+	RET
 
 sm4niblock:
 	VLD1 (R10), [V8.B16]
@@ -403,4 +403,4 @@ sm4niblock:
 	WORD $0x6884c0ce //SM4E V8.4S, V3.4S
 	VREV32 V8.B16, V8.B16
 	VST1 [V8.B16], (R9)
-	RET
+	RET
diff --git a/sm4/gcm_sm4ni_arm64.s b/sm4/gcm_sm4ni_arm64.s
index dbde380..19eae24 100644
--- a/sm4/gcm_sm4ni_arm64.s
+++ b/sm4/gcm_sm4ni_arm64.s
@@ -60,73 +60,73 @@ VEOR T3.B16, ACCM.B16, ACCM.B16
 
 #define sm4eEnc1block() \
-	WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S
-	WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S
-	WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S
-	WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S
-	WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S
-	WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S
-	WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S
-	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
+	WORD $0x6086c0ce \ //SM4E V0.4S, V19.4S
+	WORD $0x8086c0ce \ //SM4E V0.4S, V20.4S
+	WORD $0xa086c0ce \ //SM4E V0.4S, V21.4S
+	WORD $0xc086c0ce \ //SM4E V0.4S, V22.4S
+	WORD $0xe086c0ce \ //SM4E V0.4S, V23.4S
+	WORD $0x0087c0ce \ //SM4E V0.4S, V24.4S
+	WORD $0x2087c0ce \ //SM4E V0.4S, V25.4S
+	WORD $0x4087c0ce //SM4E V0.4S, V26.4S
 
 #define sm4eEnc8blocks() \
-	sm4eEnc1block() \
-	WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S
-	WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S
-	WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S
-	WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S
-	WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S
-	WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S
-	WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S
-	WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S
-	WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S
-	WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S
-	WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S
-	WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S
-	WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S
-	WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S
-	WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S
-	WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S
-	WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S
-	WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S
-	WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S
-	WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S
-	WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S
-	WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S
-	WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S
-	WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S
-	WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S
-	WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S
-	WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S
-	WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S
-	WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S
-	WORD $0x0487c0ce \ //SM4E V4.4S, V24.4S
-	WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S
-	WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S
-	WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S
-	WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S
-	WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S
-	WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S
-	WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S
-	WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S
-	WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S
-	WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S
-	WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S
-	WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S
-	WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S
-	WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S
-	WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S
-	WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S
-	WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S
-	WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S
-	WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S
-	WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S
-	WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S
-	WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S
-	WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S
-	WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S
-	WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S
-	WORD $0x4787c0ce //SM4E V7.4S, V26.4S
+	sm4eEnc1block() \
+	WORD $0x6186c0ce \ //SM4E V1.4S, V19.4S
+	WORD $0x8186c0ce \ //SM4E V1.4S, V20.4S
+	WORD $0xa186c0ce \ //SM4E V1.4S, V21.4S
+	WORD $0xc186c0ce \ //SM4E V1.4S, V22.4S
+	WORD $0xe186c0ce \ //SM4E V1.4S, V23.4S
+	WORD $0x0187c0ce \ //SM4E V1.4S, V24.4S
+	WORD $0x2187c0ce \ //SM4E V1.4S, V25.4S
+	WORD $0x4187c0ce \ //SM4E V1.4S, V26.4S
+	WORD $0x6286c0ce \ //SM4E V2.4S, V19.4S
+	WORD $0x8286c0ce \ //SM4E V2.4S, V20.4S
+	WORD $0xa286c0ce \ //SM4E V2.4S, V21.4S
+	WORD $0xc286c0ce \ //SM4E V2.4S, V22.4S
+	WORD $0xe286c0ce \ //SM4E V2.4S, V23.4S
+	WORD $0x0287c0ce \ //SM4E V2.4S, V24.4S
+	WORD $0x2287c0ce \ //SM4E V2.4S, V25.4S
+	WORD $0x4287c0ce \ //SM4E V2.4S, V26.4S
+	WORD $0x6386c0ce \ //SM4E V3.4S, V19.4S
+	WORD $0x8386c0ce \ //SM4E V3.4S, V20.4S
+	WORD $0xa386c0ce \ //SM4E V3.4S, V21.4S
+	WORD $0xc386c0ce \ //SM4E V3.4S, V22.4S
+	WORD $0xe386c0ce \ //SM4E V3.4S, V23.4S
+	WORD $0x0387c0ce \ //SM4E V3.4S, V24.4S
+	WORD $0x2387c0ce \ //SM4E V3.4S, V25.4S
+	WORD $0x4387c0ce \ //SM4E V3.4S, V26.4S
+	WORD $0x6486c0ce \ //SM4E V4.4S, V19.4S
+	WORD $0x8486c0ce \ //SM4E V4.4S, V20.4S
+	WORD $0xa486c0ce \ //SM4E V4.4S, V21.4S
+	WORD $0xc486c0ce \ //SM4E V4.4S, V22.4S
+	WORD $0xe486c0ce \ //SM4E V4.4S, V23.4S
+	WORD $0x0487c0ce \ //SM4E V4.4S, V24.4S
+	WORD $0x2487c0ce \ //SM4E V4.4S, V25.4S
+	WORD $0x4487c0ce \ //SM4E V4.4S, V26.4S
+	WORD $0x6586c0ce \ //SM4E V5.4S, V19.4S
+	WORD $0x8586c0ce \ //SM4E V5.4S, V20.4S
+	WORD $0xa586c0ce \ //SM4E V5.4S, V21.4S
+	WORD $0xc586c0ce \ //SM4E V5.4S, V22.4S
+	WORD $0xe586c0ce \ //SM4E V5.4S, V23.4S
+	WORD $0x0587c0ce \ //SM4E V5.4S, V24.4S
+	WORD $0x2587c0ce \ //SM4E V5.4S, V25.4S
+	WORD $0x4587c0ce \ //SM4E V5.4S, V26.4S
+	WORD $0x6686c0ce \ //SM4E V6.4S, V19.4S
+	WORD $0x8686c0ce \ //SM4E V6.4S, V20.4S
+	WORD $0xa686c0ce \ //SM4E V6.4S, V21.4S
+	WORD $0xc686c0ce \ //SM4E V6.4S, V22.4S
+	WORD $0xe686c0ce \ //SM4E V6.4S, V23.4S
+	WORD $0x0687c0ce \ //SM4E V6.4S, V24.4S
+	WORD $0x2687c0ce \ //SM4E V6.4S, V25.4S
+	WORD $0x4687c0ce \ //SM4E V6.4S, V26.4S
+	WORD $0x6786c0ce \ //SM4E V7.4S, V19.4S
+	WORD $0x8786c0ce \ //SM4E V7.4S, V20.4S
+	WORD $0xa786c0ce \ //SM4E V7.4S, V21.4S
+	WORD $0xc786c0ce \ //SM4E V7.4S, V22.4S
+	WORD $0xe786c0ce \ //SM4E V7.4S, V23.4S
+	WORD $0x0787c0ce \ //SM4E V7.4S, V24.4S
+	WORD $0x2787c0ce \ //SM4E V7.4S, V25.4S
+	WORD $0x4787c0ce //SM4E V7.4S, V26.4S
 
 // func gcmSm4niEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
 TEXT ·gcmSm4niEnc(SB),NOSPLIT,$0
@@ -193,15 +193,15 @@ octetsLoop:
 	VADD B6.S4, INC.S4, B7.S4
 	VADD B7.S4, INC.S4, CTR.S4
 
-	sm4eEnc8blocks()
-	VREV32 B0.B16, B0.B16
-	VREV32 B1.B16, B1.B16
-	VREV32 B2.B16, B2.B16
-	VREV32 B3.B16, B3.B16
-	VREV32 B4.B16, B4.B16
-	VREV32 B5.B16, B5.B16
-	VREV32 B6.B16, B6.B16
-	VREV32 B7.B16, B7.B16
+	sm4eEnc8blocks()
+	VREV32 B0.B16, B0.B16
+	VREV32 B1.B16, B1.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+	VREV32 B4.B16, B4.B16
+	VREV32 B5.B16, B5.B16
+	VREV32 B6.B16, B6.B16
+	VREV32 B7.B16, B7.B16
 
 	// XOR plaintext and store ciphertext
 	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
@@ -254,14 +254,14 @@ singlesLoop:
 	BLT tail
 	SUB $16, srcPtrLen
-	VMOV CTR.B16, B0.B16
+	VMOV CTR.B16, B0.B16
 	VADD CTR.S4, INC.S4, CTR.S4
-	sm4eEnc1block()
-	VREV32 B0.B16, B0.B16
+	sm4eEnc1block()
+	VREV32 B0.B16, B0.B16
 
 singlesLast:
-	VLD1.P 16(srcPtr), [T0.B16]
-	VEOR T0.B16, B0.B16, B0.B16
+	VLD1.P 16(srcPtr), [T0.B16]
+	VEOR T0.B16, B0.B16, B0.B16
 
 encReduce:
 	VST1.P [B0.B16], 16(dstPtr)
@@ -315,9 +315,9 @@ ld1:
 	VMOV H1, T3.B[0]
 ld0:
 	MOVD ZR, srcPtrLen
-	VMOV CTR.B16, B0.B16
-	sm4eEnc1block()
-	VREV32 B0.B16, B0.B16
+	VMOV CTR.B16, B0.B16
+	sm4eEnc1block()
+	VREV32 B0.B16, B0.B16
 
 tailLast:
 	VEOR T0.B16, B0.B16, B0.B16
@@ -326,7 +326,7 @@ tailLast:
 
 done:
 	VST1 [ACC0.B16], (tPtr)
-	RET
+	RET
 
 // func gcmSm4niDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, rk []uint32)
 TEXT ·gcmSm4niDec(SB),NOSPLIT,$0
@@ -381,15 +381,15 @@ octetsLoop:
 	VADD B6.S4, INC.S4, B7.S4
 	VADD B7.S4, INC.S4, CTR.S4
 
-	sm4eEnc8blocks()
+	sm4eEnc8blocks()
 	VREV32 B0.B16, T1.B16
 	VREV32 B1.B16, T2.B16
-	VREV32 B2.B16, B2.B16
-	VREV32 B3.B16, B3.B16
-	VREV32 B4.B16, B4.B16
-	VREV32 B5.B16, B5.B16
-	VREV32 B6.B16, B6.B16
-	VREV32 B7.B16, B7.B16
+	VREV32 B2.B16, B2.B16
+	VREV32 B3.B16, B3.B16
+	VREV32 B4.B16, B4.B16
+	VREV32 B5.B16, B5.B16
+	VREV32 B6.B16, B6.B16
+	VREV32 B7.B16, B7.B16
 
 	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
 	VEOR B0.B16, T1.B16, T1.B16
@@ -443,17 +443,17 @@ singlesLoop:
 	CMP $16, srcPtrLen
 	BLT tail
 	SUB $16, srcPtrLen
-
-	VLD1.P 16(srcPtr), [T0.B16]
-	VREV64 T0.B16, B5.B16
-
-	VMOV CTR.B16, B0.B16
+
+	VLD1.P 16(srcPtr), [T0.B16]
+	VREV64 T0.B16, B5.B16
+
+	VMOV CTR.B16, B0.B16
 	VADD CTR.S4, INC.S4, CTR.S4
-	sm4eEnc1block()
-	VREV32 B0.B16, B0.B16
+	sm4eEnc1block()
+	VREV32 B0.B16, B0.B16
 
 singlesLast:
-	VEOR T0.B16, B0.B16, B0.B16
+	VEOR T0.B16, B0.B16, B0.B16
 	VST1.P [B0.B16], 16(dstPtr)
 
 	VEOR ACC0.B16, B5.B16, B5.B16
@@ -467,13 +467,13 @@ singlesLast:
 	B singlesLoop
 tail:
 	CBZ srcPtrLen, done
-	VMOV CTR.B16, B0.B16
-	VADD CTR.S4, INC.S4, CTR.S4
-	sm4eEnc1block()
-	VREV32 B0.B16, B0.B16
+	VMOV CTR.B16, B0.B16
+	VADD CTR.S4, INC.S4, CTR.S4
+	sm4eEnc1block()
+	VREV32 B0.B16, B0.B16
 tailLast:
 	// Assuming it is safe to load past dstPtr due to the presence of the tag
-	// B5 stored last ciphertext
+	// B5 stored last ciphertext
 	VLD1 (srcPtr), [B5.B16]
 
 	VEOR B5.B16, B0.B16, B0.B16
@@ -522,4 +522,4 @@ ld0:
 
 done:
 	VST1 [ACC0.B16], (tPtr)
-	RET
+	RET
diff --git a/sm4_test/cbc_sm4_test.go b/sm4_test/cbc_sm4_test.go
index f478d00..7097a56 100644
--- a/sm4_test/cbc_sm4_test.go
+++ b/sm4_test/cbc_sm4_test.go
@@ -64,6 +64,22 @@ var cbcSM4Tests = []struct {
 			0x62, 0xb5, 0xe7, 0x50, 0x44, 0xea, 0x24, 0xcc, 0x9b, 0x5e, 0x07, 0x48, 0x04, 0x89, 0xa2, 0x74,
 		},
 	},
+	{
+		"7 blocks",
+		[]byte("0123456789ABCDEF"),
+		[]byte("0123456789ABCDEF"),
+		[]byte("Hello World Hello World Hello World Hello World Hello World Hello World Hello World Hello World Hello World Hell"),
+		[]byte{
+			0xd3, 0x1e, 0x36, 0x83, 0xe4, 0xfc, 0x9b, 0x51, 0x6a, 0x2c, 0x0f, 0x98, 0x36, 0x76, 0xa9, 0xeb,
+			0x1f, 0xdc, 0xc3, 0x2a, 0xf3, 0x84, 0x08, 0x97, 0x81, 0x57, 0xa2, 0x06, 0x5d, 0xe3, 0x4c, 0x6a,
+			0xe0, 0x02, 0xd6, 0xe4, 0xf5, 0x66, 0x87, 0xc4, 0xcc, 0x54, 0x1d, 0x1f, 0x1c, 0xc4, 0x2f, 0xe6,
+			0xe5, 0x1d, 0xea, 0x52, 0xb8, 0x0c, 0xc8, 0xbe, 0xae, 0xcc, 0x44, 0xa8, 0x51, 0x81, 0x08, 0x60,
+			0xb6, 0x09, 0x7b, 0xb8, 0x7e, 0xdb, 0x53, 0x4b, 0xea, 0x2a, 0xc6, 0xa1, 0xe5, 0xa0, 0x2a, 0xe9,
+			0x22, 0x65, 0x5b, 0xa3, 0xb9, 0xcc, 0x63, 0x92, 0x16, 0x0e, 0x2f, 0xf4, 0x3b, 0x93, 0x06, 0x82,
+			0xb3, 0x8c, 0x26, 0x2e, 0x06, 0x51, 0x34, 0x2c, 0xe4, 0x3d, 0xd0, 0xc7, 0x2b, 0x8f, 0x31, 0x15,
+			0xb7, 0x8f, 0xd0, 0x47, 0x45, 0x40, 0xec, 0x02, 0x1b, 0xef, 0xc1, 0xd2, 0xe5, 0xa2, 0x35, 0xd2,
+		},
+	},
 	{
 		"9 blocks",
 		[]byte("0123456789ABCDEF"),