diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
index b6c49c8..6847857 100644
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@@ -319,17 +319,11 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 	MOVQ dst+8(FP), BX
 	MOVQ src+16(FP), DX
 
-	PINSRD $0, 0(DX), t0
-	PSHUFB flip_mask<>(SB), t0
-
-	PINSRD $0, 4(DX), t1
-	PSHUFB flip_mask<>(SB), t1
-
-	PINSRD $0, 8(DX), t2
-	PSHUFB flip_mask<>(SB), t2
-
-	PINSRD $0, 12(DX), t3
-	PSHUFB flip_mask<>(SB), t3
+	MOVUPS (DX), t0
+	PSHUFB flip_mask<>(SB), t0
+	PSHUFD $1, t0, t1
+	PSHUFD $2, t0, t2
+	PSHUFD $3, t0, t3
 
 	XORL CX, CX
 
@@ -343,16 +337,14 @@ loop:
 	CMPL CX, $4*32
 	JB loop
 
-	PSHUFB flip_mask<>(SB), t3
-	PSHUFB flip_mask<>(SB), t2
-	PSHUFB flip_mask<>(SB), t1
-	PSHUFB flip_mask<>(SB), t0
-	MOVUPS t3, 0(BX)
-	PEXTRD $0, t2, R8
-	MOVL R8, 4(BX)
-	PEXTRD $0, t1, R8
-	MOVL R8, 8(BX)
-	PEXTRD $0, t0, R8
-	MOVL R8, 12(BX)
+	PEXTRD $0, t2, R8
+	PINSRD $1, R8, t3
+	PEXTRD $0, t1, R8
+	PINSRD $2, R8, t3
+	PEXTRD $0, t0, R8
+	PINSRD $3, R8, t3
+	PSHUFB flip_mask<>(SB), t3
+	MOVUPS t3, (BX)
+
 done_sm4:
 	RET
diff --git a/sm4/cbc_cipher_asm.go b/sm4/cbc_cipher_asm.go
index 6b622da..7ec8155 100644
--- a/sm4/cbc_cipher_asm.go
+++ b/sm4/cbc_cipher_asm.go
@@ -12,16 +12,32 @@ import (
 
-// Assert that sm4CipherAsm implements the cbcDecAble interfaces.
+// Assert that sm4CipherAsm implements the cbcEncAble and cbcDecAble interfaces.
 var _ cbcDecAble = (*sm4CipherAsm)(nil)
+var _ cbcEncAble = (*sm4CipherAsm)(nil)
+
+const cbcEncrypt = 1
+const cbcDecrypt = 0
 
 type cbc struct {
 	b   *sm4CipherAsm
 	iv  []byte
 	tmp []byte
+	enc int
+}
+
+func (b *sm4CipherAsm) NewCBCEncrypter(iv []byte) cipher.BlockMode {
+	var c cbc
+	c.b = b
+	c.enc = cbcEncrypt
+	c.iv = make([]byte, BlockSize)
+	c.tmp = make([]byte, BlockSize)
+	copy(c.iv, iv)
+	return &c
 }
 
 func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
 	var c cbc
 	c.b = b
+	c.enc = cbcDecrypt
 	c.iv = make([]byte, BlockSize)
 	c.tmp = make([]byte, BlockSize)
 	copy(c.iv, iv)
@@ -30,6 +46,9 @@ func (b *sm4CipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
 
 func (x *cbc) BlockSize() int { return BlockSize }
 
+//go:noescape
+func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
+
 func (x *cbc) CryptBlocks(dst, src []byte) {
 	if len(src)%BlockSize != 0 {
 		panic("cipher: input not full blocks")
@@ -43,6 +62,10 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	if len(src) == 0 {
 		return
 	}
+	if x.enc == cbcEncrypt {
+		encryptBlocksChain(&x.b.enc[0], dst, src, &x.iv[0])
+		return
+	}
 	// For each block, we need to xor the decrypted data with the previous block's ciphertext (the iv).
 	// To avoid making a copy each time, we loop over the blocks BACKWARDS.
 	end := len(src)
diff --git a/sm4/cbc_cipher_asm_amd64.s b/sm4/cbc_cipher_asm_amd64.s
new file mode 100644
index 0000000..13b2fb5
--- /dev/null
+++ b/sm4/cbc_cipher_asm_amd64.s
@@ -0,0 +1,87 @@
+//go:build amd64 && !generic
+// +build amd64,!generic
+
+#include "textflag.h"
+
+#define x X0
+#define y X1
+#define t0 X2
+#define t1 X3
+#define t2 X4
+#define t3 X5
+
+#define XTMP6 X6
+#define IV X8
+
+#include "aesni_amd64.h"
+
+#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
+	PINSRD $0, (index * 4)(RK)(IND*1), x;                         \
+	PXOR t1, x;                                                   \
+	PXOR t2, x;                                                   \
+	PXOR t3, x;                                                   \
+	SM4_TAO_L1(x, y, z);                                          \
+	PXOR x, t0
+
+// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
+TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
+#define ctx BX
+#define ptx DX
+#define ptxLen DI
+
+	MOVQ xk+0(FP), AX
+	MOVQ dst+8(FP), ctx
+	MOVQ src+32(FP), ptx
+	MOVQ src_len+40(FP), ptxLen
+	MOVQ iv+56(FP), SI
+
+	MOVUPS (SI), IV
+
+loopSrc:
+	CMPQ ptxLen, $16
+	JB done_sm4
+	SUBQ $16, ptxLen
+
+	MOVUPS (ptx), t0
+	PXOR IV, t0
+
+	PSHUFB flip_mask<>(SB), t0
+	PSHUFD $1, t0, t1
+	PSHUFD $2, t0, t2
+	PSHUFD $3, t0, t3
+
+	XORL CX, CX
+
+loopRound:
+	SM4_SINGLE_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_SINGLE_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_SINGLE_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_SINGLE_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADDL $16, CX
+	CMPL CX, $4*32
+	JB loopRound
+
+	PEXTRD $0, t2, R8
+	PINSRD $1, R8, t3
+	PEXTRD $0, t1, R8
+	PINSRD $2, R8, t3
+	PEXTRD $0, t0, R8
+	PINSRD $3, R8, t3
+	PSHUFB flip_mask<>(SB), t3
+
+	MOVOU t3, IV
+	MOVUPS t3, (ctx)
+
+	LEAQ 16(ptx), ptx
+	LEAQ 16(ctx), ctx
+
+	JMP loopSrc
+
+done_sm4:
+	MOVUPS IV, (SI)
+	RET
+
+#undef ctx
+#undef ptx
+#undef ptxLen
diff --git a/sm4/cbc_cipher_asm_arm64.s b/sm4/cbc_cipher_asm_arm64.s
new file mode 100644
index 0000000..a2b3ba4
--- /dev/null
+++ b/sm4/cbc_cipher_asm_arm64.s
@@ -0,0 +1,131 @@
+//go:build arm64 && !generic
+// +build arm64,!generic
+
+#include "textflag.h"
+
+#define x V0
+#define y V1
+#define t0 V2
+#define t1 V3
+#define t2 V4
+#define t3 V5
+#define ZERO V16
+#define NIBBLE_MASK V20
+#define INVERSE_SHIFT_ROWS V21
+#define M1L V22
+#define M1H V23
+#define M2L V24
+#define M2H V25
+#define R08_MASK V26
+#define R16_MASK V27
+#define R24_MASK V28
+#define FK_MASK V29
+#define XTMP6 V6
+#define IV V7
+
+#include "aesni_arm64.h"
+
+#define SM4_ROUND(RK, x, y, z, t0, t1, t2, t3) \
+	MOVW.P 4(RK), R19;                         \
+	VMOV R19, x.S4;                            \
+	VEOR t1.B16, x.B16, x.B16;                 \
+	VEOR t2.B16, x.B16, x.B16;                 \
+	VEOR t3.B16, x.B16, x.B16;                 \
+	SM4_TAO_L1(x, y, z);                       \
+	VEOR x.B16, t0.B16, t0.B16
+
+#define load_global_data_1() \
+	LDP nibble_mask<>(SB), (R0, R1)        \
+	VMOV R0, NIBBLE_MASK.D[0]              \
+	VMOV R1, NIBBLE_MASK.D[1]              \
+	LDP m1_low<>(SB), (R0, R1)             \
+	VMOV R0, M1L.D[0]                      \
+	VMOV R1, M1L.D[1]                      \
+	LDP m1_high<>(SB), (R0, R1)            \
+	VMOV R0, M1H.D[0]                      \
+	VMOV R1, M1H.D[1]                      \
+	LDP m2_low<>(SB), (R0, R1)             \
+	VMOV R0, M2L.D[0]                      \
+	VMOV R1, M2L.D[1]                      \
+	LDP m2_high<>(SB), (R0, R1)            \
+	VMOV R0, M2H.D[0]                      \
+	VMOV R1, M2H.D[1]                      \
+	LDP fk_mask<>(SB), (R0, R1)            \
+	VMOV R0, FK_MASK.D[0]                  \
+	VMOV R1, FK_MASK.D[1]                  \
+	LDP inverse_shift_rows<>(SB), (R0, R1) \
+	VMOV R0, INVERSE_SHIFT_ROWS.D[0]       \
+	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
+
+#define load_global_data_2() \
+	load_global_data_1()                   \
+	LDP r08_mask<>(SB), (R0, R1)           \
+	VMOV R0, R08_MASK.D[0]                 \
+	VMOV R1, R08_MASK.D[1]                 \
+	LDP r16_mask<>(SB), (R0, R1)           \
+	VMOV R0, R16_MASK.D[0]                 \
+	VMOV R1, R16_MASK.D[1]                 \
+	LDP r24_mask<>(SB), (R0, R1)           \
+	VMOV R0, R24_MASK.D[0]                 \
+	VMOV R1, R24_MASK.D[1]
+
+// func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
+TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
+#define ctx R1
+#define ptx R3
+#define ptxLen R4
+#define rkSave R8
+
+	load_global_data_2()
+
+	MOVD xk+0(FP), rkSave
+	MOVD dst+8(FP), ctx
+	MOVD src+32(FP), ptx
+	MOVD src_len+40(FP), ptxLen
+	MOVD iv+56(FP), R5
+
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+	VLD1 (R5), [IV.B16] // load the initial IV, mirroring MOVUPS (SI), IV on amd64
+
+loopSrc:
+	CMP $16, ptxLen
+	BLT done_sm4
+	SUB $16, ptxLen
+
+	VLD1.P (ptx), [t0.S4]
+	VEOR IV.B16, t0.B16, t0.B16
+	VREV32 t0.B16, t0.B16
+	VMOV t0.S[1], t1.S[0]
+	VMOV t0.S[2], t2.S[0]
+	VMOV t0.S[3], t3.S[0]
+
+	EOR R2, R2
+	MOVD rkSave, R0
+
+encryptBlockLoop:
+	SM4_ROUND(R0, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(R0, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(R0, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(R0, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R2
+	CMP $128, R2
+	BNE encryptBlockLoop
+
+	VMOV t2.S[0], t3.S[1]
+	VMOV t1.S[0], t3.S[2]
+	VMOV t0.S[0], t3.S[3]
+	VREV32 t3.B16, t3.B16
+
+	VST1.P [t3.B16], (ctx)
+	VMOV t3.B16, IV.B16
+
+	B loopSrc
+done_sm4:
+	VST1 [IV.B16], (R5)
+	RET
+
+#undef ctx
+#undef ptx
+#undef ptxLen
+#undef rkSave
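
For reviewers, below is a minimal pure-Go sketch of the contract the new encryptBlocksChain assembly implements: XOR each plaintext block with the running IV, run the 32 SM4 rounds, write out the ciphertext, reuse that ciphertext as the next IV, and persist the final IV back through the iv pointer so chained CryptBlocks calls continue correctly. The import path and the sm4.NewCipher constructor are assumptions based on the repository layout, and cbcEncryptChainRef is an illustrative reference only, not part of the package API.

package main

import (
	"bytes"
	"crypto/cipher"
	"fmt"

	"github.com/emmansun/gmsm/sm4" // assumed import path for this repository's sm4 package
)

// cbcEncryptChainRef mirrors the contract of the new encryptBlocksChain
// assembly: XOR each plaintext block with the running IV, encrypt it, use the
// resulting ciphertext as the IV for the next block, and leave the final
// ciphertext block in iv so chained calls continue correctly.
func cbcEncryptChainRef(b cipher.Block, dst, src, iv []byte) {
	bs := b.BlockSize()
	for len(src) >= bs {
		for i := 0; i < bs; i++ {
			dst[i] = src[i] ^ iv[i]
		}
		b.Encrypt(dst[:bs], dst[:bs])
		copy(iv, dst[:bs])
		src, dst = src[bs:], dst[bs:]
	}
}

func main() {
	key := []byte("0123456789abcdef") // 16-byte SM4 key
	iv := []byte("fedcba9876543210")  // 16-byte IV
	plaintext := bytes.Repeat([]byte{0x5a}, 48)

	block, err := sm4.NewCipher(key)
	if err != nil {
		panic(err)
	}

	// crypto/cipher type-asserts the block for the cbcEncAble interface, so on
	// amd64/arm64 this BlockMode is backed by the new assembly routine.
	want := make([]byte, len(plaintext))
	cipher.NewCBCEncrypter(block, iv).CryptBlocks(want, plaintext)

	// Block-by-block reference path; it must produce the same ciphertext.
	got := make([]byte, len(plaintext))
	cbcEncryptChainRef(block, got, plaintext, append([]byte(nil), iv...))

	fmt.Println("assembly and reference match:", bytes.Equal(got, want))
}

On builds that do not match the amd64/arm64 non-generic build tags, cipher.NewCBCEncrypter simply falls back to the generic CBC implementation in crypto/cipher, so the result is the same either way.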