diff --git a/cipher/benchmark_test.go b/cipher/benchmark_test.go
index f2eec0e..e373a4e 100644
--- a/cipher/benchmark_test.go
+++ b/cipher/benchmark_test.go
@@ -32,7 +32,7 @@ func BenchmarkSM4CBCEncrypt1K(b *testing.B) {
 	benchmarkCBCEncrypt1K(b, c)
 }
 
-func benchmarkSM4CBCDecrypt1K(b *testing.B, block cipher.Block) {
+func benchmarkCBCDecrypt1K(b *testing.B, block cipher.Block) {
 	buf := make([]byte, 1024)
 	b.SetBytes(int64(len(buf)))
 
@@ -46,13 +46,13 @@ func benchmarkSM4CBCDecrypt1K(b *testing.B, block cipher.Block) {
 func BenchmarkAESCBCDecrypt1K(b *testing.B) {
 	var key [16]byte
 	c, _ := aes.NewCipher(key[:])
-	benchmarkSM4CBCDecrypt1K(b, c)
+	benchmarkCBCDecrypt1K(b, c)
 }
 
 func BenchmarkSM4CBCDecrypt1K(b *testing.B) {
 	var key [16]byte
 	c, _ := sm4.NewCipher(key[:])
-	benchmarkSM4CBCDecrypt1K(b, c)
+	benchmarkCBCDecrypt1K(b, c)
 }
 
 func benchmarkStream(b *testing.B, block cipher.Block, mode func(cipher.Block, []byte) cipher.Stream, buf []byte) {
diff --git a/sm2/sm2.go b/sm2/sm2.go
index c770c41..12d0e69 100644
--- a/sm2/sm2.go
+++ b/sm2/sm2.go
@@ -883,7 +883,7 @@ type sm2Curve struct {
 	nMinus2 []byte
 }
 
-// pointFromAffine is used to convert the PublicKey to a nistec Point.
+// pointFromAffine is used to convert the PublicKey to a sm2 Point.
 func (curve *sm2Curve) pointFromAffine(x, y *big.Int) (p *_sm2ec.SM2P256Point, err error) {
 	bitSize := curve.curve.Params().BitSize
 	// Reject values that would not get correctly encoded.
@@ -902,7 +902,7 @@ func (curve *sm2Curve) pointFromAffine(x, y *big.Int) (p *_sm2ec.SM2P256Point, e
 	return curve.newPoint().SetBytes(buf)
 }
 
-// pointToAffine is used to convert a nistec Point to a PublicKey.
+// pointToAffine is used to convert a sm2 Point to a PublicKey.
 func (curve *sm2Curve) pointToAffine(p *_sm2ec.SM2P256Point) (x, y *big.Int, err error) {
 	out := p.Bytes()
 	if len(out) == 1 && out[0] == 0 {
diff --git a/sm4/aesni_amd64.h b/sm4/aesni_amd64.h
index 099ce5c..67a9bc0 100644
--- a/sm4/aesni_amd64.h
+++ b/sm4/aesni_amd64.h
@@ -209,7 +209,7 @@ GLOBL fk_mask<>(SB), RODATA, $16
 // - t2: 128 bits register for data
 // - t3: 128 bits register for data
 #define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \
-	PINSRD $0, (index * 4)(RK)(IND*1), x; \
+	MOVL (index * 4)(RK)(IND*1), x; \
 	PXOR t1, x; \
 	PXOR t2, x; \
 	PXOR t3, x; \
diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
index 1d84433..b1f66bb 100644
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@@ -338,7 +338,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 	MOVQ dst+8(FP), BX
 	MOVQ src+16(FP), DX
 
-	MOVUPS (DX), t0
+	MOVOU (DX), t0
 	PSHUFB flip_mask<>(SB), t0
 	PSHUFD $1, t0, t1
 	PSHUFD $2, t0, t2
@@ -356,14 +356,12 @@ loop:
 	CMPL CX, $4*32
 	JB loop
 
-	PEXTRD $0, t2, R8
-	PINSRD $1, R8, t3
-	PEXTRD $0, t1, R8
-	PINSRD $2, R8, t3
-	PEXTRD $0, t0, R8
-	PINSRD $3, R8, t3
-	PSHUFB flip_mask<>(SB), t3
-	MOVUPS t3, (BX)
+	PALIGNR $4, t3, t3
+	PALIGNR $4, t3, t2
+	PALIGNR $4, t2, t1
+	PALIGNR $4, t1, t0
+	PSHUFB flip_mask<>(SB), t0
+	MOVOU t0, (BX)
 
 done_sm4:
 	RET
diff --git a/sm4/cbc_cipher_asm.go b/sm4/cbc_cipher_asm.go
index 82691ba..b539a4c 100644
--- a/sm4/cbc_cipher_asm.go
+++ b/sm4/cbc_cipher_asm.go
@@ -7,7 +7,6 @@ import (
 	"crypto/cipher"
 
 	"github.com/emmansun/gmsm/internal/alias"
-	"github.com/emmansun/gmsm/internal/subtle"
 )
 
 // Assert that sm4CipherAsm implements the cbcEncAble and cbcDecAble interfaces.
@@ -49,6 +48,9 @@ func (x *cbc) BlockSize() int { return BlockSize }
 //go:noescape
 func encryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
 
+//go:noescape
+func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
+
 func (x *cbc) CryptBlocks(dst, src []byte) {
 	if len(src)%BlockSize != 0 {
 		panic("cipher: input not full blocks")
@@ -76,19 +78,18 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	var temp []byte = make([]byte, x.b.blocksSize)
 	var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
 
+	decKeyPtr := &x.b.dec[0]
 	for start > 0 {
-		x.b.DecryptBlocks(temp, src[start:end])
-		subtle.XORBytes(temp, temp, src[start-BlockSize:end-BlockSize])
-		copy(dst[start:], temp)
+		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
 		end = start
 		start -= x.b.blocksSize
 	}
 
 	// Handle remain first blocks
-	copy(batchSrc[BlockSize:], src[:end])
-	x.b.DecryptBlocks(temp, batchSrc[BlockSize:])
 	copy(batchSrc, x.iv)
-	subtle.XORBytes(dst, temp[:end], batchSrc)
+	copy(batchSrc[BlockSize:], src[:end])
+	decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
+	copy(dst, temp[:end])
 
 	// Set the new iv to the first block we copied earlier.
 	x.iv, x.tmp = x.tmp, x.iv
diff --git a/sm4/cbc_cipher_asm_amd64.s b/sm4/cbc_cipher_asm_amd64.s
index b52edb0..201a925 100644
--- a/sm4/cbc_cipher_asm_amd64.s
+++ b/sm4/cbc_cipher_asm_amd64.s
@@ -34,7 +34,7 @@ loopSrc:
 	JB done_sm4
 	SUBQ $16, ptxLen
 
-	MOVUPS (ptx), t0
+	MOVOU (ptx), t0
 	PXOR IV, t0
 
 	PSHUFB flip_mask<>(SB), t0
@@ -54,16 +54,14 @@ loopRound:
 	CMPL CX, $4*32
 	JB loopRound
 
-	PEXTRD $0, t2, R8
-	PINSRD $1, R8, t3
-	PEXTRD $0, t1, R8
-	PINSRD $2, R8, t3
-	PEXTRD $0, t0, R8
-	PINSRD $3, R8, t3
-	PSHUFB flip_mask<>(SB), t3
+	PALIGNR $4, t3, t3
+	PALIGNR $4, t3, t2
+	PALIGNR $4, t2, t1
+	PALIGNR $4, t1, t0
+	PSHUFB flip_mask<>(SB), t0
 
-	MOVOU t3, IV
-	MOVUPS t3, (ctx)
+	MOVOU t0, IV
+	MOVOU t0, (ctx)
 
 	LEAQ 16(ptx), ptx
 	LEAQ 16(ctx), ctx
@@ -77,3 +75,162 @@ done_sm4:
 #undef ctx
 #undef ptx
 #undef ptxLen
+
+#define XDWTMP0 Y0
+#define XDWTMP1 Y1
+#define XDWTMP2 Y2
+
+#define XDWORD0 Y4
+#define XDWORD1 Y5
+#define XDWORD2 Y6
+#define XDWORD3 Y7
+
+#define NIBBLE_MASK Y3
+#define X_NIBBLE_MASK X3
+#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
+
+#define XDWORD Y8
+#define YDWORD Y9
+
+#define XWORD X8
+#define YWORD X9
+
+// SM4 round function, AVX2 version, handle 256 bits
+// t0 ^= tao_l1(t1^t2^t3^xk)
+// parameters:
+// - index: round key index immediate number
+// - x: 256 bits temp register
+// - y: 256 bits temp register
+// - t0: 256 bits register for data as result
+// - t1: 256 bits register for data
+// - t2: 256 bits register for data
+// - t3: 256 bits register for data
+#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
+	VPBROADCASTD (index * 4)(AX)(CX*1), x; \
+	VPXOR t1, x, x; \
+	VPXOR t2, x, x; \
+	VPXOR t3, x, x; \
+	AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
+	VPXOR x, t0, t0
+
+// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
+TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
+	MOVQ xk+0(FP), AX
+	MOVQ dst+8(FP), BX
+	MOVQ src+32(FP), DX
+	MOVQ iv+56(FP), SI
+
+	CMPB ·useAVX2(SB), $1
+	JE avx2
+
+non_avx2_start:
+	PINSRD $0, 0(DX), t0
+	PINSRD $1, 16(DX), t0
+	PINSRD $2, 32(DX), t0
+	PINSRD $3, 48(DX), t0
+	PSHUFB flip_mask<>(SB), t0
+
+	PINSRD $0, 4(DX), t1
+	PINSRD $1, 20(DX), t1
+	PINSRD $2, 36(DX), t1
+	PINSRD $3, 52(DX), t1
+	PSHUFB flip_mask<>(SB), t1
+
+	PINSRD $0, 8(DX), t2
+	PINSRD $1, 24(DX), t2
+	PINSRD $2, 40(DX), t2
+	PINSRD $3, 56(DX), t2
+	PSHUFB flip_mask<>(SB), t2
+
+	PINSRD $0, 12(DX), t3
+	PINSRD $1, 28(DX), t3
+	PINSRD $2, 44(DX), t3
+	PINSRD $3, 60(DX), t3
+	PSHUFB flip_mask<>(SB), t3
+
+	XORL CX, CX
+
+loop:
+	SM4_ROUND(0, AX, CX, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(1, AX, CX, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(2, AX, CX, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(3, AX, CX, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADDL $16, CX
+	CMPL CX, $4*32
+	JB loop
+
+	PSHUFB flip_mask<>(SB), t3
+	PSHUFB flip_mask<>(SB), t2
+	PSHUFB flip_mask<>(SB), t1
+	PSHUFB flip_mask<>(SB), t0
+
+	SSE_TRANSPOSE_MATRIX(CX, t3, t2, t1, t0, XWORD, YWORD)
+
+	PXOR 0(SI), t3
+	PXOR 16(SI), t2
+	PXOR 32(SI), t1
+	PXOR 48(SI), t0
+
+	MOVUPS t3, 0(BX)
+	MOVUPS t2, 16(BX)
+	MOVUPS t1, 32(BX)
+	MOVUPS t0, 48(BX)
+
+done_sm4:
+	RET
+
+avx2:
+	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+
+avx2_8blocks:
+	VMOVDQU 0(DX), XDWORD0
+	VMOVDQU 32(DX), XDWORD1
+	VMOVDQU 64(DX), XDWORD2
+	VMOVDQU 96(DX), XDWORD3
+	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
+
+	// Apply Byte Flip Mask: LE -> BE
+	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
+
+	XORL CX, CX
+
+avx2_loop:
+	AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+	AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+	AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+	AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+
+	ADDL $16, CX
+	CMPL CX, $4*32
+	JB avx2_loop
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
+
+	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
+	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+
+	VPXOR 0(SI), XDWORD0, XDWORD0
+	VPXOR 32(SI), XDWORD1, XDWORD1
+	VPXOR 64(SI), XDWORD2, XDWORD2
+	VPXOR 96(SI), XDWORD3, XDWORD3
+
+	VMOVDQU XDWORD0, 0(BX)
+	VMOVDQU XDWORD1, 32(BX)
+	VMOVDQU XDWORD2, 64(BX)
+	VMOVDQU XDWORD3, 96(BX)
+
+
+avx2_sm4_done:
+	VZEROUPPER
+	RET
diff --git a/sm4/cbc_cipher_asm_arm64.s b/sm4/cbc_cipher_asm_arm64.s
index 673e79a..ac85085 100644
--- a/sm4/cbc_cipher_asm_arm64.s
+++ b/sm4/cbc_cipher_asm_arm64.s
@@ -86,3 +86,50 @@ done_sm4:
 #undef ptx
 #undef ptxLen
 #undef rkSave
+
+// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
+TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
+	LOAD_SM4_AESNI_CONSTS()
+
+	MOVD xk+0(FP), R8
+	MOVD dst+8(FP), R9
+	MOVD src+32(FP), R10
+	MOVD src_len+40(FP), R12
+	MOVD iv+56(FP), R11
+
+
+	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+	EOR R0, R0
+
+encryptBlocksLoop:
+	SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE encryptBlocksLoop
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+
+	VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]
+
+	VEOR V6.B16, t0.B16, t0.B16
+	VEOR V7.B16, t1.B16, t1.B16
+	VEOR V8.B16, t2.B16, t2.B16
+	VEOR V9.B16, t3.B16, t3.B16
+
+	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
+	RET
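
Note (not part of the patch): the pure-Go sketch below models the CBC chaining that the new decryptBlocksChain assembly performs for one batch, decrypt each ciphertext block, then XOR it with the preceding ciphertext block, with the IV standing in for the block before the first one. This is roughly the DecryptBlocks + subtle.XORBytes sequence the Go code above removes. The helper decryptBlocksChainRef and the surrounding main are invented for illustration and assume dst and src do not overlap; the real CryptBlocks supports in-place use by working through the batches from the tail and saving the last ciphertext block as the next IV.

```go
package main

import (
	"bytes"
	"crypto/cipher"
	"fmt"

	"github.com/emmansun/gmsm/sm4"
)

// decryptBlocksChainRef decrypts every block of src into dst and XORs the
// result with the ciphertext block that precedes it; prev supplies the block
// in front of src[0] (the IV for the first batch, or the last block of the
// previous batch). Illustrative only; dst and src must not overlap here.
func decryptBlocksChainRef(b cipher.Block, dst, src, prev []byte) {
	bs := b.BlockSize()
	for i := 0; i+bs <= len(src); i += bs {
		b.Decrypt(dst[i:i+bs], src[i:i+bs])
		p := prev
		if i >= bs {
			p = src[i-bs : i]
		}
		for j := 0; j < bs; j++ {
			dst[i+j] ^= p[j]
		}
	}
}

func main() {
	key := make([]byte, 16)
	iv := make([]byte, 16)
	block, err := sm4.NewCipher(key)
	if err != nil {
		panic(err)
	}

	// Encrypt four zero blocks with standard CBC, then recover them with the
	// reference chaining routine and check the round trip.
	plaintext := make([]byte, 64)
	ciphertext := make([]byte, 64)
	cipher.NewCBCEncrypter(block, iv).CryptBlocks(ciphertext, plaintext)

	recovered := make([]byte, 64)
	decryptBlocksChainRef(block, recovered, ciphertext, iv)
	fmt.Println(bytes.Equal(recovered, plaintext)) // true
}
```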