diff --git a/cipher/cbc_sm4_test.go b/cipher/cbc_sm4_test.go
index fb8a9f4..6c2612d 100644
--- a/cipher/cbc_sm4_test.go
+++ b/cipher/cbc_sm4_test.go
@@ -3,8 +3,10 @@ package cipher_test
 import (
 	"bytes"
 	"crypto/cipher"
+	"crypto/rand"
 	"encoding/hex"
 	"fmt"
+	"io"
 	"testing"
 
 	"github.com/emmansun/gmsm/padding"
@@ -185,3 +187,26 @@ func TestCBCDecrypterSM4(t *testing.T) {
 		}
 	}
 }
+
+func TestSM4CBCRandom(t *testing.T) {
+	key := []byte("0123456789ABCDEF")
+	c, err := sm4.NewCipher(key)
+	if err != nil {
+		t.Fatal(err)
+	}
+	encrypter := cipher.NewCBCEncrypter(c, key)
+	decrypter := cipher.NewCBCDecrypter(c, key)
+	for i := 1; i <= 50; i++ {
+		plaintext := make([]byte, i*16)
+		ciphertext := make([]byte, i*16)
+		got := make([]byte, i*16)
+		if _, err := io.ReadFull(rand.Reader, plaintext); err != nil {
+			t.Fatal(err)
+		}
+		encrypter.CryptBlocks(ciphertext, plaintext)
+		decrypter.CryptBlocks(got, ciphertext)
+		if !bytes.Equal(got, plaintext) {
+			t.Errorf("test %v blocks failed", i)
+		}
+	}
+}
diff --git a/sm4/cbc_cipher_asm.go b/sm4/cbc_cipher_asm.go
index 9d9f7c8..423fa05 100644
--- a/sm4/cbc_cipher_asm.go
+++ b/sm4/cbc_cipher_asm.go
@@ -74,31 +74,7 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 
 	// Copy the last block of ciphertext in preparation as the new iv.
 	copy(x.tmp, src[end-BlockSize:end])
-
-
-	decKeyPtr := &x.b.dec[0]
-
-	start := end - 2*x.b.blocksSize
-	for start > 0 {
-		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
-		end = start
-		start -= 2*x.b.blocksSize
-	}
-
-	start = end - x.b.blocksSize
-	for start > 0 {
-		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
-		end = start
-		start -= x.b.blocksSize
-	}
-
-	// Handle remain first blocks
-	var temp []byte = make([]byte, x.b.blocksSize)
-	var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
-	copy(batchSrc, x.iv)
-	copy(batchSrc[BlockSize:], src[:end])
-	decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
-	copy(dst, temp[:end])
+	decryptBlocksChain(&x.b.dec[0], dst, src, &x.iv[0])
 
 	// Set the new iv to the first block we copied earlier.
x.iv, x.tmp = x.tmp, x.iv diff --git a/sm4/cbc_cipher_asm_amd64.s b/sm4/cbc_cipher_asm_amd64.s index 3357c3b..8d35e57 100644 --- a/sm4/cbc_cipher_asm_amd64.s +++ b/sm4/cbc_cipher_asm_amd64.s @@ -78,7 +78,6 @@ done_sm4: #define XDWTMP0 Y0 #define XDWTMP1 Y1 -#define XDWTMP2 Y2 #define XDWORD0 Y4 #define XDWORD1 Y5 @@ -110,6 +109,8 @@ done_sm4: #define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE #define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE +#define BSWAP_MASK Y2 + #define XDWORD Y8 #define YDWORD Y9 @@ -124,36 +125,22 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 MOVQ src_len+40(FP), DI MOVQ iv+56(FP), SI + LEAQ (DX)(DI*1), DX + LEAQ (BX)(DI*1), BX + CMPB ·useAVX2(SB), $1 - JE avx2 + JE avx2Start CMPB ·useAVX(SB), $1 - JE avx + JE avxCbcSm4Octets -non_avx2_start: +cbcSm4Octets: CMPQ DI, $128 - JEQ sse_8blocks + JLE cbcSm4Nibbles + SUBQ $128, DI + LEAQ -128(DX), DX + LEAQ -128(BX), BX - MOVOU 0(DX), XWORD0 - MOVOU 16(DX), XWORD1 - MOVOU 32(DX), XWORD2 - MOVOU 48(DX), XWORD3 - - SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - - PXOR 0(SI), XWORD0 - PXOR 16(SI), XWORD1 - PXOR 32(SI), XWORD2 - PXOR 48(SI), XWORD3 - - MOVUPS XWORD0, 0(BX) - MOVUPS XWORD1, 16(BX) - MOVUPS XWORD2, 32(BX) - MOVUPS XWORD3, 48(BX) - - RET - -sse_8blocks: MOVOU 0(DX), XWORD0 MOVOU 16(DX), XWORD1 MOVOU 32(DX), XWORD2 @@ -165,14 +152,14 @@ sse_8blocks: SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) - PXOR 0(SI), XWORD0 - PXOR 16(SI), XWORD1 - PXOR 32(SI), XWORD2 - PXOR 48(SI), XWORD3 - PXOR 64(SI), XWORD4 - PXOR 80(SI), XWORD5 - PXOR 96(SI), XWORD6 - PXOR 112(SI), XWORD7 + PXOR -16(DX), XWORD0 + PXOR 0(DX), XWORD1 + PXOR 16(DX), XWORD2 + PXOR 32(DX), XWORD3 + PXOR 48(DX), XWORD4 + PXOR 64(DX), XWORD5 + PXOR 80(DX), XWORD6 + PXOR 96(DX), XWORD7 MOVOU XWORD0, 0(BX) MOVOU XWORD1, 16(BX) @@ -181,34 +168,110 @@ sse_8blocks: MOVOU XWORD4, 64(BX) MOVOU XWORD5, 80(BX) MOVOU XWORD6, 96(BX) - MOVOU XWORD7, 112(BX) + MOVOU XWORD7, 112(BX) -done_sm4: + JMP cbcSm4Octets + +cbcSm4Nibbles: + CMPQ DI, $64 + JLE cbCSm4Single + SUBQ $64, DI + LEAQ -64(DX), DX + LEAQ -64(BX), BX + + MOVOU 0(DX), XWORD0 + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + MOVOU 48(DX), XWORD3 + + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + PXOR -16(DX), XWORD0 + PXOR 0(DX), XWORD1 + PXOR 16(DX), XWORD2 + PXOR 32(DX), XWORD3 + + MOVUPS XWORD0, 0(BX) + MOVUPS XWORD1, 16(BX) + MOVUPS XWORD2, 32(BX) + MOVUPS XWORD3, 48(BX) + +cbCSm4Single: + CMPQ DI, $16 + JEQ cbcSm4Single16 + + CMPQ DI, $32 + JEQ cbcSm4Single32 + + CMPQ DI, $48 + JEQ cbcSm4Single48 + + MOVOU -64(DX), XWORD0 + MOVOU -48(DX), XWORD1 + MOVOU -32(DX), XWORD2 + MOVOU -16(DX), XWORD3 + + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + PXOR 0(SI), XWORD0 + PXOR -64(DX), XWORD1 + PXOR -48(DX), XWORD2 + PXOR -32(DX), XWORD3 + + MOVUPS XWORD0, -64(BX) + MOVUPS XWORD1, -48(BX) + MOVUPS XWORD2, -32(BX) + MOVUPS XWORD3, -16(BX) + JMP cbcSm4Done + +cbcSm4Single16: + MOVOU -16(DX), XWORD0 + + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + PXOR 0(SI), XWORD0 + + MOVUPS XWORD0, -16(BX) + JMP cbcSm4Done + +cbcSm4Single32: + MOVOU -32(DX), XWORD0 + MOVOU -16(DX), XWORD1 + + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + PXOR 0(SI), XWORD0 + PXOR -32(DX), XWORD1 + + MOVUPS XWORD0, -32(BX) + MOVUPS XWORD1, -16(BX) + JMP cbcSm4Done + +cbcSm4Single48: + MOVOU -48(DX), XWORD0 + 
MOVOU -32(DX), XWORD1 + MOVOU -16(DX), XWORD2 + + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + PXOR 0(SI), XWORD0 + PXOR -48(DX), XWORD1 + PXOR -32(DX), XWORD2 + + MOVUPS XWORD0, -48(BX) + MOVUPS XWORD1, -32(BX) + MOVUPS XWORD2, -16(BX) + +cbcSm4Done: RET -avx: +avxCbcSm4Octets: CMPQ DI, $128 - JEQ avx_8blocks + JLE avxCbcSm4Nibbles + SUBQ $128, DI + LEAQ -128(DX), DX + LEAQ -128(BX), BX - VMOVDQU 0(DX), XWORD0 - VMOVDQU 16(DX), XWORD1 - VMOVDQU 32(DX), XWORD2 - VMOVDQU 48(DX), XWORD3 - - AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - - VPXOR 0(SI), XWORD0, XWORD0 - VPXOR 16(SI), XWORD1, XWORD1 - VPXOR 32(SI), XWORD2, XWORD2 - VPXOR 48(SI), XWORD3, XWORD3 - - VMOVDQU XWORD0, 0(BX) - VMOVDQU XWORD1, 16(BX) - VMOVDQU XWORD2, 32(BX) - VMOVDQU XWORD3, 48(BX) - RET - -avx_8blocks: VMOVDQU 0(DX), XWORD0 VMOVDQU 16(DX), XWORD1 VMOVDQU 32(DX), XWORD2 @@ -219,15 +282,15 @@ avx_8blocks: VMOVDQU 112(DX), XWORD7 AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) - - VPXOR 0(SI), XWORD0, XWORD0 - VPXOR 16(SI), XWORD1, XWORD1 - VPXOR 32(SI), XWORD2, XWORD2 - VPXOR 48(SI), XWORD3, XWORD3 - VPXOR 64(SI), XWORD4, XWORD4 - VPXOR 80(SI), XWORD5, XWORD5 - VPXOR 96(SI), XWORD6, XWORD6 - VPXOR 112(SI), XWORD7, XWORD7 + + VPXOR -16(DX), XWORD0, XWORD0 + VPXOR 0(DX), XWORD1, XWORD1 + VPXOR 16(DX), XWORD2, XWORD2 + VPXOR 32(DX), XWORD3, XWORD3 + VPXOR 48(DX), XWORD4, XWORD4 + VPXOR 64(DX), XWORD5, XWORD5 + VPXOR 80(DX), XWORD6, XWORD6 + VPXOR 96(DX), XWORD7, XWORD7 VMOVDQU XWORD0, 0(BX) VMOVDQU XWORD1, 16(BX) @@ -236,57 +299,115 @@ avx_8blocks: VMOVDQU XWORD4, 64(BX) VMOVDQU XWORD5, 80(BX) VMOVDQU XWORD6, 96(BX) - VMOVDQU XWORD7, 112(BX) + VMOVDQU XWORD7, 112(BX) -avx_sm4_done: + JMP avxCbcSm4Octets + +avxCbcSm4Nibbles: + CMPQ DI, $64 + JLE avxCbCSm4Single + SUBQ $64, DI + LEAQ -64(DX), DX + LEAQ -64(BX), BX + + VMOVDQU 0(DX), XWORD0 + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + VMOVDQU 48(DX), XWORD3 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR -16(DX), XWORD0, XWORD0 + VPXOR 0(DX), XWORD1, XWORD1 + VPXOR 16(DX), XWORD2, XWORD2 + VPXOR 32(DX), XWORD3, XWORD3 + + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + VMOVDQU XWORD3, 48(BX) + +avxCbCSm4Single: + CMPQ DI, $16 + JEQ avxCbcSm4Single16 + + CMPQ DI, $32 + JEQ avxCbcSm4Single32 + + CMPQ DI, $48 + JEQ avxCbcSm4Single48 + + VMOVDQU -64(DX), XWORD0 + VMOVDQU -48(DX), XWORD1 + VMOVDQU -32(DX), XWORD2 + VMOVDQU -16(DX), XWORD3 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + VPXOR -64(DX), XWORD1, XWORD1 + VPXOR -48(DX), XWORD2, XWORD2 + VPXOR -32(DX), XWORD3, XWORD3 + + VMOVDQU XWORD0, -64(BX) + VMOVDQU XWORD1, -48(BX) + VMOVDQU XWORD2, -32(BX) + VMOVDQU XWORD3, -16(BX) + JMP avxCbcSm4Done + +avxCbcSm4Single16: + VMOVDQU -16(DX), XWORD0 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + + VMOVDQU XWORD0, -16(BX) + JMP avxCbcSm4Done + +avxCbcSm4Single32: + VMOVDQU -32(DX), XWORD0 + VMOVDQU -16(DX), XWORD1 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + VPXOR -32(DX), XWORD1, XWORD1 + + VMOVDQU XWORD0, -32(BX) + VMOVDQU XWORD1, -16(BX) + JMP avxCbcSm4Done + +avxCbcSm4Single48: + VMOVDQU -48(DX), XWORD0 + VMOVDQU -32(DX), XWORD1 + VMOVDQU 
-16(DX), XWORD2 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + VPXOR -48(DX), XWORD1, XWORD1 + VPXOR -32(DX), XWORD2, XWORD2 + + VMOVDQU XWORD0, -48(BX) + VMOVDQU XWORD1, -32(BX) + VMOVDQU XWORD2, -16(BX) + +avxCbcSm4Done: RET -avx2: +avx2Start: VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - CMPQ DI, $256 - JEQ avx2_16blocks - -avx2_8blocks: - VMOVDQU 0(DX), XDWORD0 - VMOVDQU 32(DX), XDWORD1 - VMOVDQU 64(DX), XDWORD2 - VMOVDQU 96(DX), XDWORD3 VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK - - // Apply Byte Flip Mask: LE -> BE - VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 - VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 - VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 - VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 - - // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - - AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - - // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - - VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK - VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 - VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 - VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 - VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 - - VPXOR 0(SI), XDWORD0, XDWORD0 - VPXOR 32(SI), XDWORD1, XDWORD1 - VPXOR 64(SI), XDWORD2, XDWORD2 - VPXOR 96(SI), XDWORD3, XDWORD3 - - VMOVDQU XDWORD0, 0(BX) - VMOVDQU XDWORD1, 32(BX) - VMOVDQU XDWORD2, 64(BX) - VMOVDQU XDWORD3, 96(BX) - - VZEROUPPER - RET + VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK avx2_16blocks: + CMPQ DI, $256 + JLE avx2CbcSm4Octets + SUBQ $256, DI + LEAQ -256(DX), DX + LEAQ -256(BX), BX + VMOVDQU 0(DX), XDWORD0 VMOVDQU 32(DX), XDWORD1 VMOVDQU 64(DX), XDWORD2 @@ -296,8 +417,6 @@ avx2_16blocks: VMOVDQU 192(DX), XDWORD6 VMOVDQU 224(DX), XDWORD7 - VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK - // Apply Byte Flip Mask: LE -> BE VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 @@ -309,33 +428,32 @@ avx2_16blocks: VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7 // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1) AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7) // Transpose matrix 4 x 4 32bits word - TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1) - VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK - VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 - VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 - VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 - VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 - VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4 - VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5 - VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6 - VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7 + VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0 + VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1 + VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2 + VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3 + VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4 + VPSHUFB 
BSWAP_MASK, XDWORD5, XDWORD5 + VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6 + VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7 - VPXOR 0(SI), XDWORD0, XDWORD0 - VPXOR 32(SI), XDWORD1, XDWORD1 - VPXOR 64(SI), XDWORD2, XDWORD2 - VPXOR 96(SI), XDWORD3, XDWORD3 - VPXOR 128(SI), XDWORD4, XDWORD4 - VPXOR 160(SI), XDWORD5, XDWORD5 - VPXOR 192(SI), XDWORD6, XDWORD6 - VPXOR 224(SI), XDWORD7, XDWORD7 + VPXOR -16(DX), XDWORD0, XDWORD0 + VPXOR 16(DX), XDWORD1, XDWORD1 + VPXOR 48(DX), XDWORD2, XDWORD2 + VPXOR 80(DX), XDWORD3, XDWORD3 + VPXOR 112(DX), XDWORD4, XDWORD4 + VPXOR 144(DX), XDWORD5, XDWORD5 + VPXOR 176(DX), XDWORD6, XDWORD6 + VPXOR 208(DX), XDWORD7, XDWORD7 VMOVDQU XDWORD0, 0(BX) VMOVDQU XDWORD1, 32(BX) @@ -346,6 +464,141 @@ avx2_16blocks: VMOVDQU XDWORD6, 192(BX) VMOVDQU XDWORD7, 224(BX) -avx2_sm4_done: + JMP avx2_16blocks + +avx2CbcSm4Octets: + CMPQ DI, $128 + JLE avx2CbcSm4Nibbles + SUBQ $128, DI + LEAQ -128(DX), DX + LEAQ -128(BX), BX + + VMOVDQU 0(DX), XDWORD0 + VMOVDQU 32(DX), XDWORD1 + VMOVDQU 64(DX), XDWORD2 + VMOVDQU 96(DX), XDWORD3 + + // Apply Byte Flip Mask: LE -> BE + VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 + VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 + VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 + VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + + AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + + VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0 + VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1 + VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2 + VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3 + + VPXOR -16(DX), XDWORD0, XDWORD0 + VPXOR 16(DX), XDWORD1, XDWORD1 + VPXOR 48(DX), XDWORD2, XDWORD2 + VPXOR 80(DX), XDWORD3, XDWORD3 + + VMOVDQU XDWORD0, 0(BX) + VMOVDQU XDWORD1, 32(BX) + VMOVDQU XDWORD2, 64(BX) + VMOVDQU XDWORD3, 96(BX) + + JMP avx2CbcSm4Octets + +avx2CbcSm4Nibbles: + CMPQ DI, $64 + JLE avx2CbCSm4Single + SUBQ $64, DI + LEAQ -64(DX), DX + LEAQ -64(BX), BX + + VMOVDQU 0(DX), XWORD0 + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + VMOVDQU 48(DX), XWORD3 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR -16(DX), XWORD0, XWORD0 + VPXOR 0(DX), XWORD1, XWORD1 + VPXOR 16(DX), XWORD2, XWORD2 + VPXOR 32(DX), XWORD3, XWORD3 + + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + VMOVDQU XWORD3, 48(BX) + +avx2CbCSm4Single: + CMPQ DI, $16 + JEQ avx2CbcSm4Single16 + + CMPQ DI, $32 + JEQ avx2CbcSm4Single32 + + CMPQ DI, $48 + JEQ avx2CbcSm4Single48 + + VMOVDQU -64(DX), XWORD0 + VMOVDQU -48(DX), XWORD1 + VMOVDQU -32(DX), XWORD2 + VMOVDQU -16(DX), XWORD3 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + VPXOR -64(DX), XWORD1, XWORD1 + VPXOR -48(DX), XWORD2, XWORD2 + VPXOR -32(DX), XWORD3, XWORD3 + + VMOVDQU XWORD0, -64(BX) + VMOVDQU XWORD1, -48(BX) + VMOVDQU XWORD2, -32(BX) + VMOVDQU XWORD3, -16(BX) + JMP avx2CbcSm4Done + +avx2CbcSm4Single16: + VMOVDQU -16(DX), XWORD0 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + + VMOVDQU XWORD0, -16(BX) + JMP avx2CbcSm4Done + +avx2CbcSm4Single32: + VMOVDQU -32(DX), XWORD0 + VMOVDQU -16(DX), XWORD1 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + VPXOR -32(DX), XWORD1, 
XWORD1 + + VMOVDQU XWORD0, -32(BX) + VMOVDQU XWORD1, -16(BX) + JMP avx2CbcSm4Done + +avx2CbcSm4Single48: + VMOVDQU -48(DX), XWORD0 + VMOVDQU -32(DX), XWORD1 + VMOVDQU -16(DX), XWORD2 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VPXOR 0(SI), XWORD0, XWORD0 + VPXOR -48(DX), XWORD1, XWORD1 + VPXOR -32(DX), XWORD2, XWORD2 + + VMOVDQU XWORD0, -48(BX) + VMOVDQU XWORD1, -32(BX) + VMOVDQU XWORD2, -16(BX) + +avx2CbcSm4Done: VZEROUPPER RET diff --git a/sm4/cbc_cipher_asm_arm64.s b/sm4/cbc_cipher_asm_arm64.s index 218c7aa..4cd9012 100644 --- a/sm4/cbc_cipher_asm_arm64.s +++ b/sm4/cbc_cipher_asm_arm64.s @@ -93,56 +93,35 @@ done_sm4: #define t6 V12 #define t7 V13 +#define dstPtr R1 +#define srcPtr R2 +#define rk R3 +#define rkSave R4 +#define srcPtrLen R5 +#define IV V18 + // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 LOAD_SM4_AESNI_CONSTS() - - MOVD xk+0(FP), R8 - MOVD dst+8(FP), R9 - MOVD src+32(FP), R10 - MOVD src_len+40(FP), R12 - MOVD iv+56(FP), R11 - - CMP $128, R12 - BEQ double_dec - - VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] - VREV32 t0.B16, t0.B16 - VREV32 t1.B16, t1.B16 - VREV32 t2.B16, t2.B16 - VREV32 t3.B16, t3.B16 - PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) - VEOR ZERO.B16, ZERO.B16, ZERO.B16 - EOR R0, R0 -encryptBlocksLoop: - SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3) - SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0) - SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1) - SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2) + MOVD xk+0(FP), rk + MOVD dst+8(FP), dstPtr + MOVD src+32(FP), srcPtr + MOVD src_len+40(FP), srcPtrLen + MOVD iv+56(FP), R6 + MOVD rk, rkSave + VLD1 (R6), [IV] - ADD $16, R0 - CMP $128, R0 - BNE encryptBlocksLoop +cbcSm4Octets: + CMP $128, srcPtrLen + BLE cbcSm4Nibbles + SUB $128, srcPtrLen + MOVD rkSave, rk + ADD srcPtr, srcPtrLen, R10 + SUB $16, R10, R11 + ADD dstPtr, srcPtrLen, R12 - TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) - VREV32 t0.B16, t0.B16 - VREV32 t1.B16, t1.B16 - VREV32 t2.B16, t2.B16 - VREV32 t3.B16, t3.B16 - - VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4] - - VEOR V6.B16, t0.B16, t0.B16 - VEOR V7.B16, t1.B16, t1.B16 - VEOR V8.B16, t2.B16, t2.B16 - VEOR V9.B16, t3.B16, t3.B16 - - VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9) - RET - -double_dec: VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4] VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4] VREV32 t0.B16, t0.B16 @@ -152,22 +131,21 @@ double_dec: VREV32 t4.B16, t4.B16 VREV32 t5.B16, t5.B16 VREV32 t6.B16, t6.B16 - VREV32 t7.B16, t7.B16 + VREV32 t7.B16, t7.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) - - VEOR ZERO.B16, ZERO.B16, ZERO.B16 EOR R0, R0 -decrypt8BlocksLoop: - SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7) - SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4) - SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5) - SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6) +cbc8BlocksLoop: + SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7) + SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4) + SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5) + SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6) ADD $16, R0 CMP $128, R0 - BNE decrypt8BlocksLoop + BNE cbc8BlocksLoop TRANSPOSE_MATRIX(t0, t1, t2, t3, x, 
y, XTMP6, XTMP7) TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) @@ -192,7 +170,180 @@ decrypt8BlocksLoop: VEOR V8.B16, t6.B16, t6.B16 VEOR V9.B16, t7.B16, t7.B16 - VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9) - VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9) + VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R12) + VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R12) + B cbcSm4Octets + +cbcSm4Nibbles: + CMP $64, srcPtrLen + BLE cbcSm4Single + SUB $64, srcPtrLen + MOVD rkSave, rk + ADD srcPtr, srcPtrLen, R10 + SUB $16, R10, R11 + ADD dstPtr, srcPtrLen, R12 + + VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + + EOR R0, R0 + +cbc4BlocksLoop: + SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE cbc4BlocksLoop + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + + VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4] + VEOR V6.B16, t0.B16, t0.B16 + VEOR V7.B16, t1.B16, t1.B16 + VEOR V8.B16, t2.B16, t2.B16 + VEOR V9.B16, t3.B16, t3.B16 + + VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R12) + +cbcSm4Single: + MOVD rkSave, rk + EOR R0, R0 + MOVD srcPtr, R10 + + CMP $16, srcPtrLen + BEQ cbcSm4Single16 + + CMP $32, srcPtrLen + BEQ cbcSm4Single32 + + CMP $48, srcPtrLen + BEQ cbcSm4Single48 + + // 4 blocks + VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + +cbc4BlocksLoop64: + SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE cbc4BlocksLoop64 + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + + VLD1 (srcPtr), [V6.S4, V7.S4, V8.S4] + VEOR IV.B16, t0.B16, t0.B16 + VEOR V6.B16, t1.B16, t1.B16 + VEOR V7.B16, t2.B16, t2.B16 + VEOR V8.B16, t3.B16, t3.B16 + + VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr) + + B cbcSm4Done + +cbcSm4Single16: + VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VREV32 t0.B16, t0.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + +cbc4BlocksLoop16: + SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE cbc4BlocksLoop16 + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + + VEOR IV.B16, t0.B16, t0.B16 + + VST1 [t0.S4], (dstPtr) + + B cbcSm4Done + +cbcSm4Single32: + VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + +cbc4BlocksLoop32: + SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE cbc4BlocksLoop32 + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 
t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + + VLD1 (srcPtr), [V6.S4] + VEOR IV.B16, t0.B16, t0.B16 + VEOR V6.B16, t1.B16, t1.B16 + + VST1 [t0.S4, t1.S4], (dstPtr) + B cbcSm4Done + +cbcSm4Single48: + VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + +cbc4BlocksLoop48: + SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE cbc4BlocksLoop48 + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + + VLD1 (srcPtr), [V6.S4, V7.S4] + VEOR IV.B16, t0.B16, t0.B16 + VEOR V6.B16, t1.B16, t1.B16 + VEOR V7.B16, t2.B16, t2.B16 + + VST1 [t0.S4, t1.S4, t2.S4], (dstPtr) + +cbcSm4Done: RET diff --git a/sm4/ecb_cipher_asm.go b/sm4/ecb_cipher_asm.go index 5cb136d..2c0458f 100644 --- a/sm4/ecb_cipher_asm.go +++ b/sm4/ecb_cipher_asm.go @@ -49,43 +49,17 @@ func (b *sm4CipherAsm) NewECBDecrypter() cipher.BlockMode { func (x *ecb) BlockSize() int { return BlockSize } +//go:noescape +func encryptSm4Ecb(xk *uint32, dst, src []byte) + func (x *ecb) CryptBlocks(dst, src []byte) { x.validate(dst, src) if len(src) == 0 { return } - for len(src) >= 2*x.b.blocksSize { - if x.enc == ecbEncrypt { - x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize]) - } else { - x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize]) - } - src = src[2*x.b.blocksSize:] - dst = dst[2*x.b.blocksSize:] - } - for len(src) >= x.b.blocksSize { - if x.enc == ecbEncrypt { - x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize]) - } else { - x.b.DecryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize]) - } - src = src[x.b.blocksSize:] - dst = dst[x.b.blocksSize:] - } - if len(src) > BlockSize { - temp := make([]byte, x.b.blocksSize) - copy(temp, src) - if x.enc == ecbEncrypt { - x.b.EncryptBlocks(temp, temp) - } else { - x.b.DecryptBlocks(temp, temp) - } - copy(dst, temp[:len(src)]) - } else if len(src) > 0 { - if x.enc == ecbEncrypt { - x.b.Encrypt(dst, src) - } else { - x.b.Decrypt(dst, src) - } + xk := &x.b.enc[0] + if x.enc == ecbDecrypt { + xk = &x.b.dec[0] } + encryptSm4Ecb(xk, dst, src) } diff --git a/sm4/ecb_sm4_amd64.s b/sm4/ecb_sm4_amd64.s new file mode 100644 index 0000000..d3643bd --- /dev/null +++ b/sm4/ecb_sm4_amd64.s @@ -0,0 +1,371 @@ +//go:build amd64 && !purego +// +build amd64,!purego + +#include "textflag.h" + +#include "aesni_macros_amd64.s" + +#define XDWTMP0 Y0 +#define XDWTMP1 Y1 + +#define XDWORD0 Y4 +#define XDWORD1 Y5 +#define XDWORD2 Y6 +#define XDWORD3 Y7 + +#define XDWORD4 Y10 +#define XDWORD5 Y11 +#define XDWORD6 Y12 +#define XDWORD7 Y14 + +#define XWTMP0 X0 +#define XWTMP1 X1 +#define XWTMP2 X2 + +#define XWORD0 X4 +#define XWORD1 X5 +#define XWORD2 X6 +#define XWORD3 X7 + +#define XWORD4 X10 +#define XWORD5 X11 +#define XWORD6 X12 +#define XWORD7 X14 + +#define NIBBLE_MASK Y3 +#define X_NIBBLE_MASK X3 + +#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE +#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE + +#define BSWAP_MASK Y2 + +#define XDWORD Y8 +#define YDWORD Y9 + +#define XWORD X8 +#define YWORD X9 + +// func encryptSm4Ecb(xk *uint32, dst, src []byte) +TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0 + MOVQ xk+0(FP), AX + MOVQ dst+8(FP), BX + MOVQ src+32(FP), DX + MOVQ src_len+40(FP), DI + + CMPB ·useAVX2(SB), $1 + 
JE avx2_start + + CMPB ·useAVX(SB), $1 + JE avxEcbSm4Octets + +ecbSm4Octets: + CMPQ DI, $128 + JB ecbSm4Nibbles + SUBQ $128, DI + + MOVOU 0(DX), XWORD0 + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + MOVOU 48(DX), XWORD3 + MOVOU 64(DX), XWORD4 + MOVOU 80(DX), XWORD5 + MOVOU 96(DX), XWORD6 + MOVOU 112(DX), XWORD7 + + SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) + + MOVOU XWORD0, 0(BX) + MOVOU XWORD1, 16(BX) + MOVOU XWORD2, 32(BX) + MOVOU XWORD3, 48(BX) + MOVOU XWORD4, 64(BX) + MOVOU XWORD5, 80(BX) + MOVOU XWORD6, 96(BX) + MOVOU XWORD7, 112(BX) + + LEAQ 128(BX), BX + LEAQ 128(DX), DX + JMP ecbSm4Octets + +ecbSm4Nibbles: + CMPQ DI, $64 + JB ecbSm4Single + SUBQ $64, DI + + MOVOU 0(DX), XWORD0 + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + MOVOU 48(DX), XWORD3 + + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + MOVUPS XWORD0, 0(BX) + MOVUPS XWORD1, 16(BX) + MOVUPS XWORD2, 32(BX) + MOVUPS XWORD3, 48(BX) + + LEAQ 64(BX), BX + LEAQ 64(DX), DX + +ecbSm4Single: + TESTQ DI, DI + JE ecbSm4Done + + MOVOU 0(DX), XWORD0 + CMPQ DI, $32 + JEQ ecbSm4Single32 + CMPQ DI, $48 + JEQ ecbSm4Single48 + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + MOVUPS XWORD0, 0(BX) + JMP ecbSm4Done + +ecbSm4Single32: + MOVOU 16(DX), XWORD1 + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + MOVUPS XWORD0, 0(BX) + MOVUPS XWORD1, 16(BX) + JMP ecbSm4Done + +ecbSm4Single48: + MOVOU 16(DX), XWORD1 + MOVOU 32(DX), XWORD2 + SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + MOVUPS XWORD0, 0(BX) + MOVUPS XWORD1, 16(BX) + MOVUPS XWORD2, 32(BX) + +ecbSm4Done: + RET + +avxEcbSm4Octets: + CMPQ DI, $128 + JB avxEcbSm4Nibbles + SUBQ $128, DI + + VMOVDQU 0(DX), XWORD0 + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + VMOVDQU 48(DX), XWORD3 + VMOVDQU 64(DX), XWORD4 + VMOVDQU 80(DX), XWORD5 + VMOVDQU 96(DX), XWORD6 + VMOVDQU 112(DX), XWORD7 + + AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) + + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + VMOVDQU XWORD3, 48(BX) + VMOVDQU XWORD4, 64(BX) + VMOVDQU XWORD5, 80(BX) + VMOVDQU XWORD6, 96(BX) + VMOVDQU XWORD7, 112(BX) + + LEAQ 128(BX), BX + LEAQ 128(DX), DX + JMP avxEcbSm4Octets + +avxEcbSm4Nibbles: + CMPQ DI, $64 + JB avxEcbSm4Single + SUBQ $64, DI + + VMOVDQU 0(DX), XWORD0 + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + VMOVDQU 48(DX), XWORD3 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + VMOVDQU XWORD3, 48(BX) + + LEAQ 64(BX), BX + LEAQ 64(DX), DX + +avxEcbSm4Single: + TESTQ DI, DI + JE avxEcbSm4Done + + VMOVDQU 0(DX), XWORD0 + CMPQ DI, $32 + JEQ avxEcbSm4Single32 + CMPQ DI, $48 + JEQ avxEcbSm4Single48 + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + VMOVDQU XWORD0, 0(BX) + JMP avxEcbSm4Done + +avxEcbSm4Single32: + VMOVDQU 16(DX), XWORD1 + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + JMP avxEcbSm4Done + +avxEcbSm4Single48: + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + +avxEcbSm4Done: + RET + +avx2_start: + 
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK + VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK + +avx2_16blocks: + CMPQ DI, $256 + JB avx2EcbSm4Octets + SUBQ $256, DI + + VMOVDQU 0(DX), XDWORD0 + VMOVDQU 32(DX), XDWORD1 + VMOVDQU 64(DX), XDWORD2 + VMOVDQU 96(DX), XDWORD3 + VMOVDQU 128(DX), XDWORD4 + VMOVDQU 160(DX), XDWORD5 + VMOVDQU 192(DX), XDWORD6 + VMOVDQU 224(DX), XDWORD7 + + // Apply Byte Flip Mask: LE -> BE + VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 + VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 + VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 + VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 + VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4 + VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5 + VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6 + VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1) + + AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1) + + VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0 + VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1 + VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2 + VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3 + VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4 + VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5 + VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6 + VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7 + + VMOVDQU XDWORD0, 0(BX) + VMOVDQU XDWORD1, 32(BX) + VMOVDQU XDWORD2, 64(BX) + VMOVDQU XDWORD3, 96(BX) + VMOVDQU XDWORD4, 128(BX) + VMOVDQU XDWORD5, 160(BX) + VMOVDQU XDWORD6, 192(BX) + VMOVDQU XDWORD7, 224(BX) + + LEAQ 256(BX), BX + LEAQ 256(DX), DX + JMP avx2_16blocks + +avx2EcbSm4Octets: + CMPQ DI, $128 + JB avx2EcbSm4Nibbles + SUBQ $128, DI + + VMOVDQU 0(DX), XDWORD0 + VMOVDQU 32(DX), XDWORD1 + VMOVDQU 64(DX), XDWORD2 + VMOVDQU 96(DX), XDWORD3 + + // Apply Byte Flip Mask: LE -> BE + VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 + VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 + VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 + VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3 + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + + AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + + // Transpose matrix 4 x 4 32bits word + TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1) + + VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0 + VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1 + VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2 + VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3 + + VMOVDQU XDWORD0, 0(BX) + VMOVDQU XDWORD1, 32(BX) + VMOVDQU XDWORD2, 64(BX) + VMOVDQU XDWORD3, 96(BX) + + LEAQ 128(BX), BX + LEAQ 128(DX), DX + JMP avx2EcbSm4Octets + +avx2EcbSm4Nibbles: + CMPQ DI, $64 + JB avx2EcbSm4Single + SUBQ $64, DI + + VMOVDQU 0(DX), XWORD0 + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + VMOVDQU 48(DX), XWORD3 + + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + VMOVDQU XWORD3, 48(BX) + + LEAQ 64(BX), BX + LEAQ 64(DX), DX + +avx2EcbSm4Single: + TESTQ DI, DI + JE avx2EcbSm4Done + + VMOVDQU 0(DX), XWORD0 + CMPQ DI, $32 + JEQ avx2EcbSm4Single32 + CMPQ DI, $48 + JEQ avx2EcbSm4Single48 + AVX_SM4_4BLOCKS(AX, XWORD, 
YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + VMOVDQU XWORD0, 0(BX) + JMP avx2EcbSm4Done + +avx2EcbSm4Single32: + VMOVDQU 16(DX), XWORD1 + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + JMP avx2EcbSm4Done + +avx2EcbSm4Single48: + VMOVDQU 16(DX), XWORD1 + VMOVDQU 32(DX), XWORD2 + AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) + VMOVDQU XWORD0, 0(BX) + VMOVDQU XWORD1, 16(BX) + VMOVDQU XWORD2, 32(BX) + +avx2EcbSm4Done: + VZEROUPPER + RET diff --git a/sm4/ecb_sm4_arm64.s b/sm4/ecb_sm4_arm64.s new file mode 100644 index 0000000..fddb197 --- /dev/null +++ b/sm4/ecb_sm4_arm64.s @@ -0,0 +1,207 @@ +//go:build arm64 && !purego +// +build arm64,!purego + +#include "textflag.h" + +#include "textflag.h" + +#define x V0 +#define y V1 +#define t0 V2 +#define t1 V3 +#define t2 V4 +#define t3 V5 +#define ZERO V16 +#define NIBBLE_MASK V20 +#define INVERSE_SHIFT_ROWS V21 +#define M1L V22 +#define M1H V23 +#define M2L V24 +#define M2H V25 +#define R08_MASK V26 +#define R16_MASK V27 +#define R24_MASK V28 +#define FK_MASK V29 +#define XTMP6 V6 +#define XTMP7 V7 +#define t4 V10 +#define t5 V11 +#define t6 V12 +#define t7 V13 + +#include "aesni_macros_arm64.s" + +// func encryptSm4Ecb(xk *uint32, dst, src []byte) +TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0 +#define dstPtr R1 +#define srcPtr R2 +#define rk R3 +#define rkSave R4 +#define srcPtrLen R5 + LOAD_SM4_AESNI_CONSTS() + VEOR ZERO.B16, ZERO.B16, ZERO.B16 + + MOVD xk+0(FP), rk + MOVD dst+8(FP), dstPtr + MOVD src+32(FP), srcPtr + MOVD src_len+40(FP), srcPtrLen + MOVD rk, rkSave + +ecbSm4Octets: + CMP $128, srcPtrLen + BLT ecbSm4Nibbles + SUB $128, srcPtrLen + MOVD rkSave, rk + + VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4] + VLD1.P 64(srcPtr), [t4.S4, t5.S4, t6.S4, t7.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + VREV32 t4.B16, t4.B16 + VREV32 t5.B16, t5.B16 + VREV32 t6.B16, t6.B16 + VREV32 t7.B16, t7.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) + + EOR R0, R0 + +ecb8BlocksLoop: + SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7) + SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4) + SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5) + SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6) + + ADD $16, R0 + CMP $128, R0 + BNE ecb8BlocksLoop + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + VREV32 t4.B16, t4.B16 + VREV32 t5.B16, t5.B16 + VREV32 t6.B16, t6.B16 + VREV32 t7.B16, t7.B16 + + VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr) + VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(dstPtr) + + B ecbSm4Octets + +ecbSm4Nibbles: + CMP $64, srcPtrLen + BLT ecbSm4Single + SUB $64, srcPtrLen + MOVD rkSave, rk + + VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + + EOR R0, R0 + +ecb4BlocksLoop: + SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP 
$128, R0 + BNE ecb4BlocksLoop + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VREV32 t3.B16, t3.B16 + VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr) + +ecbSm4Single: + CBZ srcPtrLen, ecbSm4Done + MOVD rkSave, rk + EOR R0, R0 + + CMP $16, srcPtrLen + BEQ ecbSm4Single16 + + CMP $32, srcPtrLen + BEQ ecbSm4Single32 + + CMP $48, srcPtrLen + BEQ ecbSm4Single48 + +ecbSm4Single16: + VLD1.P 16(srcPtr), [t0.S4] + VREV32 t0.B16, t0.B16 + +encryptBlocksLoop1: + SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE encryptBlocksLoop1 + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VST1.P [t0.S4], 16(dstPtr) + + B ecbSm4Done + +ecbSm4Single32: + VLD1.P 32(srcPtr), [t0.S4, t1.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + +encryptBlocksLoop2: + SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE encryptBlocksLoop2 + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VST1.P [t0.S4, t1.S4], 32(dstPtr) + + B ecbSm4Done + +ecbSm4Single48: + VLD1.P 48(srcPtr), [t0.S4, t1.S4, t2.S4] + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + +encryptBlocksLoop3: + SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3) + SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0) + SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1) + SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2) + + ADD $16, R0 + CMP $128, R0 + BNE encryptBlocksLoop3 + + TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7) + VREV32 t0.B16, t0.B16 + VREV32 t1.B16, t1.B16 + VREV32 t2.B16, t2.B16 + VST1.P [t0.S4, t1.S4, t2.S4], 48(dstPtr) +ecbSm4Done: + RET
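
Note on the CBC decryption change above: CryptBlocks now stashes the last ciphertext block as the next IV and hands the whole buffer to one decryptBlocksChain call, and the amd64/arm64 routines walk the ciphertext from the tail in 8/4-block batches (16-block batches under AVX2) down to a 1-4 block head. Working back to front is what keeps in-place (dst == src) decryption correct: each ciphertext block is consumed as the chaining input of the block after it before its own position is overwritten with plaintext. A minimal pure-Go sketch of the semantics the assembly is expected to implement follows; cbcDecryptChain and the round-trip check are illustrative only and not part of the change.

package main

import (
	"bytes"
	"crypto/cipher"
	"crypto/rand"
	"fmt"
	"io"

	"github.com/emmansun/gmsm/sm4"
)

// cbcDecryptChain models what decryptBlocksChain computes: decrypt each
// block and XOR it with the previous ciphertext block, using iv for the
// first block. Walking backwards keeps every ciphertext block intact
// until both of its uses are finished, so dst may alias src.
func cbcDecryptChain(b cipher.Block, dst, src, iv []byte) {
	bs := b.BlockSize()
	for start := len(src) - bs; start >= 0; start -= bs {
		prev := iv
		if start > 0 {
			prev = src[start-bs : start]
		}
		b.Decrypt(dst[start:start+bs], src[start:start+bs])
		for i := 0; i < bs; i++ {
			dst[start+i] ^= prev[i]
		}
	}
}

func main() {
	key := []byte("0123456789ABCDEF")
	block, err := sm4.NewCipher(key)
	if err != nil {
		panic(err)
	}
	iv := make([]byte, block.BlockSize())
	plaintext := make([]byte, 8*block.BlockSize())
	if _, err := io.ReadFull(rand.Reader, plaintext); err != nil {
		panic(err)
	}
	ciphertext := make([]byte, len(plaintext))
	cipher.NewCBCEncrypter(block, iv).CryptBlocks(ciphertext, plaintext)

	got := make([]byte, len(ciphertext))
	cbcDecryptChain(block, got, ciphertext, iv)
	fmt.Println("round trip ok:", bytes.Equal(got, plaintext))
}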
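
On the ECB side, ecb.CryptBlocks now only selects the encryption or decryption key schedule and calls encryptSm4Ecb once; the new assembly dispatches on the remaining length (128/64-byte batches, 256 under AVX2, then a 16/32/48-byte tail), so any multiple of the block size is handled in a single call. The expected output is simply the block cipher applied to each 16-byte block independently; a reference loop such as the following, with ecbReference as an illustrative helper, could serve as a cross-check in tests.

package ecbref

import "crypto/cipher"

// ecbReference applies blk to each block independently; for the
// decryption direction, substitute blk.Decrypt. encryptSm4Ecb with the
// matching key schedule should produce identical output for any input
// whose length is a multiple of blk.BlockSize().
func ecbReference(blk cipher.Block, dst, src []byte) {
	bs := blk.BlockSize()
	for i := 0; i+bs <= len(src); i += bs {
		blk.Encrypt(dst[i:i+bs], src[i:i+bs])
	}
}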
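
Because CBC decryption and ECB now make a single assembly call per CryptBlocks invocation regardless of buffer size, a throughput benchmark alongside TestSM4CBCRandom would make the effect of this refactoring measurable. A possible sketch, assuming the same key-as-IV convention the new test uses (the benchmark name and the 1 KiB buffer size are arbitrary choices):

package cipher_test

import (
	"crypto/cipher"
	"testing"

	"github.com/emmansun/gmsm/sm4"
)

// BenchmarkSM4CBCDecrypt1K measures CryptBlocks throughput on a 1 KiB
// buffer, decrypting in place so only the cipher work is timed.
func BenchmarkSM4CBCDecrypt1K(b *testing.B) {
	key := []byte("0123456789ABCDEF")
	c, err := sm4.NewCipher(key)
	if err != nil {
		b.Fatal(err)
	}
	decrypter := cipher.NewCBCDecrypter(c, key)
	buf := make([]byte, 1024)
	b.SetBytes(int64(len(buf)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		decrypter.CryptBlocks(buf, buf)
	}
}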