sm4: cbc ecb mode enhancement

Sun Yimin 2023-08-07 16:43:29 +08:00 committed by GitHub
parent e00fbe696d
commit 4e50b3dd6b
7 changed files with 1200 additions and 245 deletions


@@ -3,8 +3,10 @@ package cipher_test
import (
"bytes"
"crypto/cipher"
"crypto/rand"
"encoding/hex"
"fmt"
"io"
"testing"
"github.com/emmansun/gmsm/padding"
@@ -185,3 +187,24 @@ func TestCBCDecrypterSM4(t *testing.T) {
}
}
}
func TestSM4CBCRandom(t *testing.T) {
key := []byte("0123456789ABCDEF")
c, err := sm4.NewCipher(key)
if err != nil {
t.Fatal(err)
}
encrypter := cipher.NewCBCEncrypter(c, key)
decrypter := cipher.NewCBCDecrypter(c, key)
for i := 1; i <= 50; i++ {
plaintext := make([]byte, i*16)
ciphertext := make([]byte, i*16)
got := make([]byte, i*16)
io.ReadFull(rand.Reader, plaintext)
encrypter.CryptBlocks(ciphertext, plaintext)
decrypter.CryptBlocks(got, ciphertext)
if !bytes.Equal(got, plaintext) {
t.Errorf("test %v blocks failed", i)
}
}
}
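
The random round-trip test above drives the new assembly through every tail width: 1 to 50 blocks covers the 16/32/48/64-byte single passes as well as the larger 64-, 128- and 256-byte chunked paths. A variant worth keeping in mind (not part of this commit) is an in-place check, since crypto/cipher allows dst and src to overlap entirely; a minimal sketch using only the imports already present in this test file:

func TestSM4CBCInPlaceSketch(t *testing.T) {
    key := []byte("0123456789ABCDEF")
    c, err := sm4.NewCipher(key)
    if err != nil {
        t.Fatal(err)
    }
    for blocks := 1; blocks <= 20; blocks++ {
        plaintext := make([]byte, blocks*16)
        if _, err := io.ReadFull(rand.Reader, plaintext); err != nil {
            t.Fatal(err)
        }
        buf := make([]byte, len(plaintext))
        copy(buf, plaintext)
        // Encrypt and decrypt in the same buffer; the IV doubles as the key,
        // mirroring the test above.
        cipher.NewCBCEncrypter(c, key).CryptBlocks(buf, buf)
        cipher.NewCBCDecrypter(c, key).CryptBlocks(buf, buf)
        if !bytes.Equal(buf, plaintext) {
            t.Errorf("in-place round trip failed for %d blocks", blocks)
        }
    }
}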


@@ -74,31 +74,7 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
// Copy the last block of ciphertext in preparation as the new iv.
copy(x.tmp, src[end-BlockSize:end])
decKeyPtr := &x.b.dec[0]
start := end - 2*x.b.blocksSize
for start > 0 {
decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
end = start
start -= 2*x.b.blocksSize
}
start = end - x.b.blocksSize
for start > 0 {
decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
end = start
start -= x.b.blocksSize
}
// Handle remain first blocks
var temp []byte = make([]byte, x.b.blocksSize)
var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
copy(batchSrc, x.iv)
copy(batchSrc[BlockSize:], src[:end])
decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
copy(dst, temp[:end])
decryptBlocksChain(&x.b.dec[0], dst, src, &x.iv[0])
// Set the new iv to the first block we copied earlier.
x.iv, x.tmp = x.tmp, x.iv
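
With this change CryptBlocks hands the whole buffer to a single decryptBlocksChain call and only swaps x.iv and x.tmp afterwards; the assembly walks the ciphertext from the tail, so dst and src may fully overlap. As a reference for what that routine computes, here is a minimal pure-Go sketch of chained CBC decryption (illustrative only, not part of the commit); it saves each ciphertext block before decrypting so that an in-place call stays correct:

package sketch

import "crypto/cipher"

// cbcDecryptRef mirrors the semantics of decryptBlocksChain: decrypt every
// block, then XOR it with the previous ciphertext block, using iv for the
// very first block.
func cbcDecryptRef(b cipher.Block, iv, dst, src []byte) {
    bs := b.BlockSize()
    prev := append([]byte(nil), iv...)
    cur := make([]byte, bs)
    for i := 0; i < len(src); i += bs {
        copy(cur, src[i:i+bs]) // keep the ciphertext block: dst may alias src
        b.Decrypt(dst[i:i+bs], src[i:i+bs])
        for j := 0; j < bs; j++ {
            dst[i+j] ^= prev[j]
        }
        prev, cur = cur, prev
    }
}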


@@ -78,7 +78,6 @@ done_sm4:
#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2
#define XDWORD0 Y4
#define XDWORD1 Y5
@@ -110,6 +109,8 @@ done_sm4:
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE
#define BSWAP_MASK Y2
#define XDWORD Y8
#define YDWORD Y9
@@ -124,36 +125,22 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
MOVQ src_len+40(FP), DI
MOVQ iv+56(FP), SI
LEAQ (DX)(DI*1), DX
LEAQ (BX)(DI*1), BX
CMPB ·useAVX2(SB), $1
JE avx2
JE avx2Start
CMPB ·useAVX(SB), $1
JE avx
JE avxCbcSm4Octets
non_avx2_start:
cbcSm4Octets:
CMPQ DI, $128
JEQ sse_8blocks
JLE cbcSm4Nibbles
SUBQ $128, DI
LEAQ -128(DX), DX
LEAQ -128(BX), BX
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
PXOR 16(SI), XWORD1
PXOR 32(SI), XWORD2
PXOR 48(SI), XWORD3
MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
MOVUPS XWORD2, 32(BX)
MOVUPS XWORD3, 48(BX)
RET
sse_8blocks:
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
@@ -165,14 +152,14 @@ sse_8blocks:
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
PXOR 0(SI), XWORD0
PXOR 16(SI), XWORD1
PXOR 32(SI), XWORD2
PXOR 48(SI), XWORD3
PXOR 64(SI), XWORD4
PXOR 80(SI), XWORD5
PXOR 96(SI), XWORD6
PXOR 112(SI), XWORD7
PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3
PXOR 48(DX), XWORD4
PXOR 64(DX), XWORD5
PXOR 80(DX), XWORD6
PXOR 96(DX), XWORD7
MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX)
@@ -181,34 +168,110 @@ sse_8blocks:
MOVOU XWORD4, 64(BX)
MOVOU XWORD5, 80(BX)
MOVOU XWORD6, 96(BX)
MOVOU XWORD7, 112(BX)
MOVOU XWORD7, 112(BX)
done_sm4:
JMP cbcSm4Octets
cbcSm4Nibbles:
CMPQ DI, $64
JLE cbCSm4Single
SUBQ $64, DI
LEAQ -64(DX), DX
LEAQ -64(BX), BX
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3
MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
MOVUPS XWORD2, 32(BX)
MOVUPS XWORD3, 48(BX)
cbCSm4Single:
CMPQ DI, $16
JEQ cbcSm4Single16
CMPQ DI, $32
JEQ cbcSm4Single32
CMPQ DI, $48
JEQ cbcSm4Single48
MOVOU -64(DX), XWORD0
MOVOU -48(DX), XWORD1
MOVOU -32(DX), XWORD2
MOVOU -16(DX), XWORD3
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
PXOR -64(DX), XWORD1
PXOR -48(DX), XWORD2
PXOR -32(DX), XWORD3
MOVUPS XWORD0, -64(BX)
MOVUPS XWORD1, -48(BX)
MOVUPS XWORD2, -32(BX)
MOVUPS XWORD3, -16(BX)
JMP cbcSm4Done
cbcSm4Single16:
MOVOU -16(DX), XWORD0
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
MOVUPS XWORD0, -16(BX)
JMP cbcSm4Done
cbcSm4Single32:
MOVOU -32(DX), XWORD0
MOVOU -16(DX), XWORD1
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
PXOR -32(DX), XWORD1
MOVUPS XWORD0, -32(BX)
MOVUPS XWORD1, -16(BX)
JMP cbcSm4Done
cbcSm4Single48:
MOVOU -48(DX), XWORD0
MOVOU -32(DX), XWORD1
MOVOU -16(DX), XWORD2
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
PXOR 0(SI), XWORD0
PXOR -48(DX), XWORD1
PXOR -32(DX), XWORD2
MOVUPS XWORD0, -48(BX)
MOVUPS XWORD1, -32(BX)
MOVUPS XWORD2, -16(BX)
cbcSm4Done:
RET
avx:
avxCbcSm4Octets:
CMPQ DI, $128
JEQ avx_8blocks
JLE avxCbcSm4Nibbles
SUBQ $128, DI
LEAQ -128(DX), DX
LEAQ -128(BX), BX
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR 16(SI), XWORD1, XWORD1
VPXOR 32(SI), XWORD2, XWORD2
VPXOR 48(SI), XWORD3, XWORD3
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
RET
avx_8blocks:
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
@@ -219,15 +282,15 @@ avx_8blocks:
VMOVDQU 112(DX), XWORD7
AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR 16(SI), XWORD1, XWORD1
VPXOR 32(SI), XWORD2, XWORD2
VPXOR 48(SI), XWORD3, XWORD3
VPXOR 64(SI), XWORD4, XWORD4
VPXOR 80(SI), XWORD5, XWORD5
VPXOR 96(SI), XWORD6, XWORD6
VPXOR 112(SI), XWORD7, XWORD7
VPXOR -16(DX), XWORD0, XWORD0
VPXOR 0(DX), XWORD1, XWORD1
VPXOR 16(DX), XWORD2, XWORD2
VPXOR 32(DX), XWORD3, XWORD3
VPXOR 48(DX), XWORD4, XWORD4
VPXOR 64(DX), XWORD5, XWORD5
VPXOR 80(DX), XWORD6, XWORD6
VPXOR 96(DX), XWORD7, XWORD7
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
@@ -236,57 +299,115 @@ avx_8blocks:
VMOVDQU XWORD4, 64(BX)
VMOVDQU XWORD5, 80(BX)
VMOVDQU XWORD6, 96(BX)
VMOVDQU XWORD7, 112(BX)
VMOVDQU XWORD7, 112(BX)
avx_sm4_done:
JMP avxCbcSm4Octets
avxCbcSm4Nibbles:
CMPQ DI, $64
JLE avxCbCSm4Single
SUBQ $64, DI
LEAQ -64(DX), DX
LEAQ -64(BX), BX
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR -16(DX), XWORD0, XWORD0
VPXOR 0(DX), XWORD1, XWORD1
VPXOR 16(DX), XWORD2, XWORD2
VPXOR 32(DX), XWORD3, XWORD3
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
avxCbCSm4Single:
CMPQ DI, $16
JEQ avxCbcSm4Single16
CMPQ DI, $32
JEQ avxCbcSm4Single32
CMPQ DI, $48
JEQ avxCbcSm4Single48
VMOVDQU -64(DX), XWORD0
VMOVDQU -48(DX), XWORD1
VMOVDQU -32(DX), XWORD2
VMOVDQU -16(DX), XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR -64(DX), XWORD1, XWORD1
VPXOR -48(DX), XWORD2, XWORD2
VPXOR -32(DX), XWORD3, XWORD3
VMOVDQU XWORD0, -64(BX)
VMOVDQU XWORD1, -48(BX)
VMOVDQU XWORD2, -32(BX)
VMOVDQU XWORD3, -16(BX)
JMP avxCbcSm4Done
avxCbcSm4Single16:
VMOVDQU -16(DX), XWORD0
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VMOVDQU XWORD0, -16(BX)
JMP avxCbcSm4Done
avxCbcSm4Single32:
VMOVDQU -32(DX), XWORD0
VMOVDQU -16(DX), XWORD1
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR -32(DX), XWORD1, XWORD1
VMOVDQU XWORD0, -32(BX)
VMOVDQU XWORD1, -16(BX)
JMP avxCbcSm4Done
avxCbcSm4Single48:
VMOVDQU -48(DX), XWORD0
VMOVDQU -32(DX), XWORD1
VMOVDQU -16(DX), XWORD2
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR -48(DX), XWORD1, XWORD1
VPXOR -32(DX), XWORD2, XWORD2
VMOVDQU XWORD0, -48(BX)
VMOVDQU XWORD1, -32(BX)
VMOVDQU XWORD2, -16(BX)
avxCbcSm4Done:
RET
avx2:
avx2Start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
CMPQ DI, $256
JEQ avx2_16blocks
avx2_8blocks:
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VPXOR 0(SI), XDWORD0, XDWORD0
VPXOR 32(SI), XDWORD1, XDWORD1
VPXOR 64(SI), XDWORD2, XDWORD2
VPXOR 96(SI), XDWORD3, XDWORD3
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
VZEROUPPER
RET
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
avx2_16blocks:
CMPQ DI, $256
JLE avx2CbcSm4Octets
SUBQ $256, DI
LEAQ -256(DX), DX
LEAQ -256(BX), BX
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
@@ -296,8 +417,6 @@ avx2_16blocks:
VMOVDQU 192(DX), XDWORD6
VMOVDQU 224(DX), XDWORD7
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
@@ -309,33 +428,32 @@ avx2_16blocks:
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7
VPXOR 0(SI), XDWORD0, XDWORD0
VPXOR 32(SI), XDWORD1, XDWORD1
VPXOR 64(SI), XDWORD2, XDWORD2
VPXOR 96(SI), XDWORD3, XDWORD3
VPXOR 128(SI), XDWORD4, XDWORD4
VPXOR 160(SI), XDWORD5, XDWORD5
VPXOR 192(SI), XDWORD6, XDWORD6
VPXOR 224(SI), XDWORD7, XDWORD7
VPXOR -16(DX), XDWORD0, XDWORD0
VPXOR 16(DX), XDWORD1, XDWORD1
VPXOR 48(DX), XDWORD2, XDWORD2
VPXOR 80(DX), XDWORD3, XDWORD3
VPXOR 112(DX), XDWORD4, XDWORD4
VPXOR 144(DX), XDWORD5, XDWORD5
VPXOR 176(DX), XDWORD6, XDWORD6
VPXOR 208(DX), XDWORD7, XDWORD7
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
@@ -346,6 +464,141 @@ avx2_16blocks:
VMOVDQU XDWORD6, 192(BX)
VMOVDQU XDWORD7, 224(BX)
avx2_sm4_done:
JMP avx2_16blocks
avx2CbcSm4Octets:
CMPQ DI, $128
JLE avx2CbcSm4Nibbles
SUBQ $128, DI
LEAQ -128(DX), DX
LEAQ -128(BX), BX
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
VPXOR -16(DX), XDWORD0, XDWORD0
VPXOR 16(DX), XDWORD1, XDWORD1
VPXOR 48(DX), XDWORD2, XDWORD2
VPXOR 80(DX), XDWORD3, XDWORD3
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
JMP avx2CbcSm4Octets
avx2CbcSm4Nibbles:
CMPQ DI, $64
JLE avx2CbCSm4Single
SUBQ $64, DI
LEAQ -64(DX), DX
LEAQ -64(BX), BX
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR -16(DX), XWORD0, XWORD0
VPXOR 0(DX), XWORD1, XWORD1
VPXOR 16(DX), XWORD2, XWORD2
VPXOR 32(DX), XWORD3, XWORD3
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
avx2CbCSm4Single:
CMPQ DI, $16
JEQ avx2CbcSm4Single16
CMPQ DI, $32
JEQ avx2CbcSm4Single32
CMPQ DI, $48
JEQ avx2CbcSm4Single48
VMOVDQU -64(DX), XWORD0
VMOVDQU -48(DX), XWORD1
VMOVDQU -32(DX), XWORD2
VMOVDQU -16(DX), XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR -64(DX), XWORD1, XWORD1
VPXOR -48(DX), XWORD2, XWORD2
VPXOR -32(DX), XWORD3, XWORD3
VMOVDQU XWORD0, -64(BX)
VMOVDQU XWORD1, -48(BX)
VMOVDQU XWORD2, -32(BX)
VMOVDQU XWORD3, -16(BX)
JMP avx2CbcSm4Done
avx2CbcSm4Single16:
VMOVDQU -16(DX), XWORD0
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VMOVDQU XWORD0, -16(BX)
JMP avx2CbcSm4Done
avx2CbcSm4Single32:
VMOVDQU -32(DX), XWORD0
VMOVDQU -16(DX), XWORD1
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR -32(DX), XWORD1, XWORD1
VMOVDQU XWORD0, -32(BX)
VMOVDQU XWORD1, -16(BX)
JMP avx2CbcSm4Done
avx2CbcSm4Single48:
VMOVDQU -48(DX), XWORD0
VMOVDQU -32(DX), XWORD1
VMOVDQU -16(DX), XWORD2
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VPXOR 0(SI), XWORD0, XWORD0
VPXOR -48(DX), XWORD1, XWORD1
VPXOR -32(DX), XWORD2, XWORD2
VMOVDQU XWORD0, -48(BX)
VMOVDQU XWORD1, -32(BX)
VMOVDQU XWORD2, -16(BX)
avx2CbcSm4Done:
VZEROUPPER
RET
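
On amd64 the routine first advances DX and BX to the end of src and dst and then works backwards: 128-byte octets are peeled off the tail while more than 128 bytes remain, one 64-byte nibble follows while more than 64 bytes remain, and the leading 16 to 64 bytes are finished in a single pass whose first block is XORed with the caller-supplied IV. Below is a small illustrative Go helper (not part of the commit) that reproduces the SSE/AVX schedule for an input of n bytes, assumed to be a positive multiple of 16; the AVX2 path additionally peels 256-byte (16-block) chunks off the tail before falling back to the same 128/64/tail steps.

package sketch

// cbcTailSchedule lists, from the tail of the buffer towards its start, the
// chunk sizes the amd64 decryptBlocksChain processes on the SSE/AVX path.
func cbcTailSchedule(n int) []int {
    var chunks []int
    for n > 128 { // cbcSm4Octets: 8 blocks at a time from the tail
        chunks = append(chunks, 128)
        n -= 128
    }
    if n > 64 { // cbcSm4Nibbles: at most one 4-block chunk
        chunks = append(chunks, 64)
        n -= 64
    }
    chunks = append(chunks, n) // cbCSm4Single: leading 16/32/48/64 bytes, chained with the IV
    return chunks
}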


@@ -93,56 +93,35 @@ done_sm4:
#define t6 V12
#define t7 V13
#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5
#define IV V18
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
LOAD_SM4_AESNI_CONSTS()
MOVD xk+0(FP), R8
MOVD dst+8(FP), R9
MOVD src+32(FP), R10
MOVD src_len+40(FP), R12
MOVD iv+56(FP), R11
CMP $128, R12
BEQ double_dec
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VEOR ZERO.B16, ZERO.B16, ZERO.B16
EOR R0, R0
encryptBlocksLoop:
SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)
MOVD xk+0(FP), rk
MOVD dst+8(FP), dstPtr
MOVD src+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD iv+56(FP), R6
MOVD rk, rkSave
VLD1 (R6), [IV]
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop
cbcSm4Octets:
CMP $128, srcPtrLen
BLE cbcSm4Nibbles
SUB $128, srcPtrLen
MOVD rkSave, rk
ADD srcPtr, srcPtrLen, R10
SUB $16, R10, R11
ADD dstPtr, srcPtrLen, R12
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]
VEOR V6.B16, t0.B16, t0.B16
VEOR V7.B16, t1.B16, t1.B16
VEOR V8.B16, t2.B16, t2.B16
VEOR V9.B16, t3.B16, t3.B16
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
RET
double_dec:
VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
VREV32 t0.B16, t0.B16
@@ -152,22 +131,21 @@ double_dec:
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
VREV32 t7.B16, t7.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
VEOR ZERO.B16, ZERO.B16, ZERO.B16
EOR R0, R0
decrypt8BlocksLoop:
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
cbc8BlocksLoop:
SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
ADD $16, R0
CMP $128, R0
BNE decrypt8BlocksLoop
BNE cbc8BlocksLoop
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
@@ -192,7 +170,180 @@ decrypt8BlocksLoop:
VEOR V8.B16, t6.B16, t6.B16
VEOR V9.B16, t7.B16, t7.B16
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R12)
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R12)
B cbcSm4Octets
cbcSm4Nibbles:
CMP $64, srcPtrLen
BLE cbcSm4Single
SUB $64, srcPtrLen
MOVD rkSave, rk
ADD srcPtr, srcPtrLen, R10
SUB $16, R10, R11
ADD dstPtr, srcPtrLen, R12
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
EOR R0, R0
cbc4BlocksLoop:
SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE cbc4BlocksLoop
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]
VEOR V6.B16, t0.B16, t0.B16
VEOR V7.B16, t1.B16, t1.B16
VEOR V8.B16, t2.B16, t2.B16
VEOR V9.B16, t3.B16, t3.B16
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R12)
cbcSm4Single:
MOVD rkSave, rk
EOR R0, R0
MOVD srcPtr, R10
CMP $16, srcPtrLen
BEQ cbcSm4Single16
CMP $32, srcPtrLen
BEQ cbcSm4Single32
CMP $48, srcPtrLen
BEQ cbcSm4Single48
// 4 blocks
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
cbc4BlocksLoop64:
SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE cbc4BlocksLoop64
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VLD1 (srcPtr), [V6.S4, V7.S4, V8.S4]
VEOR IV.B16, t0.B16, t0.B16
VEOR V6.B16, t1.B16, t1.B16
VEOR V7.B16, t2.B16, t2.B16
VEOR V8.B16, t3.B16, t3.B16
VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)
B cbcSm4Done
cbcSm4Single16:
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
cbc4BlocksLoop16:
SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE cbc4BlocksLoop16
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VEOR IV.B16, t0.B16, t0.B16
VST1 [t0.S4], (dstPtr)
B cbcSm4Done
cbcSm4Single32:
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
cbc4BlocksLoop32:
SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE cbc4BlocksLoop32
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VLD1 (srcPtr), [V6.S4]
VEOR IV.B16, t0.B16, t0.B16
VEOR V6.B16, t1.B16, t1.B16
VST1 [t0.S4, t1.S4], (dstPtr)
B cbcSm4Done
cbcSm4Single48:
VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
cbc4BlocksLoop48:
SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE cbc4BlocksLoop48
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VLD1 (srcPtr), [V6.S4, V7.S4]
VEOR IV.B16, t0.B16, t0.B16
VEOR V6.B16, t1.B16, t1.B16
VEOR V7.B16, t2.B16, t2.B16
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
cbcSm4Done:
RET


@@ -49,43 +49,17 @@ func (b *sm4CipherAsm) NewECBDecrypter() cipher.BlockMode {
func (x *ecb) BlockSize() int { return BlockSize }
//go:noescape
func encryptSm4Ecb(xk *uint32, dst, src []byte)
func (x *ecb) CryptBlocks(dst, src []byte) {
x.validate(dst, src)
if len(src) == 0 {
return
}
for len(src) >= 2*x.b.blocksSize {
if x.enc == ecbEncrypt {
x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
} else {
x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
}
src = src[2*x.b.blocksSize:]
dst = dst[2*x.b.blocksSize:]
}
for len(src) >= x.b.blocksSize {
if x.enc == ecbEncrypt {
x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
} else {
x.b.DecryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
}
src = src[x.b.blocksSize:]
dst = dst[x.b.blocksSize:]
}
if len(src) > BlockSize {
temp := make([]byte, x.b.blocksSize)
copy(temp, src)
if x.enc == ecbEncrypt {
x.b.EncryptBlocks(temp, temp)
} else {
x.b.DecryptBlocks(temp, temp)
}
copy(dst, temp[:len(src)])
} else if len(src) > 0 {
if x.enc == ecbEncrypt {
x.b.Encrypt(dst, src)
} else {
x.b.Decrypt(dst, src)
}
xk := &x.b.enc[0]
if x.enc == ecbDecrypt {
xk = &x.b.dec[0]
}
encryptSm4Ecb(xk, dst, src)
}
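
The ECB mode now selects the encryption or decryption round-key schedule and hands the whole input to one //go:noescape assembly routine; since ECB has no chaining, the two directions differ only in the key schedule. A minimal pure-Go sketch of the per-block semantics (illustrative, not part of the commit):

package sketch

import "crypto/cipher"

// ecbCryptRef transforms every block independently, which is what
// encryptSm4Ecb does with whichever key schedule it is given.
func ecbCryptRef(b cipher.Block, decrypt bool, dst, src []byte) {
    bs := b.BlockSize()
    for i := 0; i+bs <= len(src); i += bs {
        if decrypt {
            b.Decrypt(dst[i:i+bs], src[i:i+bs])
        } else {
            b.Encrypt(dst[i:i+bs], src[i:i+bs])
        }
    }
}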

sm4/ecb_sm4_amd64.s (new file, 371 lines)

@@ -0,0 +1,371 @@
//go:build amd64 && !purego
// +build amd64,!purego
#include "textflag.h"
#include "aesni_macros_amd64.s"
#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7
#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14
#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2
#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7
#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14
#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE
#define BSWAP_MASK Y2
#define XDWORD Y8
#define YDWORD Y9
#define XWORD X8
#define YWORD X9
// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
MOVQ dst+8(FP), BX
MOVQ src+32(FP), DX
MOVQ src_len+40(FP), DI
CMPB ·useAVX2(SB), $1
JE avx2_start
CMPB ·useAVX(SB), $1
JE avxEcbSm4Octets
ecbSm4Octets:
CMPQ DI, $128
JB ecbSm4Nibbles
SUBQ $128, DI
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
MOVOU 64(DX), XWORD4
MOVOU 80(DX), XWORD5
MOVOU 96(DX), XWORD6
MOVOU 112(DX), XWORD7
SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX)
MOVOU XWORD2, 32(BX)
MOVOU XWORD3, 48(BX)
MOVOU XWORD4, 64(BX)
MOVOU XWORD5, 80(BX)
MOVOU XWORD6, 96(BX)
MOVOU XWORD7, 112(BX)
LEAQ 128(BX), BX
LEAQ 128(DX), DX
JMP ecbSm4Octets
ecbSm4Nibbles:
CMPQ DI, $64
JB ecbSm4Single
SUBQ $64, DI
MOVOU 0(DX), XWORD0
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
MOVOU 48(DX), XWORD3
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
MOVUPS XWORD2, 32(BX)
MOVUPS XWORD3, 48(BX)
LEAQ 64(BX), BX
LEAQ 64(DX), DX
ecbSm4Single:
TESTQ DI, DI
JE ecbSm4Done
MOVOU 0(DX), XWORD0
CMPQ DI, $32
JEQ ecbSm4Single32
CMPQ DI, $48
JEQ ecbSm4Single48
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
MOVUPS XWORD0, 0(BX)
JMP ecbSm4Done
ecbSm4Single32:
MOVOU 16(DX), XWORD1
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
JMP ecbSm4Done
ecbSm4Single48:
MOVOU 16(DX), XWORD1
MOVOU 32(DX), XWORD2
SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
MOVUPS XWORD2, 32(BX)
ecbSm4Done:
RET
avxEcbSm4Octets:
CMPQ DI, $128
JB avxEcbSm4Nibbles
SUBQ $128, DI
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
VMOVDQU 64(DX), XWORD4
VMOVDQU 80(DX), XWORD5
VMOVDQU 96(DX), XWORD6
VMOVDQU 112(DX), XWORD7
AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
VMOVDQU XWORD4, 64(BX)
VMOVDQU XWORD5, 80(BX)
VMOVDQU XWORD6, 96(BX)
VMOVDQU XWORD7, 112(BX)
LEAQ 128(BX), BX
LEAQ 128(DX), DX
JMP avxEcbSm4Octets
avxEcbSm4Nibbles:
CMPQ DI, $64
JB avxEcbSm4Single
SUBQ $64, DI
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
LEAQ 64(BX), BX
LEAQ 64(DX), DX
avxEcbSm4Single:
TESTQ DI, DI
JE avxEcbSm4Done
VMOVDQU 0(DX), XWORD0
CMPQ DI, $32
JEQ avxEcbSm4Single32
CMPQ DI, $48
JEQ avxEcbSm4Single48
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
JMP avxEcbSm4Done
avxEcbSm4Single32:
VMOVDQU 16(DX), XWORD1
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
JMP avxEcbSm4Done
avxEcbSm4Single48:
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
avxEcbSm4Done:
RET
avx2_start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
avx2_16blocks:
CMPQ DI, $256
JB avx2EcbSm4Octets
SUBQ $256, DI
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VMOVDQU 128(DX), XDWORD4
VMOVDQU 160(DX), XDWORD5
VMOVDQU 192(DX), XDWORD6
VMOVDQU 224(DX), XDWORD7
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
VMOVDQU XDWORD4, 128(BX)
VMOVDQU XDWORD5, 160(BX)
VMOVDQU XDWORD6, 192(BX)
VMOVDQU XDWORD7, 224(BX)
LEAQ 256(BX), BX
LEAQ 256(DX), DX
JMP avx2_16blocks
avx2EcbSm4Octets:
CMPQ DI, $128
JB avx2EcbSm4Nibbles
SUBQ $128, DI
VMOVDQU 0(DX), XDWORD0
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
// Transpose matrix 4 x 4 32bits word
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
VMOVDQU XDWORD0, 0(BX)
VMOVDQU XDWORD1, 32(BX)
VMOVDQU XDWORD2, 64(BX)
VMOVDQU XDWORD3, 96(BX)
LEAQ 128(BX), BX
LEAQ 128(DX), DX
JMP avx2EcbSm4Octets
avx2EcbSm4Nibbles:
CMPQ DI, $64
JB avx2EcbSm4Single
SUBQ $64, DI
VMOVDQU 0(DX), XWORD0
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
VMOVDQU 48(DX), XWORD3
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
VMOVDQU XWORD3, 48(BX)
LEAQ 64(BX), BX
LEAQ 64(DX), DX
avx2EcbSm4Single:
TESTQ DI, DI
JE avx2EcbSm4Done
VMOVDQU 0(DX), XWORD0
CMPQ DI, $32
JEQ avx2EcbSm4Single32
CMPQ DI, $48
JEQ avx2EcbSm4Single48
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
JMP avx2EcbSm4Done
avx2EcbSm4Single32:
VMOVDQU 16(DX), XWORD1
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
JMP avx2EcbSm4Done
avx2EcbSm4Single48:
VMOVDQU 16(DX), XWORD1
VMOVDQU 32(DX), XWORD2
AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
VMOVDQU XWORD0, 0(BX)
VMOVDQU XWORD1, 16(BX)
VMOVDQU XWORD2, 32(BX)
avx2EcbSm4Done:
VZEROUPPER
RET
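
The amd64 ECB routine walks the input front to back: with AVX2 it consumes 256 bytes (16 blocks) per pass, then drops to 128- and 64-byte chunks, and a trailing 16/32/48 bytes go through the 4-block routine with the unused block slots simply not stored back. An illustrative Go helper (not in the commit) that lists the chunk widths of the AVX2 path for an n-byte input, n a multiple of 16:

package sketch

// ecbChunkSchedule lists, front to back, the chunk widths the AVX2 path of
// encryptSm4Ecb consumes.
func ecbChunkSchedule(n int) []int {
    var chunks []int
    for n >= 256 { // avx2_16blocks: 16 blocks per pass
        chunks = append(chunks, 256)
        n -= 256
    }
    for n >= 128 { // avx2EcbSm4Octets: 8 blocks
        chunks = append(chunks, 128)
        n -= 128
    }
    for n >= 64 { // avx2EcbSm4Nibbles: 4 blocks
        chunks = append(chunks, 64)
        n -= 64
    }
    if n > 0 { // avx2EcbSm4Single: 16/32/48-byte tail
        chunks = append(chunks, n)
    }
    return chunks
}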

sm4/ecb_sm4_arm64.s (new file, 207 lines)

@@ -0,0 +1,207 @@
//go:build arm64 && !purego
// +build arm64,!purego
#include "textflag.h"
#include "textflag.h"
#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define R16_MASK V27
#define R24_MASK V28
#define FK_MASK V29
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13
#include "aesni_macros_arm64.s"
// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5
LOAD_SM4_AESNI_CONSTS()
VEOR ZERO.B16, ZERO.B16, ZERO.B16
MOVD xk+0(FP), rk
MOVD dst+8(FP), dstPtr
MOVD src+32(FP), srcPtr
MOVD src_len+40(FP), srcPtrLen
MOVD rk, rkSave
ecbSm4Octets:
CMP $128, srcPtrLen
BLT ecbSm4Nibbles
SUB $128, srcPtrLen
MOVD rkSave, rk
VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
VLD1.P 64(srcPtr), [t4.S4, t5.S4, t6.S4, t7.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
EOR R0, R0
ecb8BlocksLoop:
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
ADD $16, R0
CMP $128, R0
BNE ecb8BlocksLoop
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VREV32 t4.B16, t4.B16
VREV32 t5.B16, t5.B16
VREV32 t6.B16, t6.B16
VREV32 t7.B16, t7.B16
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(dstPtr)
B ecbSm4Octets
ecbSm4Nibbles:
CMP $64, srcPtrLen
BLT ecbSm4Single
SUB $64, srcPtrLen
MOVD rkSave, rk
VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
EOR R0, R0
ecb4BlocksLoop:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE ecb4BlocksLoop
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VREV32 t3.B16, t3.B16
VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
ecbSm4Single:
CBZ srcPtrLen, ecbSm4Done
MOVD rkSave, rk
EOR R0, R0
CMP $16, srcPtrLen
BEQ ecbSm4Single16
CMP $32, srcPtrLen
BEQ ecbSm4Single32
CMP $48, srcPtrLen
BEQ ecbSm4Single48
ecbSm4Single16:
VLD1.P 16(srcPtr), [t0.S4]
VREV32 t0.B16, t0.B16
encryptBlocksLoop1:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop1
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VST1.P [t0.S4], 16(dstPtr)
B ecbSm4Done
ecbSm4Single32:
VLD1.P 32(srcPtr), [t0.S4, t1.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
encryptBlocksLoop2:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop2
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VST1.P [t0.S4, t1.S4], 32(dstPtr)
B ecbSm4Done
ecbSm4Single48:
VLD1.P 48(srcPtr), [t0.S4, t1.S4, t2.S4]
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
encryptBlocksLoop3:
SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
ADD $16, R0
CMP $128, R0
BNE encryptBlocksLoop3
TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
VREV32 t0.B16, t0.B16
VREV32 t1.B16, t1.B16
VREV32 t2.B16, t2.B16
VST1.P [t0.S4, t1.S4, t2.S4], 48(dstPtr)
ecbSm4Done:
RET