mirror of https://github.com/emmansun/gmsm.git
synced 2025-04-22 10:16:18 +08:00

sm4: cbc ecb mode enhancement

This commit is contained in:
parent e00fbe696d
commit 4e50b3dd6b
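
For orientation: both modes touched by this commit are driven through Go's
standard cipher.BlockMode interfaces. A minimal round-trip sketch, mirroring
the TestSM4CBCRandom test added below (which reuses the key as the IV purely
for test convenience; use a random IV in real code):

    package main

    import (
        "bytes"
        "crypto/cipher"
        "fmt"

        "github.com/emmansun/gmsm/sm4"
    )

    func main() {
        key := []byte("0123456789ABCDEF") // 16-byte SM4 key
        c, err := sm4.NewCipher(key)
        if err != nil {
            panic(err)
        }
        plaintext := make([]byte, 32) // CryptBlocks requires a multiple of the 16-byte block size
        ciphertext := make([]byte, len(plaintext))
        cipher.NewCBCEncrypter(c, key).CryptBlocks(ciphertext, plaintext)
        got := make([]byte, len(ciphertext))
        cipher.NewCBCDecrypter(c, key).CryptBlocks(got, ciphertext)
        fmt.Println(bytes.Equal(got, plaintext)) // true
    }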
@@ -3,8 +3,10 @@ package cipher_test
import (
    "bytes"
    "crypto/cipher"
    "crypto/rand"
    "encoding/hex"
    "fmt"
    "io"
    "testing"

    "github.com/emmansun/gmsm/padding"
@@ -185,3 +187,24 @@ func TestCBCDecrypterSM4(t *testing.T) {
        }
    }
}

func TestSM4CBCRandom(t *testing.T) {
    key := []byte("0123456789ABCDEF")
    c, err := sm4.NewCipher(key)
    if err != nil {
        t.Fatal(err)
    }
    encrypter := cipher.NewCBCEncrypter(c, key)
    decrypter := cipher.NewCBCDecrypter(c, key)
    for i := 1; i <= 50; i++ {
        plaintext := make([]byte, i*16)
        ciphertext := make([]byte, i*16)
        got := make([]byte, i*16)
        io.ReadFull(rand.Reader, plaintext)
        encrypter.CryptBlocks(ciphertext, plaintext)
        decrypter.CryptBlocks(got, ciphertext)
        if !bytes.Equal(got, plaintext) {
            t.Errorf("test %v blocks failed", i)
        }
    }
}

@@ -74,31 +74,7 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
    // Copy the last block of ciphertext in preparation as the new iv.
    copy(x.tmp, src[end-BlockSize:end])

    decKeyPtr := &x.b.dec[0]

    start := end - 2*x.b.blocksSize
    for start > 0 {
        decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
        end = start
        start -= 2*x.b.blocksSize
    }

    start = end - x.b.blocksSize
    for start > 0 {
        decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
        end = start
        start -= x.b.blocksSize
    }

    // Handle remaining first blocks
    var temp []byte = make([]byte, x.b.blocksSize)
    var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
    copy(batchSrc, x.iv)
    copy(batchSrc[BlockSize:], src[:end])
    decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
    copy(dst, temp[:end])
    decryptBlocksChain(&x.b.dec[0], dst, src, &x.iv[0])

    // Set the new iv to the first block we copied earlier.
    x.iv, x.tmp = x.tmp, x.iv

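The hunk above replaces the old multi-pass Go loop with a single
decryptBlocksChain call over the whole buffer. As a reading aid, a scalar
model of the chaining rule that routine implements (P_i = Dec(C_i) XOR
C_{i-1}, with the IV standing in for C_0); blocks are processed last to
first so each prior ciphertext block is still intact when dst aliases src.
This is a simplified sketch, not the package's actual code path:

    // cbcDecryptScalar models CBC decryption one 16-byte block at a time,
    // assuming decryptBlock decrypts exactly one block.
    func cbcDecryptScalar(decryptBlock func(dst, src []byte), dst, src, iv []byte) {
        const blockSize = 16
        end := len(src)
        // Walk backwards: src[start-blockSize:start] (the previous
        // ciphertext block) is untouched even when dst aliases src.
        for start := end - blockSize; start > 0; start -= blockSize {
            decryptBlock(dst[start:end], src[start:end])
            xorBytes(dst[start:end], src[start-blockSize:start])
            end = start
        }
        // The first block chains against the IV.
        decryptBlock(dst[:blockSize], src[:blockSize])
        xorBytes(dst[:blockSize], iv)
    }

    func xorBytes(dst, src []byte) {
        for i := range src {
            dst[i] ^= src[i]
        }
    }
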
@@ -78,7 +78,6 @@ done_sm4:

#define XDWTMP0 Y0
#define XDWTMP1 Y1
#define XDWTMP2 Y2

#define XDWORD0 Y4
#define XDWORD1 Y5
@@ -110,6 +109,8 @@ done_sm4:
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define BSWAP_MASK Y2

#define XDWORD Y8
#define YDWORD Y9

@@ -124,36 +125,22 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
    MOVQ src_len+40(FP), DI
    MOVQ iv+56(FP), SI

    LEAQ (DX)(DI*1), DX
    LEAQ (BX)(DI*1), BX

    CMPB ·useAVX2(SB), $1
    JE avx2
    JE avx2Start

    CMPB ·useAVX(SB), $1
    JE avx
    JE avxCbcSm4Octets

non_avx2_start:
cbcSm4Octets:
    CMPQ DI, $128
    JEQ sse_8blocks
    JLE cbcSm4Nibbles
    SUBQ $128, DI
    LEAQ -128(DX), DX
    LEAQ -128(BX), BX

    MOVOU 0(DX), XWORD0
    MOVOU 16(DX), XWORD1
    MOVOU 32(DX), XWORD2
    MOVOU 48(DX), XWORD3

    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    PXOR 0(SI), XWORD0
    PXOR 16(SI), XWORD1
    PXOR 32(SI), XWORD2
    PXOR 48(SI), XWORD3

    MOVUPS XWORD0, 0(BX)
    MOVUPS XWORD1, 16(BX)
    MOVUPS XWORD2, 32(BX)
    MOVUPS XWORD3, 48(BX)

    RET

sse_8blocks:
    MOVOU 0(DX), XWORD0
    MOVOU 16(DX), XWORD1
    MOVOU 32(DX), XWORD2
@@ -165,14 +152,14 @@ sse_8blocks:

    SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

    PXOR 0(SI), XWORD0
    PXOR 16(SI), XWORD1
    PXOR 32(SI), XWORD2
    PXOR 48(SI), XWORD3
    PXOR 64(SI), XWORD4
    PXOR 80(SI), XWORD5
    PXOR 96(SI), XWORD6
    PXOR 112(SI), XWORD7
    PXOR -16(DX), XWORD0
    PXOR 0(DX), XWORD1
    PXOR 16(DX), XWORD2
    PXOR 32(DX), XWORD3
    PXOR 48(DX), XWORD4
    PXOR 64(DX), XWORD5
    PXOR 80(DX), XWORD6
    PXOR 96(DX), XWORD7

    MOVOU XWORD0, 0(BX)
    MOVOU XWORD1, 16(BX)
@@ -181,34 +168,110 @@ sse_8blocks:
    MOVOU XWORD4, 64(BX)
    MOVOU XWORD5, 80(BX)
    MOVOU XWORD6, 96(BX)
    MOVOU XWORD7, 112(BX)
    MOVOU XWORD7, 112(BX)

done_sm4:
    JMP cbcSm4Octets

cbcSm4Nibbles:
    CMPQ DI, $64
    JLE cbCSm4Single
    SUBQ $64, DI
    LEAQ -64(DX), DX
    LEAQ -64(BX), BX

    MOVOU 0(DX), XWORD0
    MOVOU 16(DX), XWORD1
    MOVOU 32(DX), XWORD2
    MOVOU 48(DX), XWORD3

    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    PXOR -16(DX), XWORD0
    PXOR 0(DX), XWORD1
    PXOR 16(DX), XWORD2
    PXOR 32(DX), XWORD3

    MOVUPS XWORD0, 0(BX)
    MOVUPS XWORD1, 16(BX)
    MOVUPS XWORD2, 32(BX)
    MOVUPS XWORD3, 48(BX)

cbCSm4Single:
    CMPQ DI, $16
    JEQ cbcSm4Single16

    CMPQ DI, $32
    JEQ cbcSm4Single32

    CMPQ DI, $48
    JEQ cbcSm4Single48

    MOVOU -64(DX), XWORD0
    MOVOU -48(DX), XWORD1
    MOVOU -32(DX), XWORD2
    MOVOU -16(DX), XWORD3

    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    PXOR 0(SI), XWORD0
    PXOR -64(DX), XWORD1
    PXOR -48(DX), XWORD2
    PXOR -32(DX), XWORD3

    MOVUPS XWORD0, -64(BX)
    MOVUPS XWORD1, -48(BX)
    MOVUPS XWORD2, -32(BX)
    MOVUPS XWORD3, -16(BX)
    JMP cbcSm4Done

cbcSm4Single16:
    MOVOU -16(DX), XWORD0

    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    PXOR 0(SI), XWORD0

    MOVUPS XWORD0, -16(BX)
    JMP cbcSm4Done

cbcSm4Single32:
    MOVOU -32(DX), XWORD0
    MOVOU -16(DX), XWORD1

    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    PXOR 0(SI), XWORD0
    PXOR -32(DX), XWORD1

    MOVUPS XWORD0, -32(BX)
    MOVUPS XWORD1, -16(BX)
    JMP cbcSm4Done

cbcSm4Single48:
    MOVOU -48(DX), XWORD0
    MOVOU -32(DX), XWORD1
    MOVOU -16(DX), XWORD2

    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    PXOR 0(SI), XWORD0
    PXOR -48(DX), XWORD1
    PXOR -32(DX), XWORD2

    MOVUPS XWORD0, -48(BX)
    MOVUPS XWORD1, -32(BX)
    MOVUPS XWORD2, -16(BX)

cbcSm4Done:
    RET

avx:
avxCbcSm4Octets:
    CMPQ DI, $128
    JEQ avx_8blocks
    JLE avxCbcSm4Nibbles
    SUBQ $128, DI
    LEAQ -128(DX), DX
    LEAQ -128(BX), BX

    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    VMOVDQU 48(DX), XWORD3

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR 16(SI), XWORD1, XWORD1
    VPXOR 32(SI), XWORD2, XWORD2
    VPXOR 48(SI), XWORD3, XWORD3

    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)
    VMOVDQU XWORD3, 48(BX)
    RET

avx_8blocks:
    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
@@ -219,15 +282,15 @@ avx_8blocks:
    VMOVDQU 112(DX), XWORD7

    AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR 16(SI), XWORD1, XWORD1
    VPXOR 32(SI), XWORD2, XWORD2
    VPXOR 48(SI), XWORD3, XWORD3
    VPXOR 64(SI), XWORD4, XWORD4
    VPXOR 80(SI), XWORD5, XWORD5
    VPXOR 96(SI), XWORD6, XWORD6
    VPXOR 112(SI), XWORD7, XWORD7

    VPXOR -16(DX), XWORD0, XWORD0
    VPXOR 0(DX), XWORD1, XWORD1
    VPXOR 16(DX), XWORD2, XWORD2
    VPXOR 32(DX), XWORD3, XWORD3
    VPXOR 48(DX), XWORD4, XWORD4
    VPXOR 64(DX), XWORD5, XWORD5
    VPXOR 80(DX), XWORD6, XWORD6
    VPXOR 96(DX), XWORD7, XWORD7

    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
@@ -236,57 +299,115 @@ avx_8blocks:
    VMOVDQU XWORD4, 64(BX)
    VMOVDQU XWORD5, 80(BX)
    VMOVDQU XWORD6, 96(BX)
    VMOVDQU XWORD7, 112(BX)
    VMOVDQU XWORD7, 112(BX)

avx_sm4_done:
    JMP avxCbcSm4Octets

avxCbcSm4Nibbles:
    CMPQ DI, $64
    JLE avxCbCSm4Single
    SUBQ $64, DI
    LEAQ -64(DX), DX
    LEAQ -64(BX), BX

    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    VMOVDQU 48(DX), XWORD3

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR -16(DX), XWORD0, XWORD0
    VPXOR 0(DX), XWORD1, XWORD1
    VPXOR 16(DX), XWORD2, XWORD2
    VPXOR 32(DX), XWORD3, XWORD3

    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)
    VMOVDQU XWORD3, 48(BX)

avxCbCSm4Single:
    CMPQ DI, $16
    JEQ avxCbcSm4Single16

    CMPQ DI, $32
    JEQ avxCbcSm4Single32

    CMPQ DI, $48
    JEQ avxCbcSm4Single48

    VMOVDQU -64(DX), XWORD0
    VMOVDQU -48(DX), XWORD1
    VMOVDQU -32(DX), XWORD2
    VMOVDQU -16(DX), XWORD3

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR -64(DX), XWORD1, XWORD1
    VPXOR -48(DX), XWORD2, XWORD2
    VPXOR -32(DX), XWORD3, XWORD3

    VMOVDQU XWORD0, -64(BX)
    VMOVDQU XWORD1, -48(BX)
    VMOVDQU XWORD2, -32(BX)
    VMOVDQU XWORD3, -16(BX)
    JMP avxCbcSm4Done

avxCbcSm4Single16:
    VMOVDQU -16(DX), XWORD0

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0

    VMOVDQU XWORD0, -16(BX)
    JMP avxCbcSm4Done

avxCbcSm4Single32:
    VMOVDQU -32(DX), XWORD0
    VMOVDQU -16(DX), XWORD1

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR -32(DX), XWORD1, XWORD1

    VMOVDQU XWORD0, -32(BX)
    VMOVDQU XWORD1, -16(BX)
    JMP avxCbcSm4Done

avxCbcSm4Single48:
    VMOVDQU -48(DX), XWORD0
    VMOVDQU -32(DX), XWORD1
    VMOVDQU -16(DX), XWORD2

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR -48(DX), XWORD1, XWORD1
    VPXOR -32(DX), XWORD2, XWORD2

    VMOVDQU XWORD0, -48(BX)
    VMOVDQU XWORD1, -32(BX)
    VMOVDQU XWORD2, -16(BX)

avxCbcSm4Done:
    RET

avx2:
avx2Start:
    VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
    CMPQ DI, $256
    JEQ avx2_16blocks

avx2_8blocks:
    VMOVDQU 0(DX), XDWORD0
    VMOVDQU 32(DX), XDWORD1
    VMOVDQU 64(DX), XDWORD2
    VMOVDQU 96(DX), XDWORD3
    VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

    AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

    VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

    VPXOR 0(SI), XDWORD0, XDWORD0
    VPXOR 32(SI), XDWORD1, XDWORD1
    VPXOR 64(SI), XDWORD2, XDWORD2
    VPXOR 96(SI), XDWORD3, XDWORD3

    VMOVDQU XDWORD0, 0(BX)
    VMOVDQU XDWORD1, 32(BX)
    VMOVDQU XDWORD2, 64(BX)
    VMOVDQU XDWORD3, 96(BX)

    VZEROUPPER
    RET
    VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK

avx2_16blocks:
    CMPQ DI, $256
    JLE avx2CbcSm4Octets
    SUBQ $256, DI
    LEAQ -256(DX), DX
    LEAQ -256(BX), BX

    VMOVDQU 0(DX), XDWORD0
    VMOVDQU 32(DX), XDWORD1
    VMOVDQU 64(DX), XDWORD2
@@ -296,8 +417,6 @@ avx2_16blocks:
    VMOVDQU 192(DX), XDWORD6
    VMOVDQU 224(DX), XDWORD7

    VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK

    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
@@ -309,33 +428,32 @@ avx2_16blocks:
    VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

    AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

    VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
    VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
    VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
    VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
    VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
    VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
    VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
    VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
    VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
    VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
    VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
    VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
    VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7

    VPXOR 0(SI), XDWORD0, XDWORD0
    VPXOR 32(SI), XDWORD1, XDWORD1
    VPXOR 64(SI), XDWORD2, XDWORD2
    VPXOR 96(SI), XDWORD3, XDWORD3
    VPXOR 128(SI), XDWORD4, XDWORD4
    VPXOR 160(SI), XDWORD5, XDWORD5
    VPXOR 192(SI), XDWORD6, XDWORD6
    VPXOR 224(SI), XDWORD7, XDWORD7
    VPXOR -16(DX), XDWORD0, XDWORD0
    VPXOR 16(DX), XDWORD1, XDWORD1
    VPXOR 48(DX), XDWORD2, XDWORD2
    VPXOR 80(DX), XDWORD3, XDWORD3
    VPXOR 112(DX), XDWORD4, XDWORD4
    VPXOR 144(DX), XDWORD5, XDWORD5
    VPXOR 176(DX), XDWORD6, XDWORD6
    VPXOR 208(DX), XDWORD7, XDWORD7

    VMOVDQU XDWORD0, 0(BX)
    VMOVDQU XDWORD1, 32(BX)
@@ -346,6 +464,141 @@ avx2_16blocks:
    VMOVDQU XDWORD6, 192(BX)
    VMOVDQU XDWORD7, 224(BX)

avx2_sm4_done:
    JMP avx2_16blocks

avx2CbcSm4Octets:
    CMPQ DI, $128
    JLE avx2CbcSm4Nibbles
    SUBQ $128, DI
    LEAQ -128(DX), DX
    LEAQ -128(BX), BX

    VMOVDQU 0(DX), XDWORD0
    VMOVDQU 32(DX), XDWORD1
    VMOVDQU 64(DX), XDWORD2
    VMOVDQU 96(DX), XDWORD3

    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

    AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

    VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
    VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
    VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
    VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3

    VPXOR -16(DX), XDWORD0, XDWORD0
    VPXOR 16(DX), XDWORD1, XDWORD1
    VPXOR 48(DX), XDWORD2, XDWORD2
    VPXOR 80(DX), XDWORD3, XDWORD3

    VMOVDQU XDWORD0, 0(BX)
    VMOVDQU XDWORD1, 32(BX)
    VMOVDQU XDWORD2, 64(BX)
    VMOVDQU XDWORD3, 96(BX)

    JMP avx2CbcSm4Octets

avx2CbcSm4Nibbles:
    CMPQ DI, $64
    JLE avx2CbCSm4Single
    SUBQ $64, DI
    LEAQ -64(DX), DX
    LEAQ -64(BX), BX

    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    VMOVDQU 48(DX), XWORD3

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR -16(DX), XWORD0, XWORD0
    VPXOR 0(DX), XWORD1, XWORD1
    VPXOR 16(DX), XWORD2, XWORD2
    VPXOR 32(DX), XWORD3, XWORD3

    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)
    VMOVDQU XWORD3, 48(BX)

avx2CbCSm4Single:
    CMPQ DI, $16
    JEQ avx2CbcSm4Single16

    CMPQ DI, $32
    JEQ avx2CbcSm4Single32

    CMPQ DI, $48
    JEQ avx2CbcSm4Single48

    VMOVDQU -64(DX), XWORD0
    VMOVDQU -48(DX), XWORD1
    VMOVDQU -32(DX), XWORD2
    VMOVDQU -16(DX), XWORD3

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR -64(DX), XWORD1, XWORD1
    VPXOR -48(DX), XWORD2, XWORD2
    VPXOR -32(DX), XWORD3, XWORD3

    VMOVDQU XWORD0, -64(BX)
    VMOVDQU XWORD1, -48(BX)
    VMOVDQU XWORD2, -32(BX)
    VMOVDQU XWORD3, -16(BX)
    JMP avx2CbcSm4Done

avx2CbcSm4Single16:
    VMOVDQU -16(DX), XWORD0

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0

    VMOVDQU XWORD0, -16(BX)
    JMP avx2CbcSm4Done

avx2CbcSm4Single32:
    VMOVDQU -32(DX), XWORD0
    VMOVDQU -16(DX), XWORD1

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR -32(DX), XWORD1, XWORD1

    VMOVDQU XWORD0, -32(BX)
    VMOVDQU XWORD1, -16(BX)
    JMP avx2CbcSm4Done

avx2CbcSm4Single48:
    VMOVDQU -48(DX), XWORD0
    VMOVDQU -32(DX), XWORD1
    VMOVDQU -16(DX), XWORD2

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VPXOR 0(SI), XWORD0, XWORD0
    VPXOR -48(DX), XWORD1, XWORD1
    VPXOR -32(DX), XWORD2, XWORD2

    VMOVDQU XWORD0, -48(BX)
    VMOVDQU XWORD1, -32(BX)
    VMOVDQU XWORD2, -16(BX)

avx2CbcSm4Done:
    VZEROUPPER
    RET

@@ -93,56 +93,35 @@ done_sm4:
#define t6 V12
#define t7 V13

#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5
#define IV V18

// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
    LOAD_SM4_AESNI_CONSTS()

    MOVD xk+0(FP), R8
    MOVD dst+8(FP), R9
    MOVD src+32(FP), R10
    MOVD src_len+40(FP), R12
    MOVD iv+56(FP), R11

    CMP $128, R12
    BEQ double_dec

    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    EOR R0, R0

encryptBlocksLoop:
    SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)
    MOVD xk+0(FP), rk
    MOVD dst+8(FP), dstPtr
    MOVD src+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD iv+56(FP), R6
    MOVD rk, rkSave
    VLD1 (R6), [IV]

    ADD $16, R0
    CMP $128, R0
    BNE encryptBlocksLoop
cbcSm4Octets:
    CMP $128, srcPtrLen
    BLE cbcSm4Nibbles
    SUB $128, srcPtrLen
    MOVD rkSave, rk
    ADD srcPtr, srcPtrLen, R10
    SUB $16, R10, R11
    ADD dstPtr, srcPtrLen, R12

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16

    VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]

    VEOR V6.B16, t0.B16, t0.B16
    VEOR V7.B16, t1.B16, t1.B16
    VEOR V8.B16, t2.B16, t2.B16
    VEOR V9.B16, t3.B16, t3.B16

    VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
    RET

double_dec:
    VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
    VREV32 t0.B16, t0.B16
@@ -152,22 +131,21 @@ double_dec:
    VREV32 t4.B16, t4.B16
    VREV32 t5.B16, t5.B16
    VREV32 t6.B16, t6.B16
    VREV32 t7.B16, t7.B16
    VREV32 t7.B16, t7.B16

    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)

    VEOR ZERO.B16, ZERO.B16, ZERO.B16
    EOR R0, R0

decrypt8BlocksLoop:
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
    SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
cbc8BlocksLoop:
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
    SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

    ADD $16, R0
    CMP $128, R0
    BNE decrypt8BlocksLoop
    BNE cbc8BlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
@@ -192,7 +170,180 @@ decrypt8BlocksLoop:
    VEOR V8.B16, t6.B16, t6.B16
    VEOR V9.B16, t7.B16, t7.B16

    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
    VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R12)
    VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R12)

    B cbcSm4Octets

cbcSm4Nibbles:
    CMP $64, srcPtrLen
    BLE cbcSm4Single
    SUB $64, srcPtrLen
    MOVD rkSave, rk
    ADD srcPtr, srcPtrLen, R10
    SUB $16, R10, R11
    ADD dstPtr, srcPtrLen, R12

    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

    EOR R0, R0

cbc4BlocksLoop:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16

    VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]
    VEOR V6.B16, t0.B16, t0.B16
    VEOR V7.B16, t1.B16, t1.B16
    VEOR V8.B16, t2.B16, t2.B16
    VEOR V9.B16, t3.B16, t3.B16

    VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R12)

cbcSm4Single:
    MOVD rkSave, rk
    EOR R0, R0
    MOVD srcPtr, R10

    CMP $16, srcPtrLen
    BEQ cbcSm4Single16

    CMP $32, srcPtrLen
    BEQ cbcSm4Single32

    CMP $48, srcPtrLen
    BEQ cbcSm4Single48

    // 4 blocks
    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop64:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop64

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16

    VLD1 (srcPtr), [V6.S4, V7.S4, V8.S4]
    VEOR IV.B16, t0.B16, t0.B16
    VEOR V6.B16, t1.B16, t1.B16
    VEOR V7.B16, t2.B16, t2.B16
    VEOR V8.B16, t3.B16, t3.B16

    VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)

    B cbcSm4Done

cbcSm4Single16:
    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop16:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop16

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16

    VEOR IV.B16, t0.B16, t0.B16

    VST1 [t0.S4], (dstPtr)

    B cbcSm4Done

cbcSm4Single32:
    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop32:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop32

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16

    VLD1 (srcPtr), [V6.S4]
    VEOR IV.B16, t0.B16, t0.B16
    VEOR V6.B16, t1.B16, t1.B16

    VST1 [t0.S4, t1.S4], (dstPtr)
    B cbcSm4Done

cbcSm4Single48:
    VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

cbc4BlocksLoop48:
    SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE cbc4BlocksLoop48

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16

    VLD1 (srcPtr), [V6.S4, V7.S4]
    VEOR IV.B16, t0.B16, t0.B16
    VEOR V6.B16, t1.B16, t1.B16
    VEOR V7.B16, t2.B16, t2.B16

    VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)

cbcSm4Done:
    RET

@@ -49,43 +49,17 @@ func (b *sm4CipherAsm) NewECBDecrypter() cipher.BlockMode {

func (x *ecb) BlockSize() int { return BlockSize }

//go:noescape
func encryptSm4Ecb(xk *uint32, dst, src []byte)

func (x *ecb) CryptBlocks(dst, src []byte) {
    x.validate(dst, src)
    if len(src) == 0 {
        return
    }
    for len(src) >= 2*x.b.blocksSize {
        if x.enc == ecbEncrypt {
            x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
        } else {
            x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
        }
        src = src[2*x.b.blocksSize:]
        dst = dst[2*x.b.blocksSize:]
    }
    for len(src) >= x.b.blocksSize {
        if x.enc == ecbEncrypt {
            x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
        } else {
            x.b.DecryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
        }
        src = src[x.b.blocksSize:]
        dst = dst[x.b.blocksSize:]
    }
    if len(src) > BlockSize {
        temp := make([]byte, x.b.blocksSize)
        copy(temp, src)
        if x.enc == ecbEncrypt {
            x.b.EncryptBlocks(temp, temp)
        } else {
            x.b.DecryptBlocks(temp, temp)
        }
        copy(dst, temp[:len(src)])
    } else if len(src) > 0 {
        if x.enc == ecbEncrypt {
            x.b.Encrypt(dst, src)
        } else {
            x.b.Decrypt(dst, src)
        }
    xk := &x.b.enc[0]
    if x.enc == ecbDecrypt {
        xk = &x.b.dec[0]
    }
    encryptSm4Ecb(xk, dst, src)
}

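One routine now serves both ECB directions: SM4 decryption runs the same
rounds as encryption with the round-key schedule reversed, so the ecbDecrypt
branch above only swaps which schedule (x.b.enc or x.b.dec) is handed to
encryptSm4Ecb. A generic sketch of that key-schedule symmetry (illustrative
helper, not part of this diff):

    // reverseRoundKeys derives the decryption schedule from the encryption
    // schedule: SM4 decrypts by applying the same round function with the
    // 32 round keys in reverse order.
    func reverseRoundKeys(enc []uint32) []uint32 {
        dec := make([]uint32, len(enc))
        for i, k := range enc {
            dec[len(enc)-1-i] = k
        }
        return dec
    }
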
sm4/ecb_sm4_amd64.s (new file, 371 lines)
@@ -0,0 +1,371 @@
//go:build amd64 && !purego
// +build amd64,!purego

#include "textflag.h"

#include "aesni_macros_amd64.s"

#define XDWTMP0 Y0
#define XDWTMP1 Y1

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XDWORD4 Y10
#define XDWORD5 Y11
#define XDWORD6 Y12
#define XDWORD7 Y14

#define XWTMP0 X0
#define XWTMP1 X1
#define XWTMP2 X2

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XWORD4 X10
#define XWORD5 X11
#define XWORD6 X12
#define XWORD7 X14

#define NIBBLE_MASK Y3
#define X_NIBBLE_MASK X3

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define BSWAP_MASK Y2

#define XDWORD Y8
#define YDWORD Y9

#define XWORD X8
#define YWORD X9

// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
    MOVQ xk+0(FP), AX
    MOVQ dst+8(FP), BX
    MOVQ src+32(FP), DX
    MOVQ src_len+40(FP), DI

    CMPB ·useAVX2(SB), $1
    JE avx2_start

    CMPB ·useAVX(SB), $1
    JE avxEcbSm4Octets

ecbSm4Octets:
    CMPQ DI, $128
    JB ecbSm4Nibbles
    SUBQ $128, DI

    MOVOU 0(DX), XWORD0
    MOVOU 16(DX), XWORD1
    MOVOU 32(DX), XWORD2
    MOVOU 48(DX), XWORD3
    MOVOU 64(DX), XWORD4
    MOVOU 80(DX), XWORD5
    MOVOU 96(DX), XWORD6
    MOVOU 112(DX), XWORD7

    SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

    MOVOU XWORD0, 0(BX)
    MOVOU XWORD1, 16(BX)
    MOVOU XWORD2, 32(BX)
    MOVOU XWORD3, 48(BX)
    MOVOU XWORD4, 64(BX)
    MOVOU XWORD5, 80(BX)
    MOVOU XWORD6, 96(BX)
    MOVOU XWORD7, 112(BX)

    LEAQ 128(BX), BX
    LEAQ 128(DX), DX
    JMP ecbSm4Octets

ecbSm4Nibbles:
    CMPQ DI, $64
    JB ecbSm4Single
    SUBQ $64, DI

    MOVOU 0(DX), XWORD0
    MOVOU 16(DX), XWORD1
    MOVOU 32(DX), XWORD2
    MOVOU 48(DX), XWORD3

    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    MOVUPS XWORD0, 0(BX)
    MOVUPS XWORD1, 16(BX)
    MOVUPS XWORD2, 32(BX)
    MOVUPS XWORD3, 48(BX)

    LEAQ 64(BX), BX
    LEAQ 64(DX), DX

ecbSm4Single:
    TESTQ DI, DI
    JE ecbSm4Done

    MOVOU 0(DX), XWORD0
    CMPQ DI, $32
    JEQ ecbSm4Single32
    CMPQ DI, $48
    JEQ ecbSm4Single48
    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    MOVUPS XWORD0, 0(BX)
    JMP ecbSm4Done

ecbSm4Single32:
    MOVOU 16(DX), XWORD1
    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    MOVUPS XWORD0, 0(BX)
    MOVUPS XWORD1, 16(BX)
    JMP ecbSm4Done

ecbSm4Single48:
    MOVOU 16(DX), XWORD1
    MOVOU 32(DX), XWORD2
    SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    MOVUPS XWORD0, 0(BX)
    MOVUPS XWORD1, 16(BX)
    MOVUPS XWORD2, 32(BX)

ecbSm4Done:
    RET

avxEcbSm4Octets:
    CMPQ DI, $128
    JB avxEcbSm4Nibbles
    SUBQ $128, DI

    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    VMOVDQU 48(DX), XWORD3
    VMOVDQU 64(DX), XWORD4
    VMOVDQU 80(DX), XWORD5
    VMOVDQU 96(DX), XWORD6
    VMOVDQU 112(DX), XWORD7

    AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)
    VMOVDQU XWORD3, 48(BX)
    VMOVDQU XWORD4, 64(BX)
    VMOVDQU XWORD5, 80(BX)
    VMOVDQU XWORD6, 96(BX)
    VMOVDQU XWORD7, 112(BX)

    LEAQ 128(BX), BX
    LEAQ 128(DX), DX
    JMP avxEcbSm4Octets

avxEcbSm4Nibbles:
    CMPQ DI, $64
    JB avxEcbSm4Single
    SUBQ $64, DI

    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    VMOVDQU 48(DX), XWORD3

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)
    VMOVDQU XWORD3, 48(BX)

    LEAQ 64(BX), BX
    LEAQ 64(DX), DX

avxEcbSm4Single:
    TESTQ DI, DI
    JE avxEcbSm4Done

    VMOVDQU 0(DX), XWORD0
    CMPQ DI, $32
    JEQ avxEcbSm4Single32
    CMPQ DI, $48
    JEQ avxEcbSm4Single48
    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    VMOVDQU XWORD0, 0(BX)
    JMP avxEcbSm4Done

avxEcbSm4Single32:
    VMOVDQU 16(DX), XWORD1
    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    JMP avxEcbSm4Done

avxEcbSm4Single48:
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)

avxEcbSm4Done:
    RET

avx2_start:
    VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
    VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
    VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK

avx2_16blocks:
    CMPQ DI, $256
    JB avx2EcbSm4Octets
    SUBQ $256, DI

    VMOVDQU 0(DX), XDWORD0
    VMOVDQU 32(DX), XDWORD1
    VMOVDQU 64(DX), XDWORD2
    VMOVDQU 96(DX), XDWORD3
    VMOVDQU 128(DX), XDWORD4
    VMOVDQU 160(DX), XDWORD5
    VMOVDQU 192(DX), XDWORD6
    VMOVDQU 224(DX), XDWORD7

    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
    VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
    VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
    VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
    VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

    AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
    TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)

    VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
    VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
    VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
    VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
    VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
    VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
    VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
    VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7

    VMOVDQU XDWORD0, 0(BX)
    VMOVDQU XDWORD1, 32(BX)
    VMOVDQU XDWORD2, 64(BX)
    VMOVDQU XDWORD3, 96(BX)
    VMOVDQU XDWORD4, 128(BX)
    VMOVDQU XDWORD5, 160(BX)
    VMOVDQU XDWORD6, 192(BX)
    VMOVDQU XDWORD7, 224(BX)

    LEAQ 256(BX), BX
    LEAQ 256(DX), DX
    JMP avx2_16blocks

avx2EcbSm4Octets:
    CMPQ DI, $128
    JB avx2EcbSm4Nibbles
    SUBQ $128, DI

    VMOVDQU 0(DX), XDWORD0
    VMOVDQU 32(DX), XDWORD1
    VMOVDQU 64(DX), XDWORD2
    VMOVDQU 96(DX), XDWORD3

    // Apply Byte Flip Mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
    VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
    VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
    VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

    AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

    // Transpose matrix 4 x 4 32bits word
    TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)

    VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
    VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
    VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
    VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3

    VMOVDQU XDWORD0, 0(BX)
    VMOVDQU XDWORD1, 32(BX)
    VMOVDQU XDWORD2, 64(BX)
    VMOVDQU XDWORD3, 96(BX)

    LEAQ 128(BX), BX
    LEAQ 128(DX), DX
    JMP avx2EcbSm4Octets

avx2EcbSm4Nibbles:
    CMPQ DI, $64
    JB avx2EcbSm4Single
    SUBQ $64, DI

    VMOVDQU 0(DX), XWORD0
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    VMOVDQU 48(DX), XWORD3

    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)
    VMOVDQU XWORD3, 48(BX)

    LEAQ 64(BX), BX
    LEAQ 64(DX), DX

avx2EcbSm4Single:
    TESTQ DI, DI
    JE avx2EcbSm4Done

    VMOVDQU 0(DX), XWORD0
    CMPQ DI, $32
    JEQ avx2EcbSm4Single32
    CMPQ DI, $48
    JEQ avx2EcbSm4Single48
    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    VMOVDQU XWORD0, 0(BX)
    JMP avx2EcbSm4Done

avx2EcbSm4Single32:
    VMOVDQU 16(DX), XWORD1
    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    JMP avx2EcbSm4Done

avx2EcbSm4Single48:
    VMOVDQU 16(DX), XWORD1
    VMOVDQU 32(DX), XWORD2
    AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
    VMOVDQU XWORD0, 0(BX)
    VMOVDQU XWORD1, 16(BX)
    VMOVDQU XWORD2, 32(BX)

avx2EcbSm4Done:
    VZEROUPPER
    RET
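
The new amd64 file above and the arm64 file below share one size-dispatch
skeleton: consume 128-byte groups ("octets", eight blocks) in a loop, then
at most one 64-byte group ("nibbles", four blocks), then a 1-4 block tail
handled by a single 4-block call. A Go rendering of that control flow, as a
reading aid only (encrypt stands in for the SM4_8BLOCKS/SM4_4BLOCKS macros):

    // ecbDispatchSketch mirrors the assembly labels
    // ecbSm4Octets -> ecbSm4Nibbles -> ecbSm4Single.
    func ecbDispatchSketch(encrypt func(dst, src []byte), dst, src []byte) {
        for len(src) >= 128 { // eight 16-byte blocks per iteration
            encrypt(dst[:128], src[:128])
            dst, src = dst[128:], src[128:]
        }
        if len(src) >= 64 { // at most one four-block group remains
            encrypt(dst[:64], src[:64])
            dst, src = dst[64:], src[64:]
        }
        if len(src) > 0 { // 16/32/48-byte tail in one call
            encrypt(dst[:len(src)], src[:len(src)])
        }
    }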
sm4/ecb_sm4_arm64.s (new file, 207 lines)
@@ -0,0 +1,207 @@
//go:build arm64 && !purego
// +build arm64,!purego

#include "textflag.h"

#define x V0
#define y V1
#define t0 V2
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define R16_MASK V27
#define R24_MASK V28
#define FK_MASK V29
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13

#include "aesni_macros_arm64.s"

// func encryptSm4Ecb(xk *uint32, dst, src []byte)
TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
#define dstPtr R1
#define srcPtr R2
#define rk R3
#define rkSave R4
#define srcPtrLen R5
    LOAD_SM4_AESNI_CONSTS()
    VEOR ZERO.B16, ZERO.B16, ZERO.B16

    MOVD xk+0(FP), rk
    MOVD dst+8(FP), dstPtr
    MOVD src+32(FP), srcPtr
    MOVD src_len+40(FP), srcPtrLen
    MOVD rk, rkSave

ecbSm4Octets:
    CMP $128, srcPtrLen
    BLT ecbSm4Nibbles
    SUB $128, srcPtrLen
    MOVD rkSave, rk

    VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
    VLD1.P 64(srcPtr), [t4.S4, t5.S4, t6.S4, t7.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    VREV32 t4.B16, t4.B16
    VREV32 t5.B16, t5.B16
    VREV32 t6.B16, t6.B16
    VREV32 t7.B16, t7.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)

    EOR R0, R0

ecb8BlocksLoop:
    SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
    SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
    SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
    SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)

    ADD $16, R0
    CMP $128, R0
    BNE ecb8BlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    VREV32 t4.B16, t4.B16
    VREV32 t5.B16, t5.B16
    VREV32 t6.B16, t6.B16
    VREV32 t7.B16, t7.B16

    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
    VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(dstPtr)

    B ecbSm4Octets

ecbSm4Nibbles:
    CMP $64, srcPtrLen
    BLT ecbSm4Single
    SUB $64, srcPtrLen
    MOVD rkSave, rk

    VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)

    EOR R0, R0

ecb4BlocksLoop:
    SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE ecb4BlocksLoop

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VREV32 t3.B16, t3.B16
    VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)

ecbSm4Single:
    CBZ srcPtrLen, ecbSm4Done
    MOVD rkSave, rk
    EOR R0, R0

    CMP $16, srcPtrLen
    BEQ ecbSm4Single16

    CMP $32, srcPtrLen
    BEQ ecbSm4Single32

    CMP $48, srcPtrLen
    BEQ ecbSm4Single48

ecbSm4Single16:
    VLD1.P 16(srcPtr), [t0.S4]
    VREV32 t0.B16, t0.B16

encryptBlocksLoop1:
    SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE encryptBlocksLoop1

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VST1.P [t0.S4], 16(dstPtr)

    B ecbSm4Done

ecbSm4Single32:
    VLD1.P 32(srcPtr), [t0.S4, t1.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16

encryptBlocksLoop2:
    SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE encryptBlocksLoop2

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VST1.P [t0.S4, t1.S4], 32(dstPtr)

    B ecbSm4Done

ecbSm4Single48:
    VLD1.P 48(srcPtr), [t0.S4, t1.S4, t2.S4]
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16

encryptBlocksLoop3:
    SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
    SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
    SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
    SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)

    ADD $16, R0
    CMP $128, R0
    BNE encryptBlocksLoop3

    TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
    VREV32 t0.B16, t0.B16
    VREV32 t1.B16, t1.B16
    VREV32 t2.B16, t2.B16
    VST1.P [t0.S4, t1.S4, t2.S4], 48(dstPtr)

ecbSm4Done:
    RET