mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-26 12:16:20 +08:00)

commit 4e50b3dd6b (parent e00fbe696d)

    sm4: cbc ecb mode enhancement
@@ -3,8 +3,10 @@ package cipher_test
 import (
 	"bytes"
 	"crypto/cipher"
+	"crypto/rand"
 	"encoding/hex"
 	"fmt"
+	"io"
 	"testing"
 
 	"github.com/emmansun/gmsm/padding"
@@ -185,3 +187,24 @@ func TestCBCDecrypterSM4(t *testing.T) {
 		}
 	}
 }
+
+func TestSM4CBCRandom(t *testing.T) {
+	key := []byte("0123456789ABCDEF")
+	c, err := sm4.NewCipher(key)
+	if err != nil {
+		t.Fatal(err)
+	}
+	encrypter := cipher.NewCBCEncrypter(c, key)
+	decrypter := cipher.NewCBCDecrypter(c, key)
+	for i := 1; i <= 50; i++ {
+		plaintext := make([]byte, i*16)
+		ciphertext := make([]byte, i*16)
+		got := make([]byte, i*16)
+		io.ReadFull(rand.Reader, plaintext)
+		encrypter.CryptBlocks(ciphertext, plaintext)
+		decrypter.CryptBlocks(got, ciphertext)
+		if !bytes.Equal(got, plaintext) {
+			t.Errorf("test %v blocks failed", i)
+		}
+	}
+}
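The new TestSM4CBCRandom round-trips every message length from 1 to 50 blocks, which exercises the 8-block, 4-block, and 1-3-block tail paths introduced by the assembly below. For orientation, a stand-alone SM4-CBC round-trip using only the package's public API looks roughly like this (zero IV and ignored errors are for illustration only):

package main

import (
	"bytes"
	"crypto/cipher"
	"fmt"

	"github.com/emmansun/gmsm/sm4"
)

func main() {
	key := []byte("0123456789ABCDEF") // 16-byte SM4 key
	c, _ := sm4.NewCipher(key)        // error handling elided for brevity

	iv := make([]byte, sm4.BlockSize)                        // zero IV, illustration only
	plaintext := bytes.Repeat([]byte("0123456789ABCDEF"), 3) // exactly 3 blocks

	ciphertext := make([]byte, len(plaintext))
	cipher.NewCBCEncrypter(c, iv).CryptBlocks(ciphertext, plaintext)

	got := make([]byte, len(ciphertext))
	cipher.NewCBCDecrypter(c, iv).CryptBlocks(got, ciphertext)
	fmt.Println(bytes.Equal(got, plaintext)) // true
}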
@@ -74,31 +74,7 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
 	// Copy the last block of ciphertext in preparation as the new iv.
 	copy(x.tmp, src[end-BlockSize:end])
 
-	decKeyPtr := &x.b.dec[0]
-
-	start := end - 2*x.b.blocksSize
-	for start > 0 {
-		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
-		end = start
-		start -= 2 * x.b.blocksSize
-	}
-
-	start = end - x.b.blocksSize
-	for start > 0 {
-		decryptBlocksChain(decKeyPtr, dst[start:end], src[start:end], &src[start-BlockSize])
-		end = start
-		start -= x.b.blocksSize
-	}
-
-	// Handle remain first blocks
-	var temp []byte = make([]byte, x.b.blocksSize)
-	var batchSrc []byte = make([]byte, x.b.blocksSize+BlockSize)
-	copy(batchSrc, x.iv)
-	copy(batchSrc[BlockSize:], src[:end])
-	decryptBlocksChain(decKeyPtr, temp, batchSrc[BlockSize:], &batchSrc[0])
-	copy(dst, temp[:end])
+	decryptBlocksChain(&x.b.dec[0], dst, src, &x.iv[0])
 
 	// Set the new iv to the first block we copied earlier.
 	x.iv, x.tmp = x.tmp, x.iv
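The Go driver loses its three-phase loop: decryptBlocksChain now receives the whole ciphertext plus the IV pointer and performs the chaining itself, from the last block toward the first. A pure-Go sketch of that tail-first strategy (hypothetical helper; the real work happens in the assembly below) shows why the order is safe even for in-place decryption:

// cbcDecryptChain mirrors the tail-first walk: decrypt the last block,
// XOR it with the previous ciphertext block (or the IV for block 0),
// then step backwards. Writes at dst[start:end] never touch the
// src[start-16:start] bytes that a later, earlier-indexed step reads,
// so the same strategy works when dst and src alias.
func cbcDecryptChain(decryptBlock func(dst, src []byte), dst, src, iv []byte) {
	const blockSize = 16
	for end := len(src); end > 0; end -= blockSize {
		start := end - blockSize
		decryptBlock(dst[start:end], src[start:end]) // raw SM4 block decrypt
		prev := iv                                   // first block chains to the IV
		if start > 0 {
			prev = src[start-blockSize : start]
		}
		for i := 0; i < blockSize; i++ {
			dst[start+i] ^= prev[i]
		}
	}
}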
@@ -78,7 +78,6 @@ done_sm4:
 
 #define XDWTMP0 Y0
 #define XDWTMP1 Y1
-#define XDWTMP2 Y2
 
 #define XDWORD0 Y4
 #define XDWORD1 Y5
@@ -110,6 +109,8 @@ done_sm4:
 #define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
 #define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE
 
+#define BSWAP_MASK Y2
+
 #define XDWORD Y8
 #define YDWORD Y9
 
@@ -124,36 +125,22 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	MOVQ src_len+40(FP), DI
 	MOVQ iv+56(FP), SI
 
+	LEAQ (DX)(DI*1), DX
+	LEAQ (BX)(DI*1), BX
+
 	CMPB ·useAVX2(SB), $1
-	JE avx2
+	JE avx2Start
 
 	CMPB ·useAVX(SB), $1
-	JE avx
+	JE avxCbcSm4Octets
 
-non_avx2_start:
+cbcSm4Octets:
 	CMPQ DI, $128
-	JEQ sse_8blocks
-
-	MOVOU 0(DX), XWORD0
-	MOVOU 16(DX), XWORD1
-	MOVOU 32(DX), XWORD2
-	MOVOU 48(DX), XWORD3
-
-	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
-
-	PXOR 0(SI), XWORD0
-	PXOR 16(SI), XWORD1
-	PXOR 32(SI), XWORD2
-	PXOR 48(SI), XWORD3
-
-	MOVUPS XWORD0, 0(BX)
-	MOVUPS XWORD1, 16(BX)
-	MOVUPS XWORD2, 32(BX)
-	MOVUPS XWORD3, 48(BX)
-
-	RET
-
-sse_8blocks:
+	JLE cbcSm4Nibbles
+	SUBQ $128, DI
+	LEAQ -128(DX), DX
+	LEAQ -128(BX), BX
 
 	MOVOU 0(DX), XWORD0
 	MOVOU 16(DX), XWORD1
 	MOVOU 32(DX), XWORD2
@@ -165,14 +152,14 @@ sse_8blocks:
 
 	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
 
-	PXOR 0(SI), XWORD0
-	PXOR 16(SI), XWORD1
-	PXOR 32(SI), XWORD2
-	PXOR 48(SI), XWORD3
-	PXOR 64(SI), XWORD4
-	PXOR 80(SI), XWORD5
-	PXOR 96(SI), XWORD6
-	PXOR 112(SI), XWORD7
+	PXOR -16(DX), XWORD0
+	PXOR 0(DX), XWORD1
+	PXOR 16(DX), XWORD2
+	PXOR 32(DX), XWORD3
+	PXOR 48(DX), XWORD4
+	PXOR 64(DX), XWORD5
+	PXOR 80(DX), XWORD6
+	PXOR 96(DX), XWORD7
 
 	MOVOU XWORD0, 0(BX)
 	MOVOU XWORD1, 16(BX)
@@ -183,32 +170,108 @@ sse_8blocks:
 	MOVOU XWORD6, 96(BX)
 	MOVOU XWORD7, 112(BX)
 
-done_sm4:
+	JMP cbcSm4Octets
 
+cbcSm4Nibbles:
+	CMPQ DI, $64
+	JLE cbCSm4Single
+	SUBQ $64, DI
+	LEAQ -64(DX), DX
+	LEAQ -64(BX), BX
+
+	MOVOU 0(DX), XWORD0
+	MOVOU 16(DX), XWORD1
+	MOVOU 32(DX), XWORD2
+	MOVOU 48(DX), XWORD3
+
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	PXOR -16(DX), XWORD0
+	PXOR 0(DX), XWORD1
+	PXOR 16(DX), XWORD2
+	PXOR 32(DX), XWORD3
+
+	MOVUPS XWORD0, 0(BX)
+	MOVUPS XWORD1, 16(BX)
+	MOVUPS XWORD2, 32(BX)
+	MOVUPS XWORD3, 48(BX)
+
+cbCSm4Single:
+	CMPQ DI, $16
+	JEQ cbcSm4Single16
+
+	CMPQ DI, $32
+	JEQ cbcSm4Single32
+
+	CMPQ DI, $48
+	JEQ cbcSm4Single48
+
+	MOVOU -64(DX), XWORD0
+	MOVOU -48(DX), XWORD1
+	MOVOU -32(DX), XWORD2
+	MOVOU -16(DX), XWORD3
+
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	PXOR 0(SI), XWORD0
+	PXOR -64(DX), XWORD1
+	PXOR -48(DX), XWORD2
+	PXOR -32(DX), XWORD3
+
+	MOVUPS XWORD0, -64(BX)
+	MOVUPS XWORD1, -48(BX)
+	MOVUPS XWORD2, -32(BX)
+	MOVUPS XWORD3, -16(BX)
+	JMP cbcSm4Done
+
+cbcSm4Single16:
+	MOVOU -16(DX), XWORD0
+
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	PXOR 0(SI), XWORD0
+
+	MOVUPS XWORD0, -16(BX)
+	JMP cbcSm4Done
+
+cbcSm4Single32:
+	MOVOU -32(DX), XWORD0
+	MOVOU -16(DX), XWORD1
+
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	PXOR 0(SI), XWORD0
+	PXOR -32(DX), XWORD1
+
+	MOVUPS XWORD0, -32(BX)
+	MOVUPS XWORD1, -16(BX)
+	JMP cbcSm4Done
+
+cbcSm4Single48:
+	MOVOU -48(DX), XWORD0
+	MOVOU -32(DX), XWORD1
+	MOVOU -16(DX), XWORD2
+
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	PXOR 0(SI), XWORD0
+	PXOR -48(DX), XWORD1
+	PXOR -32(DX), XWORD2
+
+	MOVUPS XWORD0, -48(BX)
+	MOVUPS XWORD1, -32(BX)
+	MOVUPS XWORD2, -16(BX)
+
+cbcSm4Done:
 	RET
 
-avx:
+avxCbcSm4Octets:
 	CMPQ DI, $128
-	JEQ avx_8blocks
-
-	VMOVDQU 0(DX), XWORD0
-	VMOVDQU 16(DX), XWORD1
-	VMOVDQU 32(DX), XWORD2
-	VMOVDQU 48(DX), XWORD3
-
-	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
-
-	VPXOR 0(SI), XWORD0, XWORD0
-	VPXOR 16(SI), XWORD1, XWORD1
-	VPXOR 32(SI), XWORD2, XWORD2
-	VPXOR 48(SI), XWORD3, XWORD3
-
-	VMOVDQU XWORD0, 0(BX)
-	VMOVDQU XWORD1, 16(BX)
-	VMOVDQU XWORD2, 32(BX)
-	VMOVDQU XWORD3, 48(BX)
-	RET
-
-avx_8blocks:
+	JLE avxCbcSm4Nibbles
+	SUBQ $128, DI
+	LEAQ -128(DX), DX
+	LEAQ -128(BX), BX
 
 	VMOVDQU 0(DX), XWORD0
 	VMOVDQU 16(DX), XWORD1
 	VMOVDQU 32(DX), XWORD2
@@ -220,14 +283,14 @@ avx_8blocks:
 
 	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
 
-	VPXOR 0(SI), XWORD0, XWORD0
-	VPXOR 16(SI), XWORD1, XWORD1
-	VPXOR 32(SI), XWORD2, XWORD2
-	VPXOR 48(SI), XWORD3, XWORD3
-	VPXOR 64(SI), XWORD4, XWORD4
-	VPXOR 80(SI), XWORD5, XWORD5
-	VPXOR 96(SI), XWORD6, XWORD6
-	VPXOR 112(SI), XWORD7, XWORD7
+	VPXOR -16(DX), XWORD0, XWORD0
+	VPXOR 0(DX), XWORD1, XWORD1
+	VPXOR 16(DX), XWORD2, XWORD2
+	VPXOR 32(DX), XWORD3, XWORD3
+	VPXOR 48(DX), XWORD4, XWORD4
+	VPXOR 64(DX), XWORD5, XWORD5
+	VPXOR 80(DX), XWORD6, XWORD6
+	VPXOR 96(DX), XWORD7, XWORD7
 
 	VMOVDQU XWORD0, 0(BX)
 	VMOVDQU XWORD1, 16(BX)
@@ -238,55 +301,113 @@ avx_8blocks:
 	VMOVDQU XWORD6, 96(BX)
 	VMOVDQU XWORD7, 112(BX)
 
-avx_sm4_done:
+	JMP avxCbcSm4Octets
 
+avxCbcSm4Nibbles:
+	CMPQ DI, $64
+	JLE avxCbCSm4Single
+	SUBQ $64, DI
+	LEAQ -64(DX), DX
+	LEAQ -64(BX), BX
+
+	VMOVDQU 0(DX), XWORD0
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	VMOVDQU 48(DX), XWORD3
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR -16(DX), XWORD0, XWORD0
+	VPXOR 0(DX), XWORD1, XWORD1
+	VPXOR 16(DX), XWORD2, XWORD2
+	VPXOR 32(DX), XWORD3, XWORD3
+
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+	VMOVDQU XWORD3, 48(BX)
+
+avxCbCSm4Single:
+	CMPQ DI, $16
+	JEQ avxCbcSm4Single16
+
+	CMPQ DI, $32
+	JEQ avxCbcSm4Single32
+
+	CMPQ DI, $48
+	JEQ avxCbcSm4Single48
+
+	VMOVDQU -64(DX), XWORD0
+	VMOVDQU -48(DX), XWORD1
+	VMOVDQU -32(DX), XWORD2
+	VMOVDQU -16(DX), XWORD3
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+	VPXOR -64(DX), XWORD1, XWORD1
+	VPXOR -48(DX), XWORD2, XWORD2
+	VPXOR -32(DX), XWORD3, XWORD3
+
+	VMOVDQU XWORD0, -64(BX)
+	VMOVDQU XWORD1, -48(BX)
+	VMOVDQU XWORD2, -32(BX)
+	VMOVDQU XWORD3, -16(BX)
+	JMP avxCbcSm4Done
+
+avxCbcSm4Single16:
+	VMOVDQU -16(DX), XWORD0
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+
+	VMOVDQU XWORD0, -16(BX)
+	JMP avxCbcSm4Done
+
+avxCbcSm4Single32:
+	VMOVDQU -32(DX), XWORD0
+	VMOVDQU -16(DX), XWORD1
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+	VPXOR -32(DX), XWORD1, XWORD1
+
+	VMOVDQU XWORD0, -32(BX)
+	VMOVDQU XWORD1, -16(BX)
+	JMP avxCbcSm4Done
+
+avxCbcSm4Single48:
+	VMOVDQU -48(DX), XWORD0
+	VMOVDQU -32(DX), XWORD1
+	VMOVDQU -16(DX), XWORD2
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+	VPXOR -48(DX), XWORD1, XWORD1
+	VPXOR -32(DX), XWORD2, XWORD2
+
+	VMOVDQU XWORD0, -48(BX)
+	VMOVDQU XWORD1, -32(BX)
+	VMOVDQU XWORD2, -16(BX)
+
+avxCbcSm4Done:
 	RET
 
-avx2:
+avx2Start:
 	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
-	CMPQ DI, $256
-	JEQ avx2_16blocks
-
-avx2_8blocks:
-	VMOVDQU 0(DX), XDWORD0
-	VMOVDQU 32(DX), XDWORD1
-	VMOVDQU 64(DX), XDWORD2
-	VMOVDQU 96(DX), XDWORD3
 	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
+	VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
-	// Apply Byte Flip Mask: LE -> BE
-	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
-	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
-	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
-	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
-
-	// Transpose matrix 4 x 4 32bits word
-	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
-
-	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-
-	// Transpose matrix 4 x 4 32bits word
-	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
-
-	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
-	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
-	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
-	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
-	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
-
-	VPXOR 0(SI), XDWORD0, XDWORD0
-	VPXOR 32(SI), XDWORD1, XDWORD1
-	VPXOR 64(SI), XDWORD2, XDWORD2
-	VPXOR 96(SI), XDWORD3, XDWORD3
-
-	VMOVDQU XDWORD0, 0(BX)
-	VMOVDQU XDWORD1, 32(BX)
-	VMOVDQU XDWORD2, 64(BX)
-	VMOVDQU XDWORD3, 96(BX)
-
-	VZEROUPPER
-	RET
-
 avx2_16blocks:
+	CMPQ DI, $256
+	JLE avx2CbcSm4Octets
+	SUBQ $256, DI
+	LEAQ -256(DX), DX
+	LEAQ -256(BX), BX
+
 	VMOVDQU 0(DX), XDWORD0
 	VMOVDQU 32(DX), XDWORD1
 	VMOVDQU 64(DX), XDWORD2
@@ -296,8 +417,6 @@ avx2_16blocks:
 	VMOVDQU 192(DX), XDWORD6
 	VMOVDQU 224(DX), XDWORD7
 
-	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
-
 	// Apply Byte Flip Mask: LE -> BE
 	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
 	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
@@ -309,33 +428,32 @@ avx2_16blocks:
 	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
 
 	// Transpose matrix 4 x 4 32bits word
-	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
-	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
 
 	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
 
 	// Transpose matrix 4 x 4 32bits word
-	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
-	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
 
-	VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
-	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
-	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
-	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
-	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
-	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
-	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
-	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
-	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
+	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
+	VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
+	VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
+	VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
+	VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7
 
-	VPXOR 0(SI), XDWORD0, XDWORD0
-	VPXOR 32(SI), XDWORD1, XDWORD1
-	VPXOR 64(SI), XDWORD2, XDWORD2
-	VPXOR 96(SI), XDWORD3, XDWORD3
-	VPXOR 128(SI), XDWORD4, XDWORD4
-	VPXOR 160(SI), XDWORD5, XDWORD5
-	VPXOR 192(SI), XDWORD6, XDWORD6
-	VPXOR 224(SI), XDWORD7, XDWORD7
+	VPXOR -16(DX), XDWORD0, XDWORD0
+	VPXOR 16(DX), XDWORD1, XDWORD1
+	VPXOR 48(DX), XDWORD2, XDWORD2
+	VPXOR 80(DX), XDWORD3, XDWORD3
+	VPXOR 112(DX), XDWORD4, XDWORD4
+	VPXOR 144(DX), XDWORD5, XDWORD5
+	VPXOR 176(DX), XDWORD6, XDWORD6
+	VPXOR 208(DX), XDWORD7, XDWORD7
 
 	VMOVDQU XDWORD0, 0(BX)
 	VMOVDQU XDWORD1, 32(BX)
@@ -346,6 +464,141 @@ avx2_16blocks:
 	VMOVDQU XDWORD6, 192(BX)
 	VMOVDQU XDWORD7, 224(BX)
 
-avx2_sm4_done:
+	JMP avx2_16blocks
+
+avx2CbcSm4Octets:
+	CMPQ DI, $128
+	JLE avx2CbcSm4Nibbles
+	SUBQ $128, DI
+	LEAQ -128(DX), DX
+	LEAQ -128(BX), BX
+
+	VMOVDQU 0(DX), XDWORD0
+	VMOVDQU 32(DX), XDWORD1
+	VMOVDQU 64(DX), XDWORD2
+	VMOVDQU 96(DX), XDWORD3
+
+	// Apply Byte Flip Mask: LE -> BE
+	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+
+	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+
+	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
+
+	VPXOR -16(DX), XDWORD0, XDWORD0
+	VPXOR 16(DX), XDWORD1, XDWORD1
+	VPXOR 48(DX), XDWORD2, XDWORD2
+	VPXOR 80(DX), XDWORD3, XDWORD3
+
+	VMOVDQU XDWORD0, 0(BX)
+	VMOVDQU XDWORD1, 32(BX)
+	VMOVDQU XDWORD2, 64(BX)
+	VMOVDQU XDWORD3, 96(BX)
+
+	JMP avx2CbcSm4Octets
+
+avx2CbcSm4Nibbles:
+	CMPQ DI, $64
+	JLE avx2CbCSm4Single
+	SUBQ $64, DI
+	LEAQ -64(DX), DX
+	LEAQ -64(BX), BX
+
+	VMOVDQU 0(DX), XWORD0
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	VMOVDQU 48(DX), XWORD3
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR -16(DX), XWORD0, XWORD0
+	VPXOR 0(DX), XWORD1, XWORD1
+	VPXOR 16(DX), XWORD2, XWORD2
+	VPXOR 32(DX), XWORD3, XWORD3
+
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+	VMOVDQU XWORD3, 48(BX)
+
+avx2CbCSm4Single:
+	CMPQ DI, $16
+	JEQ avx2CbcSm4Single16
+
+	CMPQ DI, $32
+	JEQ avx2CbcSm4Single32
+
+	CMPQ DI, $48
+	JEQ avx2CbcSm4Single48
+
+	VMOVDQU -64(DX), XWORD0
+	VMOVDQU -48(DX), XWORD1
+	VMOVDQU -32(DX), XWORD2
+	VMOVDQU -16(DX), XWORD3
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+	VPXOR -64(DX), XWORD1, XWORD1
+	VPXOR -48(DX), XWORD2, XWORD2
+	VPXOR -32(DX), XWORD3, XWORD3
+
+	VMOVDQU XWORD0, -64(BX)
+	VMOVDQU XWORD1, -48(BX)
+	VMOVDQU XWORD2, -32(BX)
+	VMOVDQU XWORD3, -16(BX)
+	JMP avx2CbcSm4Done
+
+avx2CbcSm4Single16:
+	VMOVDQU -16(DX), XWORD0
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+
+	VMOVDQU XWORD0, -16(BX)
+	JMP avx2CbcSm4Done
+
+avx2CbcSm4Single32:
+	VMOVDQU -32(DX), XWORD0
+	VMOVDQU -16(DX), XWORD1
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+	VPXOR -32(DX), XWORD1, XWORD1
+
+	VMOVDQU XWORD0, -32(BX)
+	VMOVDQU XWORD1, -16(BX)
+	JMP avx2CbcSm4Done
+
+avx2CbcSm4Single48:
+	VMOVDQU -48(DX), XWORD0
+	VMOVDQU -32(DX), XWORD1
+	VMOVDQU -16(DX), XWORD2
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VPXOR 0(SI), XWORD0, XWORD0
+	VPXOR -48(DX), XWORD1, XWORD1
+	VPXOR -32(DX), XWORD2, XWORD2
+
+	VMOVDQU XWORD0, -48(BX)
+	VMOVDQU XWORD1, -32(BX)
+	VMOVDQU XWORD2, -16(BX)
+
+avx2CbcSm4Done:
 	VZEROUPPER
 	RET
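Taken together, the new labels define a simple size ladder consumed from the tail of the buffer: the AVX2 path takes 256-byte (16-block) chunks while more than 256 bytes remain, and every path then falls through 128-byte and 64-byte chunks to a 16/32/48/64-byte tail. A Go rendering of that dispatch (sizes inferred from the CMPQ/JLE pairs above; a sketch, not the implementation):

// cbcChunks returns the chunk sizes the amd64 CBC decrypt path would
// process, back to front, for an n-byte ciphertext (n a multiple of 16).
func cbcChunks(n int, avx2 bool) []int {
	var out []int
	if avx2 {
		for n > 256 { // avx2_16blocks
			out = append(out, 256)
			n -= 256
		}
	}
	for n > 128 { // cbcSm4Octets / avxCbcSm4Octets / avx2CbcSm4Octets
		out = append(out, 128)
		n -= 128
	}
	if n > 64 { // cbcSm4Nibbles
		out = append(out, 64)
		n -= 64
	}
	if n > 0 { // 16-, 32-, 48-, or 64-byte single tail
		out = append(out, n)
	}
	return out
}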
@@ -93,56 +93,35 @@ done_sm4:
 #define t6 V12
 #define t7 V13
 
+#define dstPtr R1
+#define srcPtr R2
+#define rk R3
+#define rkSave R4
+#define srcPtrLen R5
+#define IV V18
+
 // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
 TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
 	LOAD_SM4_AESNI_CONSTS()
 
-	MOVD xk+0(FP), R8
-	MOVD dst+8(FP), R9
-	MOVD src+32(FP), R10
-	MOVD src_len+40(FP), R12
-	MOVD iv+56(FP), R11
-
-	CMP $128, R12
-	BEQ double_dec
-
-	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
-	VREV32 t0.B16, t0.B16
-	VREV32 t1.B16, t1.B16
-	VREV32 t2.B16, t2.B16
-	VREV32 t3.B16, t3.B16
-	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
-
 	VEOR ZERO.B16, ZERO.B16, ZERO.B16
-	EOR R0, R0
-
-encryptBlocksLoop:
-	SM4_ROUND(R8, R19, x, y, XTMP6, t0, t1, t2, t3)
-	SM4_ROUND(R8, R19, x, y, XTMP6, t1, t2, t3, t0)
-	SM4_ROUND(R8, R19, x, y, XTMP6, t2, t3, t0, t1)
-	SM4_ROUND(R8, R19, x, y, XTMP6, t3, t0, t1, t2)
-
-	ADD $16, R0
-	CMP $128, R0
-	BNE encryptBlocksLoop
-
-	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
-	VREV32 t0.B16, t0.B16
-	VREV32 t1.B16, t1.B16
-	VREV32 t2.B16, t2.B16
-	VREV32 t3.B16, t3.B16
-
-	VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]
-
-	VEOR V6.B16, t0.B16, t0.B16
-	VEOR V7.B16, t1.B16, t1.B16
-	VEOR V8.B16, t2.B16, t2.B16
-	VEOR V9.B16, t3.B16, t3.B16
-
-	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R9)
-	RET
-
-double_dec:
+	MOVD xk+0(FP), rk
+	MOVD dst+8(FP), dstPtr
+	MOVD src+32(FP), srcPtr
+	MOVD src_len+40(FP), srcPtrLen
+	MOVD iv+56(FP), R6
+	MOVD rk, rkSave
+	VLD1 (R6), [IV.B16]
+
+cbcSm4Octets:
+	CMP $128, srcPtrLen
+	BLE cbcSm4Nibbles
+	SUB $128, srcPtrLen
+	MOVD rkSave, rk
+	ADD srcPtr, srcPtrLen, R10
+	SUB $16, R10, R11
+	ADD dstPtr, srcPtrLen, R12
+
 	VLD1.P 64(R10), [t0.S4, t1.S4, t2.S4, t3.S4]
 	VLD1.P 64(R10), [t4.S4, t5.S4, t6.S4, t7.S4]
 	VREV32 t0.B16, t0.B16
@@ -153,21 +132,20 @@ double_dec:
 	VREV32 t5.B16, t5.B16
 	VREV32 t6.B16, t6.B16
 	VREV32 t7.B16, t7.B16
 
 	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
 	PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
 
-	VEOR ZERO.B16, ZERO.B16, ZERO.B16
 	EOR R0, R0
 
-decrypt8BlocksLoop:
-	SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
-	SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
-	SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
-	SM4_8BLOCKS_ROUND(R8, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
+cbc8BlocksLoop:
+	SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
+	SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
+	SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
+	SM4_8BLOCKS_ROUND(rk, R19, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
 
 	ADD $16, R0
 	CMP $128, R0
-	BNE decrypt8BlocksLoop
+	BNE cbc8BlocksLoop
 
 	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
 	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
@@ -192,7 +170,180 @@ decrypt8BlocksLoop:
 	VEOR V8.B16, t6.B16, t6.B16
 	VEOR V9.B16, t7.B16, t7.B16
 
-	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R9)
-	VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R9)
+	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(R12)
+	VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(R12)
 
+	B cbcSm4Octets
+
+cbcSm4Nibbles:
+	CMP $64, srcPtrLen
+	BLE cbcSm4Single
+	SUB $64, srcPtrLen
+	MOVD rkSave, rk
+	ADD srcPtr, srcPtrLen, R10
+	SUB $16, R10, R11
+	ADD dstPtr, srcPtrLen, R12
+
+	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+
+	EOR R0, R0
+
+cbc4BlocksLoop:
+	SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE cbc4BlocksLoop
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+
+	VLD1 (R11), [V6.S4, V7.S4, V8.S4, V9.S4]
+	VEOR V6.B16, t0.B16, t0.B16
+	VEOR V7.B16, t1.B16, t1.B16
+	VEOR V8.B16, t2.B16, t2.B16
+	VEOR V9.B16, t3.B16, t3.B16
+
+	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (R12)
+
+cbcSm4Single:
+	MOVD rkSave, rk
+	EOR R0, R0
+	MOVD srcPtr, R10
+
+	CMP $16, srcPtrLen
+	BEQ cbcSm4Single16
+
+	CMP $32, srcPtrLen
+	BEQ cbcSm4Single32
+
+	CMP $48, srcPtrLen
+	BEQ cbcSm4Single48
+
+	// 4 blocks
+	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+
+cbc4BlocksLoop64:
+	SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE cbc4BlocksLoop64
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+
+	VLD1 (srcPtr), [V6.S4, V7.S4, V8.S4]
+	VEOR IV.B16, t0.B16, t0.B16
+	VEOR V6.B16, t1.B16, t1.B16
+	VEOR V7.B16, t2.B16, t2.B16
+	VEOR V8.B16, t3.B16, t3.B16
+
+	VST1 [t0.S4, t1.S4, t2.S4, t3.S4], (dstPtr)
+
+	B cbcSm4Done
+
+cbcSm4Single16:
+	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VREV32 t0.B16, t0.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+
+cbc4BlocksLoop16:
+	SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE cbc4BlocksLoop16
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+
+	VEOR IV.B16, t0.B16, t0.B16
+
+	VST1 [t0.S4], (dstPtr)
+
+	B cbcSm4Done
+
+cbcSm4Single32:
+	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+
+cbc4BlocksLoop32:
+	SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE cbc4BlocksLoop32
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+
+	VLD1 (srcPtr), [V6.S4]
+	VEOR IV.B16, t0.B16, t0.B16
+	VEOR V6.B16, t1.B16, t1.B16
+
+	VST1 [t0.S4, t1.S4], (dstPtr)
+	B cbcSm4Done
+
+cbcSm4Single48:
+	VLD1 (R10), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+
+cbc4BlocksLoop48:
+	SM4_ROUND(rk, R19, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R19, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE cbc4BlocksLoop48
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+
+	VLD1 (srcPtr), [V6.S4, V7.S4]
+	VEOR IV.B16, t0.B16, t0.B16
+	VEOR V6.B16, t1.B16, t1.B16
+	VEOR V7.B16, t2.B16, t2.B16
+
+	VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)
+
+cbcSm4Done:
 	RET
@@ -49,43 +49,17 @@ func (b *sm4CipherAsm) NewECBDecrypter() cipher.BlockMode {
 
 func (x *ecb) BlockSize() int { return BlockSize }
 
+//go:noescape
+func encryptSm4Ecb(xk *uint32, dst, src []byte)
+
 func (x *ecb) CryptBlocks(dst, src []byte) {
 	x.validate(dst, src)
 	if len(src) == 0 {
 		return
 	}
-	for len(src) >= 2*x.b.blocksSize {
-		if x.enc == ecbEncrypt {
-			x.b.EncryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
-		} else {
-			x.b.DecryptBlocks(dst[:2*x.b.blocksSize], src[:2*x.b.blocksSize])
-		}
-		src = src[2*x.b.blocksSize:]
-		dst = dst[2*x.b.blocksSize:]
-	}
-	for len(src) >= x.b.blocksSize {
-		if x.enc == ecbEncrypt {
-			x.b.EncryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
-		} else {
-			x.b.DecryptBlocks(dst[:x.b.blocksSize], src[:x.b.blocksSize])
-		}
-		src = src[x.b.blocksSize:]
-		dst = dst[x.b.blocksSize:]
-	}
-	if len(src) > BlockSize {
-		temp := make([]byte, x.b.blocksSize)
-		copy(temp, src)
-		if x.enc == ecbEncrypt {
-			x.b.EncryptBlocks(temp, temp)
-		} else {
-			x.b.DecryptBlocks(temp, temp)
-		}
-		copy(dst, temp[:len(src)])
-	} else if len(src) > 0 {
-		if x.enc == ecbEncrypt {
-			x.b.Encrypt(dst, src)
-		} else {
-			x.b.Decrypt(dst, src)
-		}
-	}
+	xk := &x.b.enc[0]
+	if x.enc == ecbDecrypt {
+		xk = &x.b.dec[0]
+	}
+	encryptSm4Ecb(xk, dst, src)
 }
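The per-direction branching disappears from the hot path because SM4 decryption is SM4 encryption run with the round keys in reverse order; CryptBlocks only selects which expanded schedule (x.b.enc or x.b.dec, per the diff) to hand to the single encryptSm4Ecb routine. A minimal sketch of that key-schedule relationship (hypothetical helper; in the real cipher the dec schedule is derived once at key setup):

// reverseSchedule derives the decryption schedule from the encryption one:
// dec[i] = enc[31-i]. This is why one "encrypt" assembly routine can serve
// both ECB directions once the right schedule is passed in.
func reverseSchedule(enc [32]uint32) (dec [32]uint32) {
	for i, k := range enc {
		dec[31-i] = k
	}
	return
}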
sm4/ecb_sm4_amd64.s (new file, 371 lines)
@@ -0,0 +1,371 @@
+//go:build amd64 && !purego
+// +build amd64,!purego
+
+#include "textflag.h"
+
+#include "aesni_macros_amd64.s"
+
+#define XDWTMP0 Y0
+#define XDWTMP1 Y1
+
+#define XDWORD0 Y4
+#define XDWORD1 Y5
+#define XDWORD2 Y6
+#define XDWORD3 Y7
+
+#define XDWORD4 Y10
+#define XDWORD5 Y11
+#define XDWORD6 Y12
+#define XDWORD7 Y14
+
+#define XWTMP0 X0
+#define XWTMP1 X1
+#define XWTMP2 X2
+
+#define XWORD0 X4
+#define XWORD1 X5
+#define XWORD2 X6
+#define XWORD3 X7
+
+#define XWORD4 X10
+#define XWORD5 X11
+#define XWORD6 X12
+#define XWORD7 X14
+
+#define NIBBLE_MASK Y3
+#define X_NIBBLE_MASK X3
+
+#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
+#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE
+
+#define BSWAP_MASK Y2
+
+#define XDWORD Y8
+#define YDWORD Y9
+
+#define XWORD X8
+#define YWORD X9
+
+// func encryptSm4Ecb(xk *uint32, dst, src []byte)
+TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
+	MOVQ xk+0(FP), AX
+	MOVQ dst+8(FP), BX
+	MOVQ src+32(FP), DX
+	MOVQ src_len+40(FP), DI
+
+	CMPB ·useAVX2(SB), $1
+	JE avx2_start
+
+	CMPB ·useAVX(SB), $1
+	JE avxEcbSm4Octets
+
+ecbSm4Octets:
+	CMPQ DI, $128
+	JB ecbSm4Nibbles
+	SUBQ $128, DI
+
+	MOVOU 0(DX), XWORD0
+	MOVOU 16(DX), XWORD1
+	MOVOU 32(DX), XWORD2
+	MOVOU 48(DX), XWORD3
+	MOVOU 64(DX), XWORD4
+	MOVOU 80(DX), XWORD5
+	MOVOU 96(DX), XWORD6
+	MOVOU 112(DX), XWORD7
+
+	SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
+
+	MOVOU XWORD0, 0(BX)
+	MOVOU XWORD1, 16(BX)
+	MOVOU XWORD2, 32(BX)
+	MOVOU XWORD3, 48(BX)
+	MOVOU XWORD4, 64(BX)
+	MOVOU XWORD5, 80(BX)
+	MOVOU XWORD6, 96(BX)
+	MOVOU XWORD7, 112(BX)
+
+	LEAQ 128(BX), BX
+	LEAQ 128(DX), DX
+	JMP ecbSm4Octets
+
+ecbSm4Nibbles:
+	CMPQ DI, $64
+	JB ecbSm4Single
+	SUBQ $64, DI
+
+	MOVOU 0(DX), XWORD0
+	MOVOU 16(DX), XWORD1
+	MOVOU 32(DX), XWORD2
+	MOVOU 48(DX), XWORD3
+
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	MOVUPS XWORD0, 0(BX)
+	MOVUPS XWORD1, 16(BX)
+	MOVUPS XWORD2, 32(BX)
+	MOVUPS XWORD3, 48(BX)
+
+	LEAQ 64(BX), BX
+	LEAQ 64(DX), DX
+
+ecbSm4Single:
+	TESTQ DI, DI
+	JE ecbSm4Done
+
+	MOVOU 0(DX), XWORD0
+	CMPQ DI, $32
+	JEQ ecbSm4Single32
+	CMPQ DI, $48
+	JEQ ecbSm4Single48
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	MOVUPS XWORD0, 0(BX)
+	JMP ecbSm4Done
+
+ecbSm4Single32:
+	MOVOU 16(DX), XWORD1
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	MOVUPS XWORD0, 0(BX)
+	MOVUPS XWORD1, 16(BX)
+	JMP ecbSm4Done
+
+ecbSm4Single48:
+	MOVOU 16(DX), XWORD1
+	MOVOU 32(DX), XWORD2
+	SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	MOVUPS XWORD0, 0(BX)
+	MOVUPS XWORD1, 16(BX)
+	MOVUPS XWORD2, 32(BX)
+
+ecbSm4Done:
+	RET
+
+avxEcbSm4Octets:
+	CMPQ DI, $128
+	JB avxEcbSm4Nibbles
+	SUBQ $128, DI
+
+	VMOVDQU 0(DX), XWORD0
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	VMOVDQU 48(DX), XWORD3
+	VMOVDQU 64(DX), XWORD4
+	VMOVDQU 80(DX), XWORD5
+	VMOVDQU 96(DX), XWORD6
+	VMOVDQU 112(DX), XWORD7
+
+	AVX_SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)
+
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+	VMOVDQU XWORD3, 48(BX)
+	VMOVDQU XWORD4, 64(BX)
+	VMOVDQU XWORD5, 80(BX)
+	VMOVDQU XWORD6, 96(BX)
+	VMOVDQU XWORD7, 112(BX)
+
+	LEAQ 128(BX), BX
+	LEAQ 128(DX), DX
+	JMP avxEcbSm4Octets
+
+avxEcbSm4Nibbles:
+	CMPQ DI, $64
+	JB avxEcbSm4Single
+	SUBQ $64, DI
+
+	VMOVDQU 0(DX), XWORD0
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	VMOVDQU 48(DX), XWORD3
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+	VMOVDQU XWORD3, 48(BX)
+
+	LEAQ 64(BX), BX
+	LEAQ 64(DX), DX
+
+avxEcbSm4Single:
+	TESTQ DI, DI
+	JE avxEcbSm4Done
+
+	VMOVDQU 0(DX), XWORD0
+	CMPQ DI, $32
+	JEQ avxEcbSm4Single32
+	CMPQ DI, $48
+	JEQ avxEcbSm4Single48
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	VMOVDQU XWORD0, 0(BX)
+	JMP avxEcbSm4Done
+
+avxEcbSm4Single32:
+	VMOVDQU 16(DX), XWORD1
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	JMP avxEcbSm4Done
+
+avxEcbSm4Single48:
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+
+avxEcbSm4Done:
+	RET
+
+avx2_start:
+	VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
+	VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
+	VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
+
+avx2_16blocks:
+	CMPQ DI, $256
+	JB avx2EcbSm4Octets
+	SUBQ $256, DI
+
+	VMOVDQU 0(DX), XDWORD0
+	VMOVDQU 32(DX), XDWORD1
+	VMOVDQU 64(DX), XDWORD2
+	VMOVDQU 96(DX), XDWORD3
+	VMOVDQU 128(DX), XDWORD4
+	VMOVDQU 160(DX), XDWORD5
+	VMOVDQU 192(DX), XDWORD6
+	VMOVDQU 224(DX), XDWORD7
+
+	// Apply Byte Flip Mask: LE -> BE
+	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+	VPSHUFB BYTE_FLIP_MASK, XDWORD4, XDWORD4
+	VPSHUFB BYTE_FLIP_MASK, XDWORD5, XDWORD5
+	VPSHUFB BYTE_FLIP_MASK, XDWORD6, XDWORD6
+	VPSHUFB BYTE_FLIP_MASK, XDWORD7, XDWORD7
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
+
+	AVX2_SM4_16BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWTMP1, XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWORD4, XDWORD5, XDWORD6, XDWORD7)
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+	TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP0, XDWTMP1)
+
+	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
+	VPSHUFB BSWAP_MASK, XDWORD4, XDWORD4
+	VPSHUFB BSWAP_MASK, XDWORD5, XDWORD5
+	VPSHUFB BSWAP_MASK, XDWORD6, XDWORD6
+	VPSHUFB BSWAP_MASK, XDWORD7, XDWORD7
+
+	VMOVDQU XDWORD0, 0(BX)
+	VMOVDQU XDWORD1, 32(BX)
+	VMOVDQU XDWORD2, 64(BX)
+	VMOVDQU XDWORD3, 96(BX)
+	VMOVDQU XDWORD4, 128(BX)
+	VMOVDQU XDWORD5, 160(BX)
+	VMOVDQU XDWORD6, 192(BX)
+	VMOVDQU XDWORD7, 224(BX)
+
+	LEAQ 256(BX), BX
+	LEAQ 256(DX), DX
+	JMP avx2_16blocks
+
+avx2EcbSm4Octets:
+	CMPQ DI, $128
+	JB avx2EcbSm4Nibbles
+	SUBQ $128, DI
+
+	VMOVDQU 0(DX), XDWORD0
+	VMOVDQU 32(DX), XDWORD1
+	VMOVDQU 64(DX), XDWORD2
+	VMOVDQU 96(DX), XDWORD3
+
+	// Apply Byte Flip Mask: LE -> BE
+	VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BYTE_FLIP_MASK, XDWORD3, XDWORD3
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+
+	AVX2_SM4_8BLOCKS(AX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+
+	// Transpose matrix 4 x 4 32bits word
+	TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP0, XDWTMP1)
+
+	VPSHUFB BSWAP_MASK, XDWORD0, XDWORD0
+	VPSHUFB BSWAP_MASK, XDWORD1, XDWORD1
+	VPSHUFB BSWAP_MASK, XDWORD2, XDWORD2
+	VPSHUFB BSWAP_MASK, XDWORD3, XDWORD3
+
+	VMOVDQU XDWORD0, 0(BX)
+	VMOVDQU XDWORD1, 32(BX)
+	VMOVDQU XDWORD2, 64(BX)
+	VMOVDQU XDWORD3, 96(BX)
+
+	LEAQ 128(BX), BX
+	LEAQ 128(DX), DX
+	JMP avx2EcbSm4Octets
+
+avx2EcbSm4Nibbles:
+	CMPQ DI, $64
+	JB avx2EcbSm4Single
+	SUBQ $64, DI
+
+	VMOVDQU 0(DX), XWORD0
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	VMOVDQU 48(DX), XWORD3
+
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+	VMOVDQU XWORD3, 48(BX)
+
+	LEAQ 64(BX), BX
+	LEAQ 64(DX), DX
+
+avx2EcbSm4Single:
+	TESTQ DI, DI
+	JE avx2EcbSm4Done
+
+	VMOVDQU 0(DX), XWORD0
+	CMPQ DI, $32
+	JEQ avx2EcbSm4Single32
+	CMPQ DI, $48
+	JEQ avx2EcbSm4Single48
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	VMOVDQU XWORD0, 0(BX)
+	JMP avx2EcbSm4Done
+
+avx2EcbSm4Single32:
+	VMOVDQU 16(DX), XWORD1
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	JMP avx2EcbSm4Done
+
+avx2EcbSm4Single48:
+	VMOVDQU 16(DX), XWORD1
+	VMOVDQU 32(DX), XWORD2
+	AVX_SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)
+	VMOVDQU XWORD0, 0(BX)
+	VMOVDQU XWORD1, 16(BX)
+	VMOVDQU XWORD2, 32(BX)
+
+avx2EcbSm4Done:
+	VZEROUPPER
+	RET
sm4/ecb_sm4_arm64.s (new file, 207 lines)
@@ -0,0 +1,207 @@
+//go:build arm64 && !purego
+// +build arm64,!purego
+
+#include "textflag.h"
+
+#define x V0
+#define y V1
+#define t0 V2
+#define t1 V3
+#define t2 V4
+#define t3 V5
+#define ZERO V16
+#define NIBBLE_MASK V20
+#define INVERSE_SHIFT_ROWS V21
+#define M1L V22
+#define M1H V23
+#define M2L V24
+#define M2H V25
+#define R08_MASK V26
+#define R16_MASK V27
+#define R24_MASK V28
+#define FK_MASK V29
+#define XTMP6 V6
+#define XTMP7 V7
+#define t4 V10
+#define t5 V11
+#define t6 V12
+#define t7 V13
+
+#include "aesni_macros_arm64.s"
+
+// func encryptSm4Ecb(xk *uint32, dst, src []byte)
+TEXT ·encryptSm4Ecb(SB),NOSPLIT,$0
+#define dstPtr R1
+#define srcPtr R2
+#define rk R3
+#define rkSave R4
+#define srcPtrLen R5
+	LOAD_SM4_AESNI_CONSTS()
+	VEOR ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD xk+0(FP), rk
+	MOVD dst+8(FP), dstPtr
+	MOVD src+32(FP), srcPtr
+	MOVD src_len+40(FP), srcPtrLen
+	MOVD rk, rkSave
+
+ecbSm4Octets:
+	CMP $128, srcPtrLen
+	BLT ecbSm4Nibbles
+	SUB $128, srcPtrLen
+	MOVD rkSave, rk
+
+	VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VLD1.P 64(srcPtr), [t4.S4, t5.S4, t6.S4, t7.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+	VREV32 t4.B16, t4.B16
+	VREV32 t5.B16, t5.B16
+	VREV32 t6.B16, t6.B16
+	VREV32 t7.B16, t7.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	PRE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
+
+	EOR R0, R0
+
+ecb8BlocksLoop:
+	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t0, t1, t2, t3, t4, t5, t6, t7)
+	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t1, t2, t3, t0, t5, t6, t7, t4)
+	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t2, t3, t0, t1, t6, t7, t4, t5)
+	SM4_8BLOCKS_ROUND(rk, R6, x, y, XTMP6, XTMP7, t3, t0, t1, t2, t7, t4, t5, t6)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE ecb8BlocksLoop
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+	VREV32 t4.B16, t4.B16
+	VREV32 t5.B16, t5.B16
+	VREV32 t6.B16, t6.B16
+	VREV32 t7.B16, t7.B16
+
+	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
+	VST1.P [t4.S4, t5.S4, t6.S4, t7.S4], 64(dstPtr)
+
+	B ecbSm4Octets
+
+ecbSm4Nibbles:
+	CMP $64, srcPtrLen
+	BLT ecbSm4Single
+	SUB $64, srcPtrLen
+	MOVD rkSave, rk
+
+	VLD1.P 64(srcPtr), [t0.S4, t1.S4, t2.S4, t3.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+	PRE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+
+	EOR R0, R0
+
+ecb4BlocksLoop:
+	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE ecb4BlocksLoop
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VREV32 t3.B16, t3.B16
+	VST1.P [t0.S4, t1.S4, t2.S4, t3.S4], 64(dstPtr)
+
+ecbSm4Single:
+	CBZ srcPtrLen, ecbSm4Done
+	MOVD rkSave, rk
+	EOR R0, R0
+
+	CMP $16, srcPtrLen
+	BEQ ecbSm4Single16
+
+	CMP $32, srcPtrLen
+	BEQ ecbSm4Single32
+
+	CMP $48, srcPtrLen
+	BEQ ecbSm4Single48
+
+ecbSm4Single16:
+	VLD1.P 16(srcPtr), [t0.S4]
+	VREV32 t0.B16, t0.B16
+
+encryptBlocksLoop1:
+	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE encryptBlocksLoop1
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VST1.P [t0.S4], 16(dstPtr)
+
+	B ecbSm4Done
+
+ecbSm4Single32:
+	VLD1.P 32(srcPtr), [t0.S4, t1.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+
+encryptBlocksLoop2:
+	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE encryptBlocksLoop2
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VST1.P [t0.S4, t1.S4], 32(dstPtr)
+
+	B ecbSm4Done
+
+ecbSm4Single48:
+	VLD1.P 48(srcPtr), [t0.S4, t1.S4, t2.S4]
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+
+encryptBlocksLoop3:
+	SM4_ROUND(rk, R6, x, y, XTMP6, t0, t1, t2, t3)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t1, t2, t3, t0)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t2, t3, t0, t1)
+	SM4_ROUND(rk, R6, x, y, XTMP6, t3, t0, t1, t2)
+
+	ADD $16, R0
+	CMP $128, R0
+	BNE encryptBlocksLoop3
+
+	TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y, XTMP6, XTMP7)
+	VREV32 t0.B16, t0.B16
+	VREV32 t1.B16, t1.B16
+	VREV32 t2.B16, t2.B16
+	VST1.P [t0.S4, t1.S4, t2.S4], 48(dstPtr)
+
+ecbSm4Done:
+	RET
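The four-way SM4_ROUND rotation visible throughout these files (t0,t1,t2,t3 → t1,t2,t3,t0 → …) is the standard SM4 register schedule: each call rewrites one state word from the other three plus one round key, and eight loop iterations of four calls cover all 32 rounds (R0 advances by 16 per iteration up to 128). A pure-Go sketch of the same schedule, with the S-box substitution and linear rotations abstracted as a placeholder tau:

// sm4Rounds runs the 32 SM4 rounds over a rolling 4-word state; round i
// rewrites state[i%4] from the other three words plus round key rk[i].
func sm4Rounds(state *[4]uint32, rk [32]uint32, tau func(uint32) uint32) {
	for i := 0; i < 32; i++ {
		x := state[(i+1)%4] ^ state[(i+2)%4] ^ state[(i+3)%4] ^ rk[i]
		state[i%4] ^= tau(x)
	}
}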