diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s
index 799b21d..2a8c690 100644
--- a/sm4/asm_amd64.s
+++ b/sm4/asm_amd64.s
@@ -56,7 +56,23 @@ DATA r24_mask<>+0x00(SB)/8, $0x0407060500030201
 DATA r24_mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
 GLOBL r24_mask<>(SB), RODATA, $16
 
-#define SM4_TAO_L1(x, y) \
+DATA fk00_mask<>+0x00(SB)/8, $0xa3b1bac6a3b1bac6
+DATA fk00_mask<>+0x08(SB)/8, $0xa3b1bac6a3b1bac6
+GLOBL fk00_mask<>(SB), RODATA, $16
+
+DATA fk01_mask<>+0x00(SB)/8, $0x56aa335056aa3350
+DATA fk01_mask<>+0x08(SB)/8, $0x56aa335056aa3350
+GLOBL fk01_mask<>(SB), RODATA, $16
+
+DATA fk02_mask<>+0x00(SB)/8, $0x677d9197677d9197
+DATA fk02_mask<>+0x08(SB)/8, $0x677d9197677d9197
+GLOBL fk02_mask<>(SB), RODATA, $16
+
+DATA fk03_mask<>+0x00(SB)/8, $0xb27022dcb27022dc
+DATA fk03_mask<>+0x08(SB)/8, $0xb27022dcb27022dc
+GLOBL fk03_mask<>(SB), RODATA, $16
+
+#define SM4_SBOX(x, y) \
     ; \ //############################# inner affine ############################//
     MOVOU x, XTMP6; \
     PAND nibble_mask<>(SB), XTMP6; \ //y = _mm_and_si128(x, c0f);
@@ -82,6 +98,9 @@ GLOBL r24_mask<>(SB), RODATA, $16
     PSHUFB x, XTMP6; \
     MOVOU XTMP6, x; \ //x = _mm_shuffle_epi8(m2h, x)
     PXOR y, x; \ //x = _mm_shuffle_epi8(m2h, x) ^ y;
+
+#define SM4_TAO_L1(x, y) \
+    SM4_SBOX(x, y); \
     ; \ //#################### 4 parallel L1 linear transforms ##################//
     MOVOU x, y; \
     PSHUFB r08_mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08)
@@ -98,6 +117,96 @@ GLOBL r24_mask<>(SB), RODATA, $16
     PXOR y, x; \ //x = x xor y
     PXOR XTMP7, x //x = x xor y xor _mm_shuffle_epi8(x, r24);
 
+#define SM4_TAO_L2(x, y) \
+    SM4_SBOX(x, y); \
+    ; \ //#################### 4 parallel L2 linear transforms ##################//
+    MOVOU x, y; \
+    MOVOU x, XTMP6; \
+    PSLLL $13, XTMP6; \
+    PSRLL $19, y; \
+    PXOR XTMP6, y; \ //y = x roll 13
+    PSLLL $10, XTMP6; \
+    MOVOU x, XTMP7; \
+    PSRLL $9, XTMP7; \
+    PXOR XTMP6, XTMP7; \ //XTMP7 = x roll 23
+    PXOR XTMP7, y; \
+    PXOR y, x
+
+// func expandKeyAsm(key *byte, ck, enc, dec *uint32)
+TEXT ·expandKeyAsm(SB),NOSPLIT,$0
+    MOVQ key+0(FP), AX
+    MOVQ ck+8(FP), BX
+    MOVQ enc+16(FP), DX
+    MOVQ dec+24(FP), DI
+
+    PINSRD $0, 0(AX), t0
+    PSHUFB flip_mask<>(SB), t0
+    PXOR fk00_mask<>(SB), t0
+
+    PINSRD $0, 4(AX), t1
+    PSHUFB flip_mask<>(SB), t1
+    PXOR fk01_mask<>(SB), t1
+
+    PINSRD $0, 8(AX), t2
+    PSHUFB flip_mask<>(SB), t2
+    PXOR fk02_mask<>(SB), t2
+
+    PINSRD $0, 12(AX), t3
+    PSHUFB flip_mask<>(SB), t3
+    PXOR fk03_mask<>(SB), t3
+
+    XORL CX, CX
+    MOVL $112, SI
+
+loop:
+    PINSRD $0, 0(BX)(CX*1), x
+    PXOR t1, x
+    PXOR t2, x
+    PXOR t3, x
+    SM4_TAO_L2(x, y)
+    PXOR x, t0
+    PEXTRD $0, t0, R8
+    MOVL R8, 0(DX)(CX*1)
+    MOVL R8, 12(DI)(SI*1)
+
+    PINSRD $0, 4(BX)(CX*1), x
+    PXOR t0, x
+    PXOR t2, x
+    PXOR t3, x
+    SM4_TAO_L2(x, y)
+    PXOR x, t1
+    PEXTRD $0, t1, R8
+    MOVL R8, 4(DX)(CX*1)
+    MOVL R8, 8(DI)(SI*1)
+
+    PINSRD $0, 8(BX)(CX*1), x
+    PXOR t0, x
+    PXOR t1, x
+    PXOR t3, x
+    SM4_TAO_L2(x, y)
+    PXOR x, t2
+    PEXTRD $0, t2, R8
+    MOVL R8, 8(DX)(CX*1)
+    MOVL R8, 4(DI)(SI*1)
+
+    PINSRD $0, 12(BX)(CX*1), x
+    PXOR t0, x
+    PXOR t1, x
+    PXOR t2, x
+    SM4_TAO_L2(x, y)
+    PXOR x, t3
+    PEXTRD $0, t3, R8
+    MOVL R8, 12(DX)(CX*1)
+    MOVL R8, 0(DI)(SI*1)
+
+    ADDL $16, CX
+    SUBL $16, SI
+    CMPL CX, $4*32
+    JB loop
+
+expand_end:
+    RET
+
 // func encryptBlocksAsm(xk *uint32, dst, src *byte)
 TEXT ·encryptBlocksAsm(SB),NOSPLIT,$0
     MOVQ xk+0(FP), AX
diff --git a/sm4/cipher_asm.go b/sm4/cipher_asm.go
index 4632bd6..101ed60 100644
--- a/sm4/cipher_asm.go
+++ b/sm4/cipher_asm.go
@@ -11,6 +11,9 @@ import (
 //go:noescape
 func encryptBlocksAsm(xk *uint32, dst, src *byte)
 
+//go:noescape
+func expandKeyAsm(key *byte, ck, enc, dec *uint32)
+
 type sm4CipherAsm struct {
     sm4Cipher
 }
@@ -23,7 +26,7 @@ func newCipher(key []byte) (cipher.Block, error) {
         return newCipherGeneric(key)
     }
     c := sm4CipherAsm{sm4Cipher{make([]uint32, rounds), make([]uint32, rounds)}}
-    expandKeyGo(key, c.enc, c.dec)
+    expandKeyAsm(&key[0], &ck[0], &c.enc[0], &c.dec[0])
    if supportsAES && supportsGFMUL {
         return &sm4CipherGCM{c}, nil
     }
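
For review purposes, here is a minimal pure-Go sketch of the key schedule the new `expandKeyAsm` routine computes. It is not part of the change; the package name and the identifiers `tauL2` and `expandKeyRef` are illustrative, and the S-box and CK tables are taken as parameters rather than naming the package's internal tables. The `fk` values match the `fk00_mask`..`fk03_mask` constants added in the assembly, and the byte-swap done by `flip_mask` corresponds to the big-endian loads below.

```go
package sm4ref

import (
	"encoding/binary"
	"math/bits"
)

// FK constants; the same 32-bit words are replicated in fk00_mask..fk03_mask.
var fk = [4]uint32{0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc}

// tauL2 applies the SM4 S-box to each byte of x and then the key-schedule
// linear transform L'(b) = b ^ (b <<< 13) ^ (b <<< 23), i.e. the scalar
// equivalent of what SM4_TAO_L2 does on four lanes at once.
func tauL2(x uint32, sbox *[256]byte) uint32 {
	b := uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
		uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
	return b ^ bits.RotateLeft32(b, 13) ^ bits.RotateLeft32(b, 23)
}

// expandKeyRef mirrors the assembly's data flow: K0..K3 = MK_i ^ FK_i, then
// rk_i = K_i ^ T'(K_{i+1} ^ K_{i+2} ^ K_{i+3} ^ CK_i); enc is filled front to
// back while dec receives the same round keys back to front.
func expandKeyRef(key []byte, sbox *[256]byte, ck *[32]uint32, enc, dec []uint32) {
	var k [4]uint32
	for i := 0; i < 4; i++ {
		// PSHUFB flip_mask + PXOR fkXX_mask == big-endian load XOR FK[i].
		k[i] = binary.BigEndian.Uint32(key[4*i:]) ^ fk[i]
	}
	for i := 0; i < 32; i++ {
		k[i%4] ^= tauL2(k[(i+1)%4]^k[(i+2)%4]^k[(i+3)%4]^ck[i], sbox)
		enc[i] = k[i%4]
		dec[31-i] = k[i%4]
	}
}
```

Filling `dec` in reverse is what the mirrored stores (`MOVL R8, 12(DI)(SI*1)` with `SI` counting down from 112 while `CX` counts up) accomplish, so decryption can reuse the same round code with the schedule already reversed.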