From 64cda2957bef76c7f2f40380325773680881c4de Mon Sep 17 00:00:00 2001 From: Emman Date: Wed, 12 Jan 2022 16:06:39 +0800 Subject: [PATCH] [SM4] - implement gcmSm4Init --- .travis.yml | 4 +- sm4/asm_amd64.s | 13 +++-- sm4/gcm_amd64.s | 139 +++++++++++++++++++++++++++++++++++++++++--- sm4/gcm_arm64.s | 106 ++++++++++++++++++++++++++++++--- sm4/sm4_gcm.go | 6 +- sm4/sm4_gcm_test.go | 7 +-- 6 files changed, 242 insertions(+), 33 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9a8a7a3..7d36bcb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,5 +23,5 @@ install: script: - go test -v ./... -after_success: - - go test -v -short -bench . -run=^$ ./... +#after_success: +# - go test -v -short -bench . -run=^$ ./... diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index caf1b83..c9c528c 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -17,9 +17,9 @@ DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b GLOBL flip_mask<>(SB), RODATA, $16 // shuffle byte and word order -DATA flip_mask2<>+0x00(SB)/8, $0x08090a0b0c0d0e0f -DATA flip_mask2<>+0x08(SB)/8, $0x0001020304050607 -GLOBL flip_mask2<>(SB), RODATA, $16 +DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f +DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607 +GLOBL bswap_mask<>(SB), RODATA, $16 //nibble mask DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F @@ -408,6 +408,7 @@ avx2: CMPQ DI, $64 JBE avx2_4blocks +avx2_8blocks: VMOVDQU 0(DX), XDWORD0 VMOVDQU 32(DX), XDWORD1 VMOVDQU 64(DX), XDWORD2 @@ -438,7 +439,7 @@ avx2_loop: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - VBROADCASTI128 flip_mask2<>(SB), BYTE_FLIP_MASK + VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 @@ -481,7 +482,7 @@ avx_loop: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(XWORD0, XWORD1, XWORD2, XWORD3, XWTMP1, XWTMP2) - VMOVDQU flip_mask2<>(SB), X_BYTE_FLIP_MASK + VMOVDQU bswap_mask<>(SB), X_BYTE_FLIP_MASK VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 @@ -538,4 +539,4 @@ loop: PEXTRD $0, t0, R8 MOVL R8, 12(BX) done_sm4: - RET + RET diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 3da26ab..6d4a946 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -26,6 +26,45 @@ #define POLY X14 #define BSWAP X15 +// shuffle byte order from LE to BE +DATA flipMask<>+0x00(SB)/8, $0x0405060700010203 +DATA flipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b + +//nibble mask +DATA nibbleMask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F +DATA nibbleMask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F + +// inverse shift rows +DATA inverseShiftRows<>+0x00(SB)/8, $0x0B0E0104070A0D00 +DATA inverseShiftRows<>+0x08(SB)/8, $0x0306090C0F020508 + +// Affine transform 1 (low and high hibbles) +DATA m1Low<>+0x00(SB)/8, $0x9197E2E474720701 +DATA m1Low<>+0x08(SB)/8, $0xC7C1B4B222245157 + +DATA m1High<>+0x00(SB)/8, $0xE240AB09EB49A200 +DATA m1High<>+0x08(SB)/8, $0xF052B91BF95BB012 + +// Affine transform 2 (low and high hibbles) +DATA m2Low<>+0x00(SB)/8, $0x5B67F2CEA19D0834 +DATA m2Low<>+0x08(SB)/8, $0xEDD14478172BBE82 + +DATA m2High<>+0x00(SB)/8, $0xAE7201DD73AFDC00 +DATA m2High<>+0x08(SB)/8, $0x11CDBE62CC1063BF + +// left rotations of 32-bit words by 8-bit increments +DATA r08Mask<>+0x00(SB)/8, $0x0605040702010003 +DATA r08Mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B + +DATA r16Mask<>+0x00(SB)/8, $0x0504070601000302 +DATA r16Mask<>+0x08(SB)/8, $0x0D0C0F0E09080B0A + +DATA 
r24Mask<>+0x00(SB)/8, $0x0407060500030201 +DATA r24Mask<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 + +DATA fkMask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 +DATA fkMask<>+0x08(SB)/8, $0xb27022dc677d9197 + DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 @@ -63,6 +102,17 @@ DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff +GLOBL flipMask<>(SB), (NOPTR+RODATA), $16 +GLOBL nibbleMask<>(SB), (NOPTR+RODATA), $16 +GLOBL inverseShiftRows<>(SB), (NOPTR+RODATA), $16 +GLOBL m1Low<>(SB), (NOPTR+RODATA), $16 +GLOBL m1High<>(SB), (NOPTR+RODATA), $16 +GLOBL m2Low<>(SB), (NOPTR+RODATA), $16 +GLOBL m2High<>(SB), (NOPTR+RODATA), $16 +GLOBL r08Mask<>(SB), (NOPTR+RODATA), $16 +GLOBL r16Mask<>(SB), (NOPTR+RODATA), $16 +GLOBL r24Mask<>(SB), (NOPTR+RODATA), $16 +GLOBL fkMask<>(SB), (NOPTR+RODATA), $16 GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16 GLOBL andMask<>(SB), (NOPTR+RODATA), $240 @@ -137,19 +187,92 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #undef plen #undef dlen -// func precomputeTableAsm(productTable *[256]byte, src *[16]byte) -TEXT ·precomputeTableAsm(SB),NOSPLIT,$0 +#define SM4_SBOX(x, y, z) \ + ; \ //############################# inner affine ############################// + MOVOU x, z; \ + PAND nibbleMask<>(SB), z; \ //y = _mm_and_si128(x, c0f); + MOVOU m1Low<>(SB), y; \ + PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y); + PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); + PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f); + MOVOU m1High<>(SB), z; \ + PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x); + MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x); + PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y; + ; \ // inverse ShiftRows + PSHUFB inverseShiftRows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); + AESENCLAST nibbleMask<>(SB), x; \ // AESNI instruction + ; \ //############################# outer affine ############################// + MOVOU x, z; \ + PANDN nibbleMask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f); + MOVOU m2Low<>(SB), y; \ + PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z) + PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); + PAND nibbleMask<>(SB), x; \ //x = _mm_and_si128(x, c0f); + MOVOU m2High<>(SB), z; \ + PSHUFB x, z; \ + MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x) + PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y; + +#define SM4_TAO_L1(x, y, z) \ + SM4_SBOX(x, y, z); \ + ; \ //#################### 4 parallel L1 linear transforms ##################// + MOVOU x, y; \ + PSHUFB r08Mask<>(SB), y; \ //y = _mm_shuffle_epi8(x, r08) + PXOR x, y; \ //y = x xor _mm_shuffle_epi8(x, r08) + MOVOU x, z; \ + PSHUFB r16Mask<>(SB), z; \ + PXOR z, y; \ //y = x xor _mm_shuffle_epi8(x, r08) xor _mm_shuffle_epi8(x, r16) + MOVOU y, z; \ + PSLLL $2, z; \ + PSRLL $30, y; \ + POR z, y; \ //y = _mm_slli_epi32(y, 2) ^ _mm_srli_epi32(y, 30); + MOVOU x, z; \ + PSHUFB r24Mask<>(SB), z; \ + PXOR y, x; \ //x = x xor y + PXOR z, x //x = x xor y xor _mm_shuffle_epi8(x, r24); + +#define SM4_SINGLE_ROUND(index, RK, IND, x, y, z, t0, t1, t2, t3) \ + PINSRD $0, (index * 4)(RK)(IND*1), x; \ + PXOR t1, x; \ + PXOR t2, x; \ + PXOR t3, x; \ + SM4_TAO_L1(x, y, z); \ + PXOR x, t0 + +// func gcmSm4Init(productTable *[256]byte, rk []uint32) +TEXT ·gcmSm4Init(SB),NOSPLIT,$0 #define dst DI -#define SRC SI +#define RK SI MOVQ productTable+0(FP), dst - MOVQ src+8(FP), SRC + MOVQ rk+8(FP), RK - MOVOU bswapMask<>(SB), BSWAP MOVOU gcmPoly<>(SB), POLY - MOVOU (16*0)(SRC), B0 - PSHUFB BSWAP, B0 + // 
Encrypt block 0, with the sm4 round keys to generate the hash key H + PXOR B0, B0 + PXOR B1, B1 + PXOR B2, B2 + PXOR B3, B3 + XORL CX, CX + +sm4InitEncLoop: + SM4_SINGLE_ROUND(0, RK, CX, T0, T1, T2, B0, B1, B2, B3) + SM4_SINGLE_ROUND(1, RK, CX, T0, T1, T2, B1, B2, B3, B0) + SM4_SINGLE_ROUND(2, RK, CX, T0, T1, T2, B2, B3, B0, B1) + SM4_SINGLE_ROUND(3, RK, CX, T0, T1, T2, B3, B0, B1, B2) + + ADDL $16, CX + CMPL CX, $4*32 + JB sm4InitEncLoop + + PEXTRD $0, B1, R8 + PINSRD $1, R8, B0 + PEXTRD $0, B2, R8 + PINSRD $2, R8, B0 + PEXTRD $0, B3, R8 + PINSRD $3, R8, B0 // H * 2 PSHUFD $0xff, B0, T0 @@ -209,7 +332,7 @@ initLoop: RET -#undef SRC +#undef RK #undef dst // func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s index 4712187..d9d93ad 100644 --- a/sm4/gcm_arm64.s +++ b/sm4/gcm_arm64.s @@ -106,14 +106,86 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 #undef plen #undef dlen -// func precomputeTableAsm(productTable *[256]byte, src *[16]byte) -TEXT ·precomputeTableAsm(SB),NOSPLIT,$0 +#define SM4_SBOX(x, y, z, z1, z2) \ + VMOV $0x0F0F0F0F0F0F0F0F, z1.D2; \ + VAND x.B16, z1.B16, z2.B16; \ + MOVD $0x9197E2E474720701, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0xC7C1B4B222245157, R19; \ + VMOV R19, z.D[1]; \ + VTBL z2.B16, [z.B16], y.B16; \ + VUSHR $4, x.D2, x.D2; \ + VAND x.B16, z1.B16, z2.B16; \ + MOVD $0xE240AB09EB49A200, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0xF052B91BF95BB012, R19; \ + VMOV R19, z.D[1]; \ + VTBL z2.B16, [z.B16], z2.B16; \ + VEOR y.B16, z2.B16, x.B16; \ + MOVD $0x0B0E0104070A0D00, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0x0306090C0F020508, R19; \ + VMOV R19, z.D[1]; \ + VTBL z.B16, [x.B16], x.B16; \ + AESE ZERO.B16, x.B16; \ + VAND x.B16, z1.B16, z2.B16; \ + MOVD $0x5B67F2CEA19D0834, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0xEDD14478172BBE82, R19; \ + VMOV R19, z.D[1]; \ + VTBL z2.B16, [z.B16], y.B16; \ + VUSHR $4, x.D2, x.D2; \ + VAND x.B16, z1.B16, z2.B16; \ + MOVD $0xAE7201DD73AFDC00, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0x11CDBE62CC1063BF, R19; \ + VMOV R19, z.D[1]; \ + VTBL z2.B16, [z.B16], z2.B16; \ + VEOR y.B16, z2.B16, x.B16 + +#define SM4_TAO_L1(x, y, z, z1, z2) \ + SM4_SBOX(x, y, z, z1, z2); \ + ; \ + MOVD $0x0605040702010003, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0x0E0D0C0F0A09080B, R19; \ + VMOV R19, z.D[1]; \ + VTBL z.B16, [x.B16], y.B16; \ + VEOR y.B16, x.B16, y.B16; \ + MOVD $0x0504070601000302, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0x0D0C0F0E09080B0A , R19; \ + VMOV R19, z.D[1]; \ + VTBL z.B16, [x.B16], z.B16; \ + VEOR z.B16, y.B16, y.B16; \ + VSHL $2, y.S4, z.S4; \ + VUSHR $30, y.S4, y.S4; \ + VORR y.B16, z.B16, y.B16; \ + MOVD $0x0407060500030201, R19; \ + VMOV R19, z.D[0]; \ + MOVD $0x0C0F0E0D080B0A09, R19; \ + VMOV R19, z.D[1]; \ + VTBL z.B16, [x.B16], z.B16; \ + VEOR z.B16, x.B16, x.B16; \ + VEOR y.B16, x.B16, x.B16 + +#define SM4_ROUND(RK, x, y, z, z1, z2, t0, t1, t2, t3) \ + MOVW.P 4(RK), R19; \ + VMOV R19, x.S4; \ + VEOR t1.B16, x.B16, x.B16; \ + VEOR t2.B16, x.B16, x.B16; \ + VEOR t3.B16, x.B16, x.B16; \ + SM4_TAO_L1(x, y, z, z1, z2); \ + VEOR x.B16, t0.B16, t0.B16 + +// func gcmSm4Init(productTable *[256]byte, rk []uint32) +TEXT ·gcmSm4Init(SB),NOSPLIT,$0 #define pTbl R0 -#define SRC R1 -#define I R3 +#define RK R1 +#define I R2 MOVD productTable+0(FP), pTbl - MOVD src+8(FP), SRC + MOVD rk+8(FP), RK MOVD $0xC2, I LSL $56, I @@ -122,8 +194,26 @@ TEXT ·precomputeTableAsm(SB),NOSPLIT,$0 VMOV I, POLY.D[1] VEOR ZERO.B16, ZERO.B16, ZERO.B16 - VLD1 (SRC), [B0.B16] - VREV64 B0.B16, B0.B16 + // Encrypt block 0 with the SM4 
keys to generate the hash key H + VEOR B0.B16, B0.B16, B0.B16 + VEOR B1.B16, B1.B16, B1.B16 + VEOR B2.B16, B2.B16, B2.B16 + VEOR B3.B16, B3.B16, B3.B16 + EOR R3, R3 + +sm4InitEncLoop: + SM4_ROUND(RK, K0, K1, K2, K3, K4, B0, B1, B2, B3) + SM4_ROUND(RK, K0, K1, K2, K3, K4, B1, B2, B3, B0) + SM4_ROUND(RK, K0, K1, K2, K3, K4, B2, B3, B0, B1) + SM4_ROUND(RK, K0, K1, K2, K3, K4, B3, B0, B1, B2) + + ADD $16, R3 + CMP $128, R3 + BNE sm4InitEncLoop + + VMOV B1.S[0], B0.S[1] + VMOV B2.S[0], B0.S[2] + VMOV B3.S[0], B0.S[3] // Multiply by 2 modulo P VMOV B0.D[0], I @@ -181,7 +271,7 @@ initLoop: BNE initLoop RET #undef I -#undef SRC +#undef RK #undef pTbl // func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) diff --git a/sm4/sm4_gcm.go b/sm4/sm4_gcm.go index 2e277ab..3c13729 100644 --- a/sm4/sm4_gcm.go +++ b/sm4/sm4_gcm.go @@ -21,7 +21,7 @@ type sm4CipherGCM struct { var _ gcmAble = (*sm4CipherGCM)(nil) //go:noescape -func precomputeTableAsm(productTable *[256]byte, src *[16]byte) +func gcmSm4Init(productTable *[256]byte, rk []uint32) //go:noescape func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte) @@ -41,9 +41,7 @@ func (c *sm4CipherGCM) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) { g.cipher = &c.sm4CipherAsm g.nonceSize = nonceSize g.tagSize = tagSize - var key [gcmBlockSize]byte - c.Encrypt(key[:], key[:]) - precomputeTableAsm(&g.bytesProductTable, &key) + gcmSm4Init(&g.bytesProductTable, g.cipher.enc) return g, nil } diff --git a/sm4/sm4_gcm_test.go b/sm4/sm4_gcm_test.go index d892f65..75acca9 100644 --- a/sm4/sm4_gcm_test.go +++ b/sm4/sm4_gcm_test.go @@ -15,10 +15,7 @@ func genPrecomputeTable() *gcmAsm { c1 := &sm4CipherGCM{c} g := &gcmAsm{} g.cipher = &c1.sm4CipherAsm - var key1 [gcmBlockSize]byte - c1.Encrypt(key1[:], key1[:]) - fmt.Printf("%v\n", key1) - precomputeTableAsm(&g.bytesProductTable, &key1) + gcmSm4Init(&g.bytesProductTable, g.cipher.enc) return g } @@ -59,7 +56,7 @@ arm64 result = { 0xCD, 0x01, 0x2B, 0xA4, 0xF6, 0x8E, 0x45, 0x62, 0xCD, 0x01, 0x2B, 0xA4, 0xF6, 0x8E, 0x45, 0x62, } */ -func TestPrecomputeTableAsm(t *testing.T) { +func TestGcmSm4Init(t *testing.T) { g := genPrecomputeTable() for i := 0; i < 16; i++ { for j := 0; j < 16; j++ {
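
Reviewer note: gcmSm4Init now derives the GHASH hash key H inside the assembly by running all 32 SM4 rounds over an all-zero block with the expanded round keys, where NewGCM previously encrypted a zero block in Go and handed the result to precomputeTableAsm. Below is a minimal pure-Go sketch of the H value that GCM defines and that the removed Go code computed; the import path and the exported NewCipher constructor are assumptions for illustration, not something this patch introduces.

    package main

    import (
        "encoding/hex"
        "fmt"

        "github.com/emmansun/gmsm/sm4" // assumed module path for this repository
    )

    func main() {
        key := make([]byte, 16) // any SM4 key; all zeros only to keep the sketch short
        c, err := sm4.NewCipher(key)
        if err != nil {
            panic(err)
        }
        // H = E_K(0^128): encrypt the all-zero block in place, exactly what the
        // removed Go code in NewGCM did before calling precomputeTableAsm.
        var h [16]byte
        c.Encrypt(h[:], h[:])
        fmt.Println(hex.EncodeToString(h[:]))
    }

Note that gcmSm4Init never materializes H as a byte string: it keeps the state words in registers and reorders the B1/B2/B3 lanes into B0 before building the product table, so the easiest place to compare against this reference is the resulting bytesProductTable, which is what TestGcmSm4Init inspects.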
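
Reviewer note: with the Go-side change in sm4_gcm.go, NewGCM now passes the expanded round keys (g.cipher.enc) straight to gcmSm4Init, so no zero-block encryption happens in Go anymore. Nothing changes for callers; a typical use of the fast path still goes through crypto/cipher, roughly as sketched below. The module path is assumed as above, and the claim that NewCipher returns the GCM-able cipher on supported CPUs is an assumption based on the var _ gcmAble assertion in sm4_gcm.go, not something stated by this patch.

    package main

    import (
        "crypto/cipher"
        "crypto/rand"
        "fmt"

        "github.com/emmansun/gmsm/sm4" // assumed module path for this repository
    )

    func main() {
        key := make([]byte, 16)
        block, err := sm4.NewCipher(key) // on supported CPUs this should return the GCM-able cipher
        if err != nil {
            panic(err)
        }
        // cipher.NewGCM detects the NewGCM(nonceSize, tagSize int) method and hands
        // control to sm4CipherGCM.NewGCM, which now seeds bytesProductTable via gcmSm4Init.
        aead, err := cipher.NewGCM(block)
        if err != nil {
            panic(err)
        }
        nonce := make([]byte, aead.NonceSize())
        if _, err := rand.Read(nonce); err != nil {
            panic(err)
        }
        sealed := aead.Seal(nil, nonce, []byte("hello sm4-gcm"), nil)
        fmt.Printf("%x\n", sealed)
    }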