diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s
index b5038b1..a73abc7 100644
--- a/sm4/asm_arm64.s
+++ b/sm4/asm_arm64.s
@@ -8,7 +8,16 @@
 #define t3 V5
 #define ZERO V16
 #define FLIP_MASK V17
-
+#define NIBBLE_MASK V20
+#define INVERSE_SHIFT_ROWS V30
+#define M1L V22
+#define M1H V23
+#define M2L V24
+#define M2H V25
+#define R08_MASK V26
+#define R16_MASK V27
+#define R24_MASK V28
+#define FK_MASK V29
 #define XTMP6 V6
 #define XTMP7 V7
 
@@ -23,8 +32,8 @@ DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
 GLOBL nibble_mask<>(SB), (NOPTR+RODATA), $16
 
 // inverse shift rows
-DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
-DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
+DATA inverse_shift_rows<>+0x00(SB)/8, $0x0106050403020700
+DATA inverse_shift_rows<>+0x08(SB)/8, $0x0F0E0D0C0B0A0908
 GLOBL inverse_shift_rows<>(SB), (NOPTR+RODATA), $16
 
 // Affine transform 1 (low and high hibbles)
@@ -64,37 +73,19 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 #define SM4_SBOX(x, y) \
 	; \ //############################# inner affine ############################//
-	LDP nibble_mask<>(SB), (R0, R1); \
-	VMOV R0, XTMP6.D[0]; \
-	VMOV R1, XTMP6.D[1]; \
-	VAND x.B16, XTMP6.B16, XTMP7.B16; \
-	LDP m1_low<>(SB), (R0, R1); \
-	VMOV R0, y.D[0]; \
-	VMOV R1, y.D[1]; \
-	VTBL XTMP7.B16, [y.B16], y.B16; \
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M1L.B16], y.B16; \
 	VUSHR $4, x.D2, x.D2; \
-	VAND x.B16, XTMP6.B16, XTMP7.B16; \
-	LDP m1_low<>(SB), (R0, R1); \
-	VMOV R0, V8.D[0]; \
-	VMOV R1, V8.D[1]; \
-	VTBL XTMP7.B16, [V8.B16], XTMP7.B16; \
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M1H.B16], XTMP7.B16; \
 	VEOR y.B16, XTMP7.B16, x.B16; \
-	LDP inverse_shift_rows<>(SB), (R0, R1); \
-	VMOV R0, V8.D[0]; \
-	VMOV R1, V8.D[1]; \
-	VTBL V8.B16, [x.B16], x.B16; \
+	VTBL INVERSE_SHIFT_ROWS.B16, [x.B16], x.B16; \
 	AESE ZERO.B16, x.B16; \
-	VAND x.B16, XTMP6.B16, XTMP7.B16; \
-	LDP m2_low<>(SB), (R0, R1); \
-	VMOV R0, y.D[0]; \
-	VMOV R1, y.D[1]; \
-	VTBL XTMP7.B16, [y.B16], y.B16; \
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M2L.B16], y.B16; \
 	VUSHR $4, x.D2, x.D2; \
-	VAND x.B16, XTMP6.B16, XTMP7.B16; \
-	LDP m2_high<>(SB), (R0, R1); \
-	VMOV R0, V8.D[0]; \
-	VMOV R1, V8.D[1]; \
-	VTBL XTMP7.B16, [V8.B16], XTMP7.B16; \
+	VAND x.B16, NIBBLE_MASK.B16, XTMP7.B16; \
+	VTBL XTMP7.B16, [M2H.B16], XTMP7.B16; \
 	VEOR y.B16, XTMP7.B16, x.B16
 
 #define SM4_TAO_L1(x, y) \
@@ -121,11 +112,7 @@ GLOBL fk_mask<>(SB), (NOPTR+RODATA), $16
 	VEOR y.B16, x.B16, x.B16
 
 #define SM4_TAO_L2(x, y) \
-	VMOV R0, XTMP6.D[0]; \
-	VMOV R1, XTMP6.D[1]; \
-	VAND x.B16, XTMP6.B16, XTMP7.B16; \
-	VMOV R0, y.D[0]; \
-	VMOV R1, y.D[1]; \
+	SM4_SBOX(x, y); \
 	; \ //#################### 4 parallel L2 linear transforms ##################//
 	VSHL $13, x.S4, XTMP6.S4; \
 	VUSHR $19, x.S4, y.S4; \
@@ -142,16 +129,42 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
 	MOVD ck+8(FP), R9
 	MOVD enc+16(FP), R10
 	MOVD dec+24(FP), R11
-
-	VLD1 (R8), [t0.B16];
+
 	LDP flip_mask<>(SB), (R0, R1)
 	VMOV R0, FLIP_MASK.D[0]
 	VMOV R1, FLIP_MASK.D[1]
-	VTBL FLIP_MASK.B16, [t0.B16], t0.B16
+
+	LDP nibble_mask<>(SB), (R0, R1)
+	VMOV R0, NIBBLE_MASK.D[0]
+	VMOV R1, NIBBLE_MASK.D[1]
+
+	LDP m1_low<>(SB), (R0, R1)
+	VMOV R0, M1L.D[0]
+	VMOV R1, M1L.D[1]
+
+	LDP m1_high<>(SB), (R0, R1)
+	VMOV R0, M1H.D[0]
+	VMOV R1, M1H.D[1]
+
+	LDP m2_low<>(SB), (R0, R1)
+	VMOV R0, M2L.D[0]
+	VMOV R1, M2L.D[1]
+
+	LDP m2_high<>(SB), (R0, R1)
+	VMOV R0, M2H.D[0]
+	VMOV R1, M2H.D[1]
+
 	LDP fk_mask<>(SB), (R0, R1)
-	VMOV R0, XTMP7.D[0]
-	VMOV R1, XTMP7.D[1]
-	VEOR t0.B16, XTMP7.B16, t0.B16
+	VMOV R0, FK_MASK.D[0]
+	VMOV R1, FK_MASK.D[1]
+
+	LDP inverse_shift_rows<>(SB), (R0, R1)
+	VMOV R0, INVERSE_SHIFT_ROWS.D[0]
+	VMOV R1, INVERSE_SHIFT_ROWS.D[1]
+
+	VLD1 (R8), [t0.B16];
+	VTBL FLIP_MASK.B16, [t0.B16], t0.B16
+	VEOR t0.B16, FK_MASK.B16, t0.B16
 	VMOV t0.S[1], t1.S[0]
 	VMOV t0.S[2], t2.S[0]
 	VMOV t0.S[3], t3.S[0]
@@ -208,6 +221,7 @@ ksLoop:
 	ADD $16, R0
 	CMP $128, R0
 	BNE ksLoop
+	RET
 
 
 // func encryptBlocksAsm(xk *uint32, dst, src *byte)
diff --git a/sm4/cipher_asm_fuzzy_test.go b/sm4/cipher_asm_fuzzy_test.go
index a390971..0ade790 100644
--- a/sm4/cipher_asm_fuzzy_test.go
+++ b/sm4/cipher_asm_fuzzy_test.go
@@ -4,9 +4,7 @@
 package sm4
 
 import (
-	"crypto/rand"
 	"fmt"
-	"io"
 	"testing"
 )
 
@@ -46,15 +44,16 @@
 */
 
 func TestExpandKeySimple(t *testing.T) {
-	key := make([]byte, 16)
+	key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
 	encRes1 := make([]uint32, 32)
 	decRes1 := make([]uint32, 32)
 	encRes2 := make([]uint32, 32)
 	decRes2 := make([]uint32, 32)
-	io.ReadFull(rand.Reader, key)
+
 	expandKeyGo(key, encRes1, decRes1)
 	expandKeyAsm(&key[0], &ck[0], &encRes2[0], &decRes2[0])
 	fmt.Printf("expected=%v, result=%v\n", encRes1, encRes2)
 	fmt.Printf("expected=%v, result=%v\n", decRes1, decRes2)
 }
 
+
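
Note (not part of the patch): a minimal sketch of how the fixed-key comparison in TestExpandKeySimple could be asserted rather than printed, reusing expandKeyGo, expandKeyAsm and ck exactly as they are called above; the test name TestExpandKeyCompare and the use of reflect.DeepEqual are illustrative assumptions.

package sm4

import (
	"reflect"
	"testing"
)

func TestExpandKeyCompare(t *testing.T) {
	// Fixed SM4 test-vector key, same value used in the patch above.
	key := []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10}
	encGo := make([]uint32, 32)
	decGo := make([]uint32, 32)
	encAsm := make([]uint32, 32)
	decAsm := make([]uint32, 32)

	// Reference key schedule in pure Go, then the ARM64 assembly version.
	expandKeyGo(key, encGo, decGo)
	expandKeyAsm(&key[0], &ck[0], &encAsm[0], &decAsm[0])

	if !reflect.DeepEqual(encGo, encAsm) {
		t.Errorf("encryption round keys: expected %v, got %v", encGo, encAsm)
	}
	if !reflect.DeepEqual(decGo, decAsm) {
		t.Errorf("decryption round keys: expected %v, got %v", decGo, decAsm)
	}
}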