diff --git a/sm3/sm3_test.go b/sm3/sm3_test.go index 0a8db68..ea49666 100644 --- a/sm3/sm3_test.go +++ b/sm3/sm3_test.go @@ -422,7 +422,8 @@ func TestKdf(t *testing.T) { {"sm3 case 5", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 128}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb"}, {"sm3 case 6", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 159}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e8"}, {"sm3 case 7", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 300}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a9"}, - {"sm3 case 8", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 516}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a93f26fd6c99111b3017db1d1e6025d28d88ed3a419eb9c72e4fa3267f19c806092fd80cb91079cc00cefc55db53ad840ed1e6384f4cf02d9f2ecbaed54391e7a6da71fca4ea53ccfdd4d85adf37e4be8af1324f43ee402f109ac6a77915fd7e248d3f14f3698dd0e8ea7ea27e4288b288d75b4343ec8ab3d0cd9491a146e1b6033c512399bcd1cb9568d4f10d582f145c3ad7aae4ace7a14ec0abf831edc5aabcf58a1fb05180fa6e79651aa8753ddbf3ca0877b9a9d745ae1729b253f61cfc726cba4c9113008187830e41d428ca223014c994f317998689"}, + {"sm3 case 8", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 416}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a93f26fd6c99111b3017db1d1e6025d28d88ed3a419eb9c72e4fa3267f19c806092fd80cb91079cc00cefc55db53ad840ed1e6384f4cf02d9f2ecbaed54391e7a6da71fca4ea53ccfdd4d85adf37e4be8af1324f43ee402f109ac6a77915fd7e248d3f14f3698dd0e8ea7ea27e4288b288d75b4343"}, + {"sm3 case 9", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 516}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a93f26fd6c99111b3017db1d1e6025d28d88ed3a419eb9c72e4fa3267f19c806092fd80cb91079cc00cefc55db53ad840ed1e6384f4cf02d9f2ecbaed54391e7a6da71fca4ea53ccfdd4d85adf37e4be8af1324f43ee402f109ac6a77915fd7e248d3f14f3698dd0e8ea7ea27e4288b288d75b4343ec8ab3d0cd9491a146e1b6033c512399bcd1cb9568d4f10d582f145c3ad7aae4ace7a14ec0abf831edc5aabcf58a1fb05180fa6e79651aa8753ddbf3ca0877b9a9d745ae1729b253f61cfc726cba4c9113008187830e41d428ca223014c994f317998689"}, } for _, tt := range tests { wantBytes, _ := hex.DecodeString(tt.want) diff --git a/sm3/sm3blocks_avx2_amd64.s b/sm3/sm3blocks_avx2_amd64.s index 3a0d49d..257b2d3 100644 --- a/sm3/sm3blocks_avx2_amd64.s +++ b/sm3/sm3blocks_avx2_amd64.s @@ -81,12 +81,6 @@ GLOBL r08_mask<>(SB), 8, $32 VPERM2I128 $0x31, r6, tmp4, r6; \ // r6 = [w62, w54, w46, w38, w30, w22, w14, w6] VPERM2I128 $0x31, r7, tmp3, r7; \ // r7 = [w63, w55, w47, w39, w31, w23, w15, w7] -// xorm (mem), reg -// xor reg to mem using reg-mem xor and store -#define xorm(P1, P2) \ - VPXOR P1, P2, P2; \ - VMOVDQU P2, P1 - // store 256 bits #define storeWord(W, j) VMOVDQU W, (256+(j)*32)(BX) // load 256 bits @@ -329,6 +323,7 @@ loop: prepare8Words(0) prepare8Words(1) + // Need to load state again due to YMM registers are used in prepare8Words loadState ROUND_00_11(0, a, b, c, d, e, f, g, h) @@ -398,15 +393,19 @@ loop: ROUND_16_63(62, c, d, e, f, g, h, a, b) ROUND_16_63(63, b, c, d, e, f, g, h, a) - xorm( 0(BX), a) - xorm( 32(BX), b) - xorm( 64(BX), c) - xorm( 96(BX), d) - xorm( 128(BX), e) - xorm( 160(BX), f) - xorm( 192(BX), g) - xorm(224(BX), h) + VPXOR (0*32)(BX), a, a + VPXOR (1*32)(BX), b, b + VPXOR (2*32)(BX), c, c + VPXOR (3*32)(BX), d, d + VPXOR (4*32)(BX), e, e + VPXOR (5*32)(BX), f, f + VPXOR (6*32)(BX), g, g + VPXOR (7*32)(BX), h, h + DECQ DX + JZ end + + saveState LEAQ 64(srcPtr1), srcPtr1 LEAQ 64(srcPtr2), srcPtr2 LEAQ 64(srcPtr3), srcPtr3 @@ -416,9 +415,9 @@ loop: LEAQ 64(srcPtr7), srcPtr7 LEAQ 64(srcPtr8), srcPtr8 - DECQ DX - JNZ loop + JMP loop +end: TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4) // save state diff --git a/sm3/sm3blocks_simd_amd64.s b/sm3/sm3blocks_simd_amd64.s index e14a92f..07f0ab0 100644 --- a/sm3/sm3blocks_simd_amd64.s +++ b/sm3/sm3blocks_simd_amd64.s @@ -79,13 +79,6 @@ GLOBL r08_mask<>(SB), 8, $16 MOVOU g, 96(BX) \ MOVOU h, 112(BX) -// xorm (mem), reg -// Xor reg to mem using reg-mem xor and store -#define xorm(P1, P2) \ - MOVOU P1, tmp1; \ - PXOR tmp1, P2; \ - MOVOU P2, P1 - #define storeWord(W, j) MOVOU W, (128+(j)*16)(BX) #define loadWord(W, i) MOVOU (128+(i)*16)(BX), W @@ -235,12 +228,6 @@ GLOBL r08_mask<>(SB), 8, $16 VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] -// avxXorm (mem), reg -// Xor reg to mem using reg-mem xor and store -#define avxXorm(P1, P2) \ - VPXOR P1, P2, P2; \ - VMOVDQU P2, P1 - #define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX) #define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W @@ -472,23 +459,34 @@ loop: ROUND_16_63(62, c, d, e, f, g, h, a, b) ROUND_16_63(63, b, c, d, e, f, g, h, a) - xorm( 0(BX), a) - xorm( 16(BX), b) - xorm( 32(BX), c) - xorm( 48(BX), d) - xorm( 64(BX), e) - xorm( 80(BX), f) - xorm( 96(BX), g) - xorm(112(BX), h) + MOVOU (0*16)(BX), tmp1 + PXOR tmp1, a + MOVOU (1*16)(BX), tmp1 + PXOR tmp1, b + MOVOU (2*16)(BX), tmp1 + PXOR tmp1, c + MOVOU (3*16)(BX), tmp1 + PXOR tmp1, d + MOVOU (4*16)(BX), tmp1 + PXOR tmp1, e + MOVOU (5*16)(BX), tmp1 + PXOR tmp1, f + MOVOU (6*16)(BX), tmp1 + PXOR tmp1, g + MOVOU (7*16)(BX), tmp1 + PXOR tmp1, h + DECQ DX + JZ end + + storeState LEAQ 64(R8), R8 LEAQ 64(R9), R9 LEAQ 64(R10), R10 LEAQ 64(R11), R11 + JMP loop - DECQ DX - JNZ loop - +end: // transpose state SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2) @@ -616,23 +614,35 @@ avxLoop: AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a) - avxXorm( 0(BX), a) - avxXorm( 16(BX), b) - avxXorm( 32(BX), c) - avxXorm( 48(BX), d) - avxXorm( 64(BX), e) - avxXorm( 80(BX), f) - avxXorm( 96(BX), g) - avxXorm(112(BX), h) + VPXOR (0*16)(BX), a, a + VPXOR (1*16)(BX), b, b + VPXOR (2*16)(BX), c, c + VPXOR (3*16)(BX), d, d + VPXOR (4*16)(BX), e, e + VPXOR (5*16)(BX), f, f + VPXOR (6*16)(BX), g, g + VPXOR (7*16)(BX), h, h + + DECQ DX + JZ avxEnd + + // store current state + VMOVDQU a, (0*16)(BX) + VMOVDQU b, (1*16)(BX) + VMOVDQU c, (2*16)(BX) + VMOVDQU d, (3*16)(BX) + VMOVDQU e, (4*16)(BX) + VMOVDQU f, (5*16)(BX) + VMOVDQU g, (6*16)(BX) + VMOVDQU h, (7*16)(BX) LEAQ 64(R8), R8 LEAQ 64(R9), R9 LEAQ 64(R10), R10 LEAQ 64(R11), R11 + JMP avxLoop - DECQ DX - JNZ avxLoop - +avxEnd: // transpose state TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)