kdf-sm3: add (8+4+1) test cases and reduce last round instructions

This commit is contained in:
Sun Yimin 2024-05-24 17:44:17 +08:00 committed by GitHub
parent 3ede319900
commit 238c0a3634
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 62 additions and 52 deletions

View File

@ -422,7 +422,8 @@ func TestKdf(t *testing.T) {
{"sm3 case 5", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 128}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb"}, {"sm3 case 5", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 128}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb"},
{"sm3 case 6", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 159}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e8"}, {"sm3 case 6", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 159}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e8"},
{"sm3 case 7", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 300}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a9"}, {"sm3 case 7", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 300}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a9"},
{"sm3 case 8", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 516}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a93f26fd6c99111b3017db1d1e6025d28d88ed3a419eb9c72e4fa3267f19c806092fd80cb91079cc00cefc55db53ad840ed1e6384f4cf02d9f2ecbaed54391e7a6da71fca4ea53ccfdd4d85adf37e4be8af1324f43ee402f109ac6a77915fd7e248d3f14f3698dd0e8ea7ea27e4288b288d75b4343ec8ab3d0cd9491a146e1b6033c512399bcd1cb9568d4f10d582f145c3ad7aae4ace7a14ec0abf831edc5aabcf58a1fb05180fa6e79651aa8753ddbf3ca0877b9a9d745ae1729b253f61cfc726cba4c9113008187830e41d428ca223014c994f317998689"}, {"sm3 case 8", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 416}, 
"49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a93f26fd6c99111b3017db1d1e6025d28d88ed3a419eb9c72e4fa3267f19c806092fd80cb91079cc00cefc55db53ad840ed1e6384f4cf02d9f2ecbaed54391e7a6da71fca4ea53ccfdd4d85adf37e4be8af1324f43ee402f109ac6a77915fd7e248d3f14f3698dd0e8ea7ea27e4288b288d75b4343"},
{"sm3 case 9", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 516}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a93f26fd6c99111b3017db1d1e6025d28d88ed3a419eb9c72e4fa3267f19c806092fd80cb91079cc00cefc55db53ad840ed1e6384f4cf02d9f2ecbaed54391e7a6da71fca4ea53ccfdd4d85adf37e4be8af1324f43ee402f109ac6a77915fd7e248d3f14f3698dd0e8ea7ea27e4288b288d75b4343ec8ab3d0cd9491a146e1b6033c512399bcd1cb9568d4f10d582f145c3ad7aae4ace7a14ec0abf831edc5aabcf58a1fb05180fa6e79651aa8753ddbf3ca0877b9a9d745ae1729b253f61cfc726cba4c9113008187830e41d428ca223014c994f317998689"},
} }
for _, tt := range tests { for _, tt := range tests {
wantBytes, _ := hex.DecodeString(tt.want) wantBytes, _ := hex.DecodeString(tt.want)

View File

@ -81,12 +81,6 @@ GLOBL r08_mask<>(SB), 8, $32
VPERM2I128 $0x31, r6, tmp4, r6; \ // r6 = [w62, w54, w46, w38, w30, w22, w14, w6] VPERM2I128 $0x31, r6, tmp4, r6; \ // r6 = [w62, w54, w46, w38, w30, w22, w14, w6]
VPERM2I128 $0x31, r7, tmp3, r7; \ // r7 = [w63, w55, w47, w39, w31, w23, w15, w7] VPERM2I128 $0x31, r7, tmp3, r7; \ // r7 = [w63, w55, w47, w39, w31, w23, w15, w7]
// xorm (mem), reg
// xor reg to mem using reg-mem xor and store
#define xorm(P1, P2) \
VPXOR P1, P2, P2; \
VMOVDQU P2, P1
// store 256 bits // store 256 bits
#define storeWord(W, j) VMOVDQU W, (256+(j)*32)(BX) #define storeWord(W, j) VMOVDQU W, (256+(j)*32)(BX)
// load 256 bits // load 256 bits
@ -329,6 +323,7 @@ loop:
prepare8Words(0) prepare8Words(0)
prepare8Words(1) prepare8Words(1)
// Need to load the state again because the YMM registers are used in prepare8Words
loadState loadState
ROUND_00_11(0, a, b, c, d, e, f, g, h) ROUND_00_11(0, a, b, c, d, e, f, g, h)
@ -398,15 +393,19 @@ loop:
ROUND_16_63(62, c, d, e, f, g, h, a, b) ROUND_16_63(62, c, d, e, f, g, h, a, b)
ROUND_16_63(63, b, c, d, e, f, g, h, a) ROUND_16_63(63, b, c, d, e, f, g, h, a)
xorm( 0(BX), a) VPXOR (0*32)(BX), a, a
xorm( 32(BX), b) VPXOR (1*32)(BX), b, b
xorm( 64(BX), c) VPXOR (2*32)(BX), c, c
xorm( 96(BX), d) VPXOR (3*32)(BX), d, d
xorm( 128(BX), e) VPXOR (4*32)(BX), e, e
xorm( 160(BX), f) VPXOR (5*32)(BX), f, f
xorm( 192(BX), g) VPXOR (6*32)(BX), g, g
xorm(224(BX), h) VPXOR (7*32)(BX), h, h
DECQ DX
JZ end
saveState
LEAQ 64(srcPtr1), srcPtr1 LEAQ 64(srcPtr1), srcPtr1
LEAQ 64(srcPtr2), srcPtr2 LEAQ 64(srcPtr2), srcPtr2
LEAQ 64(srcPtr3), srcPtr3 LEAQ 64(srcPtr3), srcPtr3
@ -416,9 +415,9 @@ loop:
LEAQ 64(srcPtr7), srcPtr7 LEAQ 64(srcPtr7), srcPtr7
LEAQ 64(srcPtr8), srcPtr8 LEAQ 64(srcPtr8), srcPtr8
DECQ DX JMP loop
JNZ loop
end:
TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4) TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)
// save state // save state

View File

@ -79,13 +79,6 @@ GLOBL r08_mask<>(SB), 8, $16
MOVOU g, 96(BX) \ MOVOU g, 96(BX) \
MOVOU h, 112(BX) MOVOU h, 112(BX)
// xorm (mem), reg
// Xor reg to mem using reg-mem xor and store
#define xorm(P1, P2) \
MOVOU P1, tmp1; \
PXOR tmp1, P2; \
MOVOU P2, P1
#define storeWord(W, j) MOVOU W, (128+(j)*16)(BX) #define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
#define loadWord(W, i) MOVOU (128+(i)*16)(BX), W #define loadWord(W, i) MOVOU (128+(i)*16)(BX), W
@ -235,12 +228,6 @@ GLOBL r08_mask<>(SB), 8, $16
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
// avxXorm (mem), reg
// Xor reg to mem using reg-mem xor and store
#define avxXorm(P1, P2) \
VPXOR P1, P2, P2; \
VMOVDQU P2, P1
#define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX) #define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W #define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W
@ -472,23 +459,34 @@ loop:
ROUND_16_63(62, c, d, e, f, g, h, a, b) ROUND_16_63(62, c, d, e, f, g, h, a, b)
ROUND_16_63(63, b, c, d, e, f, g, h, a) ROUND_16_63(63, b, c, d, e, f, g, h, a)
xorm( 0(BX), a) MOVOU (0*16)(BX), tmp1
xorm( 16(BX), b) PXOR tmp1, a
xorm( 32(BX), c) MOVOU (1*16)(BX), tmp1
xorm( 48(BX), d) PXOR tmp1, b
xorm( 64(BX), e) MOVOU (2*16)(BX), tmp1
xorm( 80(BX), f) PXOR tmp1, c
xorm( 96(BX), g) MOVOU (3*16)(BX), tmp1
xorm(112(BX), h) PXOR tmp1, d
MOVOU (4*16)(BX), tmp1
PXOR tmp1, e
MOVOU (5*16)(BX), tmp1
PXOR tmp1, f
MOVOU (6*16)(BX), tmp1
PXOR tmp1, g
MOVOU (7*16)(BX), tmp1
PXOR tmp1, h
DECQ DX
JZ end
storeState
LEAQ 64(R8), R8 LEAQ 64(R8), R8
LEAQ 64(R9), R9 LEAQ 64(R9), R9
LEAQ 64(R10), R10 LEAQ 64(R10), R10
LEAQ 64(R11), R11 LEAQ 64(R11), R11
JMP loop
DECQ DX end:
JNZ loop
// transpose state // transpose state
SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2) SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
@ -616,23 +614,35 @@ avxLoop:
AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b) AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a) AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a)
avxXorm( 0(BX), a) VPXOR (0*16)(BX), a, a
avxXorm( 16(BX), b) VPXOR (1*16)(BX), b, b
avxXorm( 32(BX), c) VPXOR (2*16)(BX), c, c
avxXorm( 48(BX), d) VPXOR (3*16)(BX), d, d
avxXorm( 64(BX), e) VPXOR (4*16)(BX), e, e
avxXorm( 80(BX), f) VPXOR (5*16)(BX), f, f
avxXorm( 96(BX), g) VPXOR (6*16)(BX), g, g
avxXorm(112(BX), h) VPXOR (7*16)(BX), h, h
DECQ DX
JZ avxEnd
// store current state
VMOVDQU a, (0*16)(BX)
VMOVDQU b, (1*16)(BX)
VMOVDQU c, (2*16)(BX)
VMOVDQU d, (3*16)(BX)
VMOVDQU e, (4*16)(BX)
VMOVDQU f, (5*16)(BX)
VMOVDQU g, (6*16)(BX)
VMOVDQU h, (7*16)(BX)
LEAQ 64(R8), R8 LEAQ 64(R8), R8
LEAQ 64(R9), R9 LEAQ 64(R9), R9
LEAQ 64(R10), R10 LEAQ 64(R10), R10
LEAQ 64(R11), R11 LEAQ 64(R11), R11
JMP avxLoop
DECQ DX avxEnd:
JNZ avxLoop
// transpose state // transpose state
TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2) TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)