mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-26 04:06:18 +08:00)
xor: use avx2 if possible

commit c3847c504f (parent 941b782448)
@@ -7,6 +7,10 @@
 package subtle
 
+import "golang.org/x/sys/cpu"
+
+var useAVX2 = cpu.X86.HasAVX2
+
 // XORBytes xors the bytes in a and b. The destination should have enough
 // space, otherwise XORBytes will panic. Returns the number of bytes xor'd.
 func XORBytes(dst, a, b []byte) int {
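The Go-side change follows the usual CPU-feature-detection pattern: a package-level flag initialized from golang.org/x/sys/cpu, which the assembly below reads via CMPB ·useAVX2(SB), $1. For reference, a minimal pure-Go sketch of what xorBytes computes (the name xorBytesGeneric is illustrative; the repository's actual generic fallback may differ):

	// xorBytesGeneric is an illustrative scalar fallback:
	// dst[i] = a[i] ^ b[i] for n = min(len(a), len(b));
	// dst must have room for n bytes or the writes panic.
	func xorBytesGeneric(dst, a, b []byte) int {
		n := len(a)
		if len(b) < n {
			n = len(b)
		}
		for i := 0; i < n; i++ {
			dst[i] = a[i] ^ b[i]
		}
		return n
	}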
@@ -13,6 +13,12 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVQ a+8(FP), SI
 	MOVQ b+16(FP), CX
 	MOVQ n+24(FP), DX
+	CMPQ DX, $32           // if len is less than 32, take the non-AVX2 path.
+	JL   non_avx2
+	CMPB ·useAVX2(SB), $1
+	JE   avx2
+
+non_avx2:
 	TESTQ $15, DX          // AND len with 15; if not zero, jump to not_aligned.
 	JNZ   not_aligned
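In Go terms, the new prologue amounts to a two-level dispatch, roughly as follows (the function names are illustrative, not from the repository):

	// Sketch of the dispatch added above: small inputs and non-AVX2
	// CPUs keep the existing SSE path; everything else takes AVX2.
	func xorDispatch(dst, a, b []byte, n int) {
		if n >= 32 && useAVX2 {
			xorAVX2(dst, a, b, n) // hypothetical: the avx2 label below
			return
		}
		xorSSE(dst, a, b, n) // hypothetical: the existing non_avx2 path
	}

Checking the length before the feature flag means inputs shorter than 32 bytes never pay for the AVX2 tail handling.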
@@ -55,3 +61,63 @@ not_aligned:
 
 ret:
 	RET
+
+avx2:
+	TESTQ $31, DX          // AND len with 31; if not zero, jump to avx2_not_aligned.
+	JNZ   avx2_not_aligned
+
+avx2_aligned:
+	TESTQ $16, DX          // AND len with 16; if zero, jump to loop32b_start.
+	JE    loop32b_start
+	SUBQ  $16, DX          // XOR 16 bytes backwards.
+	MOVOU (SI)(DX*1), X0
+	MOVOU (CX)(DX*1), X1
+	PXOR  X1, X0
+	MOVOU X0, (BX)(DX*1)
+	CMPQ  DX, $0           // if len is 0, return.
+	JE    avx2_ret
+
+loop32b_start:
+	MOVQ $0, AX            // position in slices
+
+loop32b:
+	VMOVDQU (SI)(AX*1), Y0 // XOR 32 bytes forwards.
+	VMOVDQU (CX)(AX*1), Y1
+	VPXOR   Y0, Y1, Y0
+	VMOVDQU Y0, (BX)(AX*1)
+	ADDQ    $32, AX
+	CMPQ    DX, AX
+	JNE     loop32b
+	VZEROUPPER
+	RET
+
+avx2_loop_1b:
+	SUBQ  $1, DX           // XOR 1 byte backwards.
+	MOVB  (SI)(DX*1), DI
+	MOVB  (CX)(DX*1), AX
+	XORB  AX, DI
+	MOVB  DI, (BX)(DX*1)
+	TESTQ $7, DX           // AND len with 7; if not zero, jump to avx2_loop_1b.
+	JNZ   avx2_loop_1b
+	CMPQ  DX, $0           // if len is 0, return.
+	JE    avx2_ret
+	TESTQ $15, DX          // AND len with 15; if zero, jump to avx2_aligned.
+	JZ    avx2_aligned
+
+avx2_not_aligned:
+	TESTQ $7, DX           // AND len with 7; if not zero, jump to avx2_loop_1b.
+	JNE   avx2_loop_1b
+	TESTQ $8, DX           // AND len with 8; if zero, jump to avx2_16b.
+	JE    avx2_16b
+	SUBQ  $8, DX           // XOR 8 bytes backwards.
+	MOVQ  (SI)(DX*1), DI
+	MOVQ  (CX)(DX*1), AX
+	XORQ  AX, DI
+	MOVQ  DI, (BX)(DX*1)
+avx2_16b:
+	CMPQ DX, $16           // if len is 16 or more here, it must be 16-byte aligned.
+	JGE  avx2_aligned
+
+avx2_ret:
+	VZEROUPPER
+	RET
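The AVX2 path trims the tail from the end of the buffers until the remaining length is a multiple of 32, then XORs forward in 32-byte YMM blocks. A Go model of that control flow (illustrative only; xorChunk is a hypothetical helper, not repository code):

	// Mirrors avx2_not_aligned/avx2_aligned: peel 1-byte, 8-byte and
	// 16-byte tails from the end, then run the forward 32-byte loop32b.
	func xorAVX2Model(dst, a, b []byte, n int) {
		for n&7 != 0 { // avx2_loop_1b: single bytes until n%8 == 0
			n--
			dst[n] = a[n] ^ b[n]
		}
		if n&8 != 0 { // the MOVQ/XORQ block: one 8-byte step
			n -= 8
			xorChunk(dst, a, b, n, 8)
		}
		if n&16 != 0 { // avx2_aligned's MOVOU/PXOR block: one 16-byte step
			n -= 16
			xorChunk(dst, a, b, n, 16)
		}
		for i := 0; i < n; i += 32 { // loop32b: VMOVDQU/VPXOR, 32 bytes at a time
			xorChunk(dst, a, b, i, 32)
		}
	}

	// xorChunk XORs width bytes starting at offset off.
	func xorChunk(dst, a, b []byte, off, width int) {
		for i := off; i < off+width; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}

The VZEROUPPER before each RET avoids the SSE/AVX state-transition penalty once the code has written the YMM registers.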
@@ -64,7 +64,7 @@ func BenchmarkXORBytes(b *testing.B) {
 	dst := make([]byte, 1<<15)
 	data0 := make([]byte, 1<<15)
 	data1 := make([]byte, 1<<15)
-	sizes := []int64{1 << 3, 1 << 7, 1 << 11, 1 << 15}
+	sizes := []int64{1 << 3, 1 << 4, 1 << 5, 1 << 7, 1 << 11, 1 << 15}
 	for _, size := range sizes {
 		b.Run(fmt.Sprintf("%dBytes", size), func(b *testing.B) {
 			s0 := data0[:size]
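The added sizes (1 << 4 and 1 << 5, i.e. 16 and 32 bytes) sit exactly on the SSE block size and the new AVX2 threshold, so the benchmark now exercises both boundaries. A before/after comparison would presumably run something like go test -bench BenchmarkXORBytes -benchmem in the package directory on an AVX2-capable machine.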