xor: use avx2 if possible

This commit is contained in:
emmansun 2023-05-13 10:26:44 +08:00
parent 941b782448
commit c3847c504f
3 changed files with 71 additions and 1 deletions

View File

@ -7,6 +7,10 @@
package subtle package subtle
import "golang.org/x/sys/cpu"
var useAVX2 = cpu.X86.HasAVX2
// XORBytes xors the bytes in a and b. The destination should have enough // XORBytes xors the bytes in a and b. The destination should have enough
// space, otherwise XORBytes will panic. Returns the number of bytes xor'd. // space, otherwise XORBytes will panic. Returns the number of bytes xor'd.
func XORBytes(dst, a, b []byte) int { func XORBytes(dst, a, b []byte) int {

View File

@ -13,6 +13,12 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
MOVQ a+8(FP), SI MOVQ a+8(FP), SI
MOVQ b+16(FP), CX MOVQ b+16(FP), CX
MOVQ n+24(FP), DX MOVQ n+24(FP), DX
CMPQ DX, $32 // if len less than 32, non avx2.
JL non_avx2
CMPB ·useAVX2(SB), $1
JE avx2
non_avx2:
TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned. TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
JNZ not_aligned JNZ not_aligned
@ -55,3 +61,63 @@ not_aligned:
ret: ret:
RET RET
// AVX2 path of xorBytes. The dispatch at the top of the function jumps here
// when len >= 32 and useAVX2 is 1.
// Registers set by the prologue: SI = a, CX = b, DX = n (remaining length).
// NOTE(review): BX is used as the destination base; its load is outside the
// visible hunk - presumably dst, confirm against the prologue.
// Strategy: XOR the tail backwards (1-byte, then 8-byte, then 16-byte steps),
// shrinking DX until it is a multiple of 32, then XOR the remaining prefix
// forwards in 32-byte chunks starting at offset 0.
avx2:
TESTQ $31, DX // len & 31: if not zero, len is not a multiple of 32, jump to avx2_not_aligned.
JNZ avx2_not_aligned
// Reached with DX a multiple of 16.
avx2_aligned:
TESTQ $16, DX // len & 16: if zero, DX is already a multiple of 32, jump to loop32b_start.
JE loop32b_start
SUBQ $16, DX // XOR the trailing 16 bytes (SSE registers), moving backwards.
MOVOU (SI)(DX*1), X0
MOVOU (CX)(DX*1), X1
PXOR X1, X0
MOVOU X0, (BX)(DX*1)
CMPQ DX, $0 // if nothing is left, return.
JE avx2_ret
// Here DX is a non-zero multiple of 32; the loop below relies on that to terminate.
loop32b_start:
MOVQ $0, AX // AX = forward offset into the slices.
loop32b:
VMOVDQU (SI)(AX*1), Y0 // XOR 32 bytes at offset AX, moving forwards.
VMOVDQU (CX)(AX*1), Y1
VPXOR Y0, Y1, Y0
VMOVDQU Y0, (BX)(AX*1)
ADDQ $32, AX
CMPQ DX, AX // repeat until the offset reaches the remaining length.
JNE loop32b
VZEROUPPER // zero the upper halves of the YMM registers before returning.
RET
avx2_loop_1b:
SUBQ $1, DX // XOR the last remaining byte, moving backwards.
MOVB (SI)(DX*1), DI
MOVB (CX)(DX*1), AX
XORB AX, DI
MOVB DI, (BX)(DX*1)
TESTQ $7, DX // len & 7: if not zero, keep going in avx2_loop_1b until len is a multiple of 8.
JNZ avx2_loop_1b
CMPQ DX, $0 // if nothing is left, return.
JE avx2_ret
TESTQ $15, DX // len & 15: if zero, len is a multiple of 16, jump to avx2_aligned.
JZ avx2_aligned
// Otherwise fall through: len is a multiple of 8 but not of 16.
avx2_not_aligned:
TESTQ $7, DX // len & 7: if not zero, trim single bytes in avx2_loop_1b first.
JNE avx2_loop_1b
TESTQ $8, DX // len & 8: if zero, len is already a multiple of 16, jump to avx2_16b.
JE avx2_16b
SUBQ $8, DX // XOR the trailing 8 bytes, moving backwards.
MOVQ (SI)(DX*1), DI
MOVQ (CX)(DX*1), AX
XORQ AX, DI
MOVQ DI, (BX)(DX*1)
avx2_16b:
CMPQ DX, $16 // DX is a multiple of 16 here; if at least 16 bytes remain, continue in avx2_aligned.
JGE avx2_aligned
// Common exit for this path.
avx2_ret:
VZEROUPPER // zero the upper halves of the YMM registers before returning.
RET

View File

@ -64,7 +64,7 @@ func BenchmarkXORBytes(b *testing.B) {
dst := make([]byte, 1<<15) dst := make([]byte, 1<<15)
data0 := make([]byte, 1<<15) data0 := make([]byte, 1<<15)
data1 := make([]byte, 1<<15) data1 := make([]byte, 1<<15)
sizes := []int64{1 << 3, 1 << 7, 1 << 11, 1 << 15} sizes := []int64{1 << 3, 1 << 4, 1 << 5, 1 << 7, 1 << 11, 1 << 15}
for _, size := range sizes { for _, size := range sizes {
b.Run(fmt.Sprintf("%dBytes", size), func(b *testing.B) { b.Run(fmt.Sprintf("%dBytes", size), func(b *testing.B) {
s0 := data0[:size] s0 := data0[:size]