internal/subtle: optimize amd64

Sun Yimin 2024-04-01 08:53:21 +08:00 committed by GitHub
parent df85a7f623
commit ef0d1a2fe0

@@ -24,6 +24,7 @@ non_avx2:
 aligned:
 	MOVQ $0, AX // position in slices
+	PCALIGN $16
 loop16b:
 	MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
 	MOVOU (CX)(AX*1), X1
@@ -34,6 +35,7 @@ loop16b:
 	JNE loop16b
 	RET
+	PCALIGN $16
 loop_1b:
 	SUBQ $1, DX // XOR 1byte backwards.
 	MOVB (SI)(DX*1), DI
@@ -62,34 +64,34 @@ ret:
 	RET
 avx2:
 	TESTQ $31, DX // AND 31 & len, if not zero jump to avx2_not_aligned.
 	JNZ avx2_not_aligned
-avx2_aligned:
+avx2_aligned: // input length = 16*n, where n is greater or equal 2.
 	TESTQ $16, DX // AND 16 & len, if zero jump to loop32b_start.
 	JE loop32b_start
 	SUBQ $16, DX // XOR 16bytes backwards.
-	MOVOU (SI)(DX*1), X0
-	MOVOU (CX)(DX*1), X1
-	PXOR X1, X0
-	MOVOU X0, (BX)(DX*1)
-	CMPQ DX, $0 // if len is 0, ret.
-	JE avx2_ret
+	VMOVDQU (SI)(DX*1), X0
+	VPXOR (CX)(DX*1), X0, X0
+	VMOVDQU X0, (BX)(DX*1)
 loop32b_start:
 	MOVQ $0, AX // position in slices
+	PCALIGN $32
 loop32b:
 	VMOVDQU (SI)(AX*1), Y0 // XOR 32byte forwards.
-	VMOVDQU (CX)(AX*1), Y1
-	VPXOR Y0, Y1, Y0
+	VPXOR (CX)(AX*1), Y0, Y0
 	VMOVDQU Y0, (BX)(AX*1)
 	ADDQ $32, AX
 	CMPQ DX, AX
 	JNE loop32b
-avx2_ret:
 	VZEROUPPER
 	RET
+	PCALIGN $16
 avx2_loop_1b:
 	SUBQ $1, DX // XOR 1byte backwards.
 	MOVB (SI)(DX*1), DI
@@ -98,25 +100,17 @@ avx2_loop_1b:
 	MOVB DI, (BX)(DX*1)
 	TESTQ $7, DX // AND 7 & len, if not zero jump to avx2_loop_1b.
 	JNZ avx2_loop_1b
-	CMPQ DX, $0 // if len is 0, ret.
-	JE avx2_ret
 	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
 	JZ avx2_aligned
 avx2_not_aligned:
 	TESTQ $7, DX // AND $7 & len, if not zero jump to avx2_loop_1b.
 	JNE avx2_loop_1b
-	TESTQ $8, DX // AND $8 & len, if zero jump to avx2_16b.
-	JE avx2_16b
+	TESTQ $8, DX // AND $8 & len, if zero jump to avx2_aligned.
+	JE avx2_aligned
 	SUBQ $8, DX // XOR 8bytes backwards.
 	MOVQ (SI)(DX*1), DI
 	MOVQ (CX)(DX*1), AX
 	XORQ AX, DI
 	MOVQ DI, (BX)(DX*1)
-avx2_16b:
-	CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
-	JGE avx2_aligned
-avx2_ret:
-	VZEROUPPER
-	RET
+	JMP avx2_aligned
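
For reference, below is a minimal pure-Go sketch of the byte-consumption order the reworked AVX2 path follows: trailing single bytes until the remaining length is a multiple of 8, then one optional 8-byte block, one optional 16-byte block at avx2_aligned, and finally the 32-byte loop from the front. The slice-based signature and the name xorBytesSketch are illustrative assumptions, not the routine this commit edits (that is the hand-written assembly above).

package main

import "fmt"

// xorBytesSketch mirrors, in plain Go, the order in which the reworked
// AVX2 path consumes the buffers. The name and slice-based interface are
// hypothetical; the real code is the assembly in the diff above.
func xorBytesSketch(dst, a, b []byte) {
	n := len(dst)

	// avx2_not_aligned / avx2_loop_1b: peel single bytes from the tail
	// until the remaining length is a multiple of 8.
	for n&7 != 0 {
		n--
		dst[n] = a[n] ^ b[n]
	}

	// One 8-byte block from the tail if the length is not a multiple of 16
	// (the assembly uses 64-bit MOVQ/XORQ for this).
	if n&8 != 0 {
		n -= 8
		for i := n; i < n+8; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}

	// avx2_aligned: one 16-byte block from the tail if the length is not a
	// multiple of 32 (VMOVDQU/VPXOR on an XMM register).
	if n&16 != 0 {
		n -= 16
		for i := n; i < n+16; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}

	// loop32b: the remaining length is a multiple of 32; XOR forwards
	// in 32-byte steps (VMOVDQU/VPXOR on YMM registers).
	for pos := 0; pos < n; pos += 32 {
		for i := pos; i < pos+32; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}
}

func main() {
	a := make([]byte, 45)
	b := make([]byte, 45)
	for i := range a {
		a[i] = byte(i)
		b[i] = byte(3 * i)
	}
	dst := make([]byte, 45)
	xorBytesSketch(dst, a, b)
	fmt.Println(dst[:8])
}

With the 8- and 16-byte remainders folded into the shared avx2_aligned entry, the separate avx2_16b and avx2_ret blocks and the CMPQ DX, $0 guards are no longer needed, which is why the diff deletes them.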