internal/subtle: optimize amd64

Repository: https://github.com/emmansun/gmsm.git
Commit: ef0d1a2fe0 (parent: df85a7f623)
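The commit touches the amd64 XOR kernel in internal/subtle and makes three kinds of changes: it inserts PCALIGN directives so hot loop entries land on 16- or 32-byte boundaries, it rewrites SSE MOVOU/PXOR sequences into VEX-encoded VMOVDQU/VPXOR forms that fold one load into the XOR, and it straightens the tail-handling control flow so all AVX2 exits share a single avx2_ret epilogue. As a point of reference, here is a minimal pure-Go sketch of what the assembly computes, assuming the routine XORs two equal-length byte slices into a destination; the function name is illustrative, not the package's API.

    package main

    import "fmt"

    // xorBytesGeneric is a scalar reference for the assembly kernel:
    // dst[i] = a[i] ^ b[i] for every index of dst.
    func xorBytesGeneric(dst, a, b []byte) {
        for i := range dst {
            dst[i] = a[i] ^ b[i]
        }
    }

    func main() {
        a := []byte{0x00, 0xff, 0x55}
        b := []byte{0xff, 0xff, 0xaa}
        dst := make([]byte, 3)
        xorBytesGeneric(dst, a, b)
        fmt.Printf("% x\n", dst) // ff 00 ff
    }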
@@ -24,6 +24,7 @@ non_avx2:
 aligned:
 	MOVQ $0, AX // position in slices
 
+	PCALIGN $16
 loop16b:
 	MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
 	MOVOU (CX)(AX*1), X1
@@ -34,6 +35,7 @@ loop16b:
 	JNE loop16b
 	RET
 
+	PCALIGN $16
 loop_1b:
 	SUBQ $1, DX // XOR 1byte backwards.
 	MOVB (SI)(DX*1), DI
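These two hunks only add PCALIGN directives: the Go assembler pads with NOPs so that loop16b and loop_1b begin on 16-byte boundaries, keeping each small loop body inside aligned fetch blocks. The loops themselves are unchanged; loop_1b peels the tail one byte at a time, walking backwards from the end of the slices. A standalone Go analogue of that backward byte loop, with an illustrative stopping mask (the assembly tests $7 or $15 depending on the path):

    package main

    import "fmt"

    // xorTailBackwards strips bytes off the end until the remaining
    // length reaches a convenient multiple, like loop_1b above.
    func xorTailBackwards(dst, a, b []byte, mask int) int {
        n := len(dst)
        for n > 0 && n&mask != 0 { // like TESTQ $7, DX / JNZ loop_1b
            n--
            dst[n] = a[n] ^ b[n]
        }
        return n // length still owed to the wider loops
    }

    func main() {
        a := []byte("abcdefghij") // 10 bytes: a 2-byte tail over an 8-byte body
        b := make([]byte, 10)
        dst := make([]byte, 10)
        fmt.Println(xorTailBackwards(dst, a, b, 7), dst)
    }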
@@ -62,34 +64,34 @@ ret:
 	RET
 
 avx2:
 	TESTQ $31, DX // AND 31 & len, if not zero jump to avx2_not_aligned.
 	JNZ avx2_not_aligned
 
-avx2_aligned:
+avx2_aligned: // input length = 16*n, where n is greater or equal 2.
 	TESTQ $16, DX // AND 16 & len, if zero jump to loop32b_start.
 	JE loop32b_start
 	SUBQ $16, DX // XOR 16bytes backwards.
-	MOVOU (SI)(DX*1), X0
-	MOVOU (CX)(DX*1), X1
-	PXOR X1, X0
-	MOVOU X0, (BX)(DX*1)
-	CMPQ DX, $0 // if len is 0, ret.
-	JE avx2_ret
+	VMOVDQU (SI)(DX*1), X0
+	VPXOR (CX)(DX*1), X0, X0
+	VMOVDQU X0, (BX)(DX*1)
+
 loop32b_start:
 	MOVQ $0, AX // position in slices
 
+	PCALIGN $32
 loop32b:
 	VMOVDQU (SI)(AX*1), Y0 // XOR 32byte forwards.
-	VMOVDQU (CX)(AX*1), Y1
-	VPXOR Y0, Y1, Y0
+	VPXOR (CX)(AX*1), Y0, Y0
 	VMOVDQU Y0, (BX)(AX*1)
 	ADDQ $32, AX
 	CMPQ DX, AX
 	JNE loop32b
 
+avx2_ret:
 	VZEROUPPER
 	RET
 
+	PCALIGN $16
 avx2_loop_1b:
 	SUBQ $1, DX // XOR 1byte backwards.
 	MOVB (SI)(DX*1), DI
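This hunk carries the main optimization. The SSE block that peels an odd 16-byte chunk loses one instruction: instead of loading both operands (two MOVOUs) and XORing registers (PXOR), the VEX-encoded VPXOR takes its second operand straight from memory. Staying in VEX forms also avoids mixing legacy SSE and AVX instructions, which can stall on some microarchitectures. The same load-folding is applied to the 32-byte loop, PCALIGN $32 aligns its entry, and the avx2_ret label moves up to the shared VZEROUPPER/RET epilogue. The old CMPQ DX, $0 guard can go away because, per the new comment, avx2_aligned is only entered with a length of 16*n for n >= 2, so at least 32 bytes remain after the peel. A sketch of the resulting flow in plain Go, where xorBlock stands in for the VMOVDQU/VPXOR/VMOVDQU triple:

    package main

    import "fmt"

    // xorBlock is a scalar stand-in for one vector-width XOR step.
    func xorBlock(dst, a, b []byte, w int) {
        for i := 0; i < w; i++ {
            dst[i] = a[i] ^ b[i]
        }
    }

    // xorAligned mirrors avx2_aligned, assuming len(dst) is a multiple
    // of 16 and at least 32 on entry.
    func xorAligned(dst, a, b []byte) {
        n := len(dst)
        if n&16 != 0 { // TESTQ $16, DX: odd multiple of 16
            n -= 16
            xorBlock(dst[n:], a[n:], b[n:], 16) // one 16-byte step backwards
        }
        for i := 0; i < n; i += 32 { // loop32b: forwards, 32 bytes per pass
            xorBlock(dst[i:], a[i:], b[i:], 32)
        }
    }

    func main() {
        a, b, dst := make([]byte, 48), make([]byte, 48), make([]byte, 48)
        for i := range a {
            a[i], b[i] = byte(i), 0x0f
        }
        xorAligned(dst, a, b)
        fmt.Printf("% x\n", dst)
    }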
@@ -98,25 +100,17 @@ avx2_loop_1b:
 	MOVB DI, (BX)(DX*1)
 	TESTQ $7, DX // AND 7 & len, if not zero jump to avx2_loop_1b.
 	JNZ avx2_loop_1b
-	CMPQ DX, $0 // if len is 0, ret.
-	JE avx2_ret
 	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
 	JZ avx2_aligned
 
 avx2_not_aligned:
 	TESTQ $7, DX // AND $7 & len, if not zero jump to avx2_loop_1b.
 	JNE avx2_loop_1b
-	TESTQ $8, DX // AND $8 & len, if zero jump to avx2_16b.
-	JE avx2_16b
+	TESTQ $8, DX // AND $8 & len, if zero jump to avx2_aligned.
+	JE avx2_aligned
 	SUBQ $8, DX // XOR 8bytes backwards.
 	MOVQ (SI)(DX*1), DI
 	MOVQ (CX)(DX*1), AX
 	XORQ AX, DI
 	MOVQ DI, (BX)(DX*1)
-avx2_16b:
-	CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
-	JGE avx2_aligned
-
-avx2_ret:
-	VZEROUPPER
-	RET
+	JMP avx2_aligned
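The last hunk retunes the unaligned tail path. The old code fell through an avx2_16b label into a CMPQ/JGE pair and kept a private avx2_ret epilogue; the new code jumps straight back to avx2_aligned after the 8-byte step, since the remaining length is always a multiple of 16 at that point, and reuses the single epilogue hoisted after loop32b. Folded into plain Go, the tail dispatch looks roughly like this (scalar loops stand in for the vector paths; names are illustrative):

    package main

    import "fmt"

    // xorWide plays the role of avx2_aligned: XOR the first n bytes.
    func xorWide(dst, a, b []byte, n int) {
        for i := 0; i < n; i++ { // stand-in for the 16/32-byte vector loops
            dst[i] = a[i] ^ b[i]
        }
    }

    // xorNotAligned mirrors avx2_loop_1b / avx2_not_aligned.
    func xorNotAligned(dst, a, b []byte) {
        n := len(dst)
        for n&7 != 0 { // avx2_loop_1b: bytes backwards until len%8 == 0
            n--
            dst[n] = a[n] ^ b[n]
        }
        if n&8 != 0 { // one 8-byte block backwards (MOVQ/XORQ/MOVQ)
            n -= 8
            for j := n; j < n+8; j++ {
                dst[j] = a[j] ^ b[j]
            }
        }
        xorWide(dst, a, b, n) // JMP avx2_aligned: len is now a multiple of 16
    }

    func main() {
        a, b, dst := make([]byte, 43), make([]byte, 43), make([]byte, 43)
        for i := range a { // 43 = 32 + 8 + 3 exercises every tail step
            a[i], b[i] = byte(i), 0xff
        }
        xorNotAligned(dst, a, b)
        fmt.Printf("% x\n", dst)
    }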