mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-27 20:56:18 +08:00
internal/sm2ec: use ADX for mul/sqr and internal sqr
This commit is contained in:
parent
51fc24c704
commit
ee7af1bda3
@ -109,65 +109,65 @@ sqrLoop:
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
sqrBMI2:
|
sqrBMI2:
|
||||||
// y[1:] * y[0]
|
XORQ acc0, acc0
|
||||||
|
XORQ y_ptr, y_ptr
|
||||||
|
// x[1:] * x[0]
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADCQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADOXQ y_ptr, acc4
|
||||||
|
|
||||||
// y[2:] * y[1]
|
// x[2:] * x[1]
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, t1
|
MULXQ (8*2)(x_ptr), AX, t1
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ t1, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc5
|
MULXQ (8*3)(x_ptr), AX, acc5
|
||||||
ADCQ $0, acc5
|
ADCXQ t1, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
|
ADCXQ y_ptr, acc5
|
||||||
|
|
||||||
// y[3] * y[2]
|
// x[3] * x[2]
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
|
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||||
|
ADOXQ AX, acc5
|
||||||
|
ADOXQ acc0, y_ptr
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
|
||||||
ADCQ AX, acc5
|
|
||||||
ADCQ $0, y_ptr
|
|
||||||
XORQ t1, t1
|
XORQ t1, t1
|
||||||
|
|
||||||
// *2
|
// *2
|
||||||
ADDQ acc1, acc1
|
ADOXQ acc1, acc1
|
||||||
ADCQ acc2, acc2
|
ADOXQ acc2, acc2
|
||||||
ADCQ acc3, acc3
|
ADOXQ acc3, acc3
|
||||||
ADCQ acc4, acc4
|
ADOXQ acc4, acc4
|
||||||
ADCQ acc5, acc5
|
ADOXQ acc5, acc5
|
||||||
ADCQ y_ptr, y_ptr
|
ADOXQ y_ptr, y_ptr
|
||||||
ADCQ $0, t1
|
ADOXQ acc0, t1
|
||||||
|
|
||||||
// Missing products
|
// Missing products
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ DX, acc0, t0
|
MULXQ DX, acc0, t0
|
||||||
ADDQ t0, acc1
|
ADCXQ t0, acc1
|
||||||
|
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
ADCQ t0, acc3
|
ADCXQ t0, acc3
|
||||||
|
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc4
|
ADCXQ AX, acc4
|
||||||
ADCQ t0, acc5
|
ADCXQ t0, acc5
|
||||||
|
|
||||||
MOVQ (8*3)(x_ptr), DX
|
MOVQ (8*3)(x_ptr), DX
|
||||||
MULXQ DX, AX, x_ptr
|
MULXQ DX, AX, x_ptr
|
||||||
ADCQ AX, y_ptr
|
ADCXQ AX, y_ptr
|
||||||
ADCQ t1, x_ptr
|
ADCXQ t1, x_ptr
|
||||||
|
|
||||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
p256SqrMontReduce()
|
p256SqrMontReduce()
|
||||||
@ -901,11 +901,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
|||||||
MULQ acc6
|
MULQ acc6
|
||||||
ADDQ mul0, acc3
|
ADDQ mul0, acc3
|
||||||
ADCQ $0, mul1
|
ADCQ $0, mul1
|
||||||
MOVQ mul1, hlp
|
MOVQ mul1, acc0
|
||||||
|
|
||||||
MOVQ acc5, mul0
|
MOVQ acc5, mul0
|
||||||
MULQ acc7
|
MULQ acc7
|
||||||
ADDQ hlp, t0
|
ADDQ acc0, t0
|
||||||
ADCQ $0, mul1
|
ADCQ $0, mul1
|
||||||
ADDQ mul0, t0
|
ADDQ mul0, t0
|
||||||
ADCQ $0, mul1
|
ADCQ $0, mul1
|
||||||
@ -955,39 +955,42 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
internalSqrBMI2:
|
internalSqrBMI2:
|
||||||
|
XORQ acc0, acc0
|
||||||
|
XORQ t2, t2
|
||||||
MOVQ acc4, mul1
|
MOVQ acc4, mul1
|
||||||
MULXQ acc5, acc1, acc2
|
MULXQ acc5, acc1, acc2
|
||||||
|
|
||||||
MULXQ acc6, mul0, acc3
|
MULXQ acc6, mul0, acc3
|
||||||
ADDQ mul0, acc2
|
ADOXQ mul0, acc2
|
||||||
|
|
||||||
MULXQ acc7, mul0, t0
|
MULXQ acc7, mul0, t0
|
||||||
ADCQ mul0, acc3
|
ADOXQ mul0, acc3
|
||||||
ADCQ $0, t0
|
ADOXQ t2, t0
|
||||||
|
|
||||||
MOVQ acc5, mul1
|
MOVQ acc5, mul1
|
||||||
MULXQ acc6, mul0, hlp
|
MULXQ acc6, mul0, t3
|
||||||
ADDQ mul0, acc3
|
ADOXQ mul0, acc3
|
||||||
ADCQ hlp, t0
|
|
||||||
|
|
||||||
MULXQ acc7, mul0, t1
|
MULXQ acc7, mul0, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t3, mul0
|
||||||
ADDQ mul0, t0
|
ADOXQ mul0, t0
|
||||||
|
ADCXQ t2, t1
|
||||||
|
|
||||||
MOVQ acc6, mul1
|
MOVQ acc6, mul1
|
||||||
MULXQ acc7, mul0, t2
|
MULXQ acc7, mul0, t2
|
||||||
ADCQ mul0, t1
|
ADOXQ mul0, t1
|
||||||
ADCQ $0, t2
|
ADOXQ acc0, t2
|
||||||
|
|
||||||
XORQ t3, t3
|
XORQ t3, t3
|
||||||
|
|
||||||
// *2
|
// *2
|
||||||
ADDQ acc1, acc1
|
ADOXQ acc1, acc1
|
||||||
ADCQ acc2, acc2
|
ADOXQ acc2, acc2
|
||||||
ADCQ acc3, acc3
|
ADOXQ acc3, acc3
|
||||||
ADCQ t0, t0
|
ADOXQ t0, t0
|
||||||
ADCQ t1, t1
|
ADOXQ t1, t1
|
||||||
ADCQ t2, t2
|
ADOXQ t2, t2
|
||||||
ADCQ $0, t3
|
ADOXQ acc0, t3
|
||||||
|
|
||||||
// Missing products
|
// Missing products
|
||||||
MOVQ acc4, mul1
|
MOVQ acc4, mul1
|
||||||
@ -996,18 +999,18 @@ internalSqrBMI2:
|
|||||||
|
|
||||||
MOVQ acc5, mul1
|
MOVQ acc5, mul1
|
||||||
MULXQ mul1, mul0, acc4
|
MULXQ mul1, mul0, acc4
|
||||||
ADCQ mul0, acc2
|
ADCXQ mul0, acc2
|
||||||
ADCQ acc4, acc3
|
ADCXQ acc4, acc3
|
||||||
|
|
||||||
MOVQ acc6, mul1
|
MOVQ acc6, mul1
|
||||||
MULXQ mul1, mul0, acc4
|
MULXQ mul1, mul0, acc4
|
||||||
ADCQ mul0, t0
|
ADCXQ mul0, t0
|
||||||
ADCQ acc4, t1
|
ADCXQ acc4, t1
|
||||||
|
|
||||||
MOVQ acc7, mul1
|
MOVQ acc7, mul1
|
||||||
MULXQ mul1, mul0, acc4
|
MULXQ mul1, mul0, acc4
|
||||||
ADCQ mul0, t2
|
ADCXQ mul0, t2
|
||||||
ADCQ acc4, t3
|
ADCXQ acc4, t3
|
||||||
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||||
sm2P256SqrReductionInternal()
|
sm2P256SqrReductionInternal()
|
||||||
|
|
||||||
|
@ -168,7 +168,6 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
|
|||||||
/* ---------------------------------------*/
|
/* ---------------------------------------*/
|
||||||
// func p256Mul(res, in1, in2 *p256Element)
|
// func p256Mul(res, in1, in2 *p256Element)
|
||||||
TEXT ·p256Mul(SB),NOSPLIT,$0
|
TEXT ·p256Mul(SB),NOSPLIT,$0
|
||||||
MOVQ res+0(FP), res_ptr
|
|
||||||
MOVQ in1+8(FP), x_ptr
|
MOVQ in1+8(FP), x_ptr
|
||||||
MOVQ in2+16(FP), y_ptr
|
MOVQ in2+16(FP), y_ptr
|
||||||
|
|
||||||
@ -372,25 +371,27 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
|
|||||||
ADCQ acc3, acc1
|
ADCQ acc3, acc1
|
||||||
ADCQ $0, acc2
|
ADCQ $0, acc2
|
||||||
|
|
||||||
|
MOVQ res+0(FP), res_ptr
|
||||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
mulBMI2:
|
mulBMI2:
|
||||||
|
XORQ acc5, acc5
|
||||||
|
XORQ res_ptr, res_ptr
|
||||||
// x * y[0]
|
// x * y[0]
|
||||||
MOVQ (8*0)(y_ptr), DX
|
MOVQ (8*0)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), acc0, acc1
|
MULXQ (8*0)(x_ptr), acc0, acc1
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, acc2
|
MULXQ (8*1)(x_ptr), AX, acc2
|
||||||
ADDQ AX, acc1
|
ADCXQ AX, acc1
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADCQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADCQ AX, acc3
|
ADCXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADCXQ acc5, acc4
|
||||||
|
|
||||||
XORQ acc5, acc5
|
|
||||||
// First reduction step
|
// First reduction step
|
||||||
MOVQ acc0, AX
|
MOVQ acc0, AX
|
||||||
MOVQ acc0, DX
|
MOVQ acc0, DX
|
||||||
@ -403,34 +404,32 @@ mulBMI2:
|
|||||||
MOVQ acc0, AX
|
MOVQ acc0, AX
|
||||||
SBBQ DX, acc0
|
SBBQ DX, acc0
|
||||||
|
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ $0, acc2
|
ADOXQ res_ptr, acc2
|
||||||
ADCQ $0, acc3
|
ADOXQ res_ptr, acc3
|
||||||
ADCQ acc0, acc4
|
ADOXQ acc0, acc4
|
||||||
ADCQ $0, acc5
|
ADOXQ res_ptr, acc5
|
||||||
|
|
||||||
XORQ acc0, acc0
|
XORQ acc0, acc0
|
||||||
// x * y[1]
|
// x * y[1]
|
||||||
MOVQ (8*1)(y_ptr), DX
|
MOVQ (8*1)(y_ptr), DX
|
||||||
MULXQ (8*0)(x_ptr), AX, BX
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
ADDQ AX, acc1
|
ADOXQ AX, acc1
|
||||||
ADCQ BX, acc2
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, BX
|
MULXQ (8*1)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ BX, acc3
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, BX
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, BX
|
MULXQ (8*3)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ BX, acc5
|
ADCXQ acc0, BX
|
||||||
ADCQ $0, acc0
|
ADOXQ BX, acc5
|
||||||
|
ADOXQ res_ptr, acc0
|
||||||
|
|
||||||
// Second reduction step
|
// Second reduction step
|
||||||
MOVQ acc1, AX
|
MOVQ acc1, AX
|
||||||
@ -444,35 +443,33 @@ mulBMI2:
|
|||||||
MOVQ acc1, AX
|
MOVQ acc1, AX
|
||||||
SBBQ DX, acc1
|
SBBQ DX, acc1
|
||||||
|
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
ADCQ $0, acc3
|
ADOXQ res_ptr, acc3
|
||||||
ADCQ $0, acc4
|
ADOXQ res_ptr, acc4
|
||||||
ADCQ acc1, acc5
|
ADOXQ acc1, acc5
|
||||||
ADCQ $0, acc0
|
ADOXQ res_ptr, acc0
|
||||||
|
|
||||||
XORQ acc1, acc1
|
XORQ acc1, acc1
|
||||||
// x * y[2]
|
// x * y[2]
|
||||||
MOVQ (8*2)(y_ptr), DX
|
MOVQ (8*2)(y_ptr), DX
|
||||||
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
MULXQ (8*0)(x_ptr), AX, BX
|
ADOXQ AX, acc2
|
||||||
ADDQ AX, acc2
|
|
||||||
ADCQ BX, acc3
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, BX
|
MULXQ (8*1)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, BX
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ BX, acc5
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, BX
|
MULXQ (8*3)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ BX, acc0
|
ADCXQ res_ptr, BX
|
||||||
ADCQ $0, acc1
|
ADOXQ BX, acc0
|
||||||
|
ADOXQ res_ptr, acc1
|
||||||
|
|
||||||
// Third reduction step
|
// Third reduction step
|
||||||
MOVQ acc2, AX
|
MOVQ acc2, AX
|
||||||
MOVQ acc2, DX
|
MOVQ acc2, DX
|
||||||
@ -485,35 +482,33 @@ mulBMI2:
|
|||||||
MOVQ acc2, AX
|
MOVQ acc2, AX
|
||||||
SBBQ DX, acc2
|
SBBQ DX, acc2
|
||||||
|
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADOXQ res_ptr, acc4
|
||||||
ADCQ $0, acc5
|
ADOXQ res_ptr, acc5
|
||||||
ADCQ acc2, acc0
|
ADOXQ acc2, acc0
|
||||||
ADCQ $0, acc1
|
ADOXQ res_ptr, acc1
|
||||||
|
|
||||||
XORQ acc2, acc2
|
XORQ acc2, acc2
|
||||||
// x * y[3]
|
// x * y[3]
|
||||||
MOVQ (8*3)(y_ptr), DX
|
MOVQ (8*3)(y_ptr), DX
|
||||||
|
MULXQ (8*0)(x_ptr), AX, t0
|
||||||
MULXQ (8*0)(x_ptr), AX, BX
|
ADOXQ AX, acc3
|
||||||
ADDQ AX, acc3
|
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), AX, BX
|
MULXQ (8*1)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ BX, acc5
|
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, t0
|
||||||
ADCQ $0, BX
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ BX, acc0
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, BX
|
MULXQ (8*3)(x_ptr), AX, BX
|
||||||
ADCQ $0, BX
|
ADCXQ t0, AX
|
||||||
ADDQ AX, acc0
|
ADOXQ AX, acc0
|
||||||
ADCQ BX, acc1
|
ADCXQ res_ptr, BX
|
||||||
ADCQ $0, acc2
|
ADOXQ BX, acc1
|
||||||
|
ADOXQ res_ptr, acc2
|
||||||
|
|
||||||
// Last reduction step
|
// Last reduction step
|
||||||
MOVQ acc3, AX
|
MOVQ acc3, AX
|
||||||
MOVQ acc3, DX
|
MOVQ acc3, DX
|
||||||
@ -526,12 +521,13 @@ mulBMI2:
|
|||||||
MOVQ acc3, AX
|
MOVQ acc3, AX
|
||||||
SBBQ DX, acc3
|
SBBQ DX, acc3
|
||||||
|
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
ADCQ $0, acc5
|
ADOXQ res_ptr, acc5
|
||||||
ADCQ $0, acc0
|
ADOXQ res_ptr, acc0
|
||||||
ADCQ acc3, acc1
|
ADOXQ acc3, acc1
|
||||||
ADCQ $0, acc2
|
ADOXQ res_ptr, acc2
|
||||||
|
|
||||||
|
MOVQ res+0(FP), res_ptr
|
||||||
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
@ -109,65 +109,64 @@ sqrLoop:
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
sqrBMI2:
|
sqrBMI2:
|
||||||
// y[1:] * y[0]
|
XORQ acc0, acc0
|
||||||
|
XORQ y_ptr, y_ptr
|
||||||
|
// x[1:] * x[0]
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
|
|
||||||
MULXQ (8*1)(x_ptr), acc1, acc2
|
MULXQ (8*1)(x_ptr), acc1, acc2
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, acc3
|
MULXQ (8*2)(x_ptr), AX, acc3
|
||||||
ADDQ AX, acc2
|
ADOXQ AX, acc2
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc4
|
MULXQ (8*3)(x_ptr), AX, acc4
|
||||||
ADCQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ $0, acc4
|
ADOXQ y_ptr, acc4
|
||||||
|
|
||||||
// y[2:] * y[1]
|
// x[2:] * x[1]
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
|
|
||||||
MULXQ (8*2)(x_ptr), AX, BX
|
MULXQ (8*2)(x_ptr), AX, BX
|
||||||
ADDQ AX, acc3
|
ADOXQ AX, acc3
|
||||||
ADCQ BX, acc4
|
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, acc5
|
MULXQ (8*3)(x_ptr), AX, acc5
|
||||||
ADCQ $0, acc5
|
ADCXQ BX, AX
|
||||||
ADDQ AX, acc4
|
ADOXQ AX, acc4
|
||||||
|
ADCXQ y_ptr, acc5
|
||||||
|
|
||||||
// y[3] * y[2]
|
// x[3] * x[2]
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
|
|
||||||
MULXQ (8*3)(x_ptr), AX, y_ptr
|
MULXQ (8*3)(x_ptr), AX, y_ptr
|
||||||
ADCQ AX, acc5
|
ADOXQ AX, acc5
|
||||||
ADCQ $0, y_ptr
|
ADOXQ acc0, y_ptr
|
||||||
XORQ BX, BX
|
XORQ BX, BX
|
||||||
|
|
||||||
// *2
|
// *2
|
||||||
ADDQ acc1, acc1
|
ADOXQ acc1, acc1
|
||||||
ADCQ acc2, acc2
|
ADOXQ acc2, acc2
|
||||||
ADCQ acc3, acc3
|
ADOXQ acc3, acc3
|
||||||
ADCQ acc4, acc4
|
ADOXQ acc4, acc4
|
||||||
ADCQ acc5, acc5
|
ADOXQ acc5, acc5
|
||||||
ADCQ y_ptr, y_ptr
|
ADOXQ y_ptr, y_ptr
|
||||||
ADCQ $0, BX
|
ADOXQ acc0, BX
|
||||||
|
|
||||||
// Missing products
|
// Missing products
|
||||||
MOVQ (8*0)(x_ptr), DX
|
MOVQ (8*0)(x_ptr), DX
|
||||||
MULXQ DX, acc0, t0
|
MULXQ DX, acc0, t0
|
||||||
ADDQ t0, acc1
|
ADCXQ t0, acc1
|
||||||
|
|
||||||
MOVQ (8*1)(x_ptr), DX
|
MOVQ (8*1)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc2
|
ADCXQ AX, acc2
|
||||||
ADCQ t0, acc3
|
ADCXQ t0, acc3
|
||||||
|
|
||||||
MOVQ (8*2)(x_ptr), DX
|
MOVQ (8*2)(x_ptr), DX
|
||||||
MULXQ DX, AX, t0
|
MULXQ DX, AX, t0
|
||||||
ADCQ AX, acc4
|
ADCXQ AX, acc4
|
||||||
ADCQ t0, acc5
|
ADCXQ t0, acc5
|
||||||
|
|
||||||
MOVQ (8*3)(x_ptr), DX
|
MOVQ (8*3)(x_ptr), DX
|
||||||
MULXQ DX, AX, x_ptr
|
MULXQ DX, AX, x_ptr
|
||||||
ADCQ AX, y_ptr
|
ADCXQ AX, y_ptr
|
||||||
ADCQ BX, x_ptr
|
ADCXQ BX, x_ptr
|
||||||
|
|
||||||
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
|
||||||
p256SqrMontReduce()
|
p256SqrMontReduce()
|
||||||
@ -931,65 +930,68 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
|
|||||||
MOVQ acc4, mul0
|
MOVQ acc4, mul0
|
||||||
MULQ mul0
|
MULQ mul0
|
||||||
MOVQ mul0, acc0
|
MOVQ mul0, acc0
|
||||||
MOVQ DX, acc4
|
MOVQ mul1, acc4
|
||||||
|
|
||||||
MOVQ acc5, mul0
|
MOVQ acc5, mul0
|
||||||
MULQ mul0
|
MULQ mul0
|
||||||
ADDQ acc4, acc1
|
ADDQ acc4, acc1
|
||||||
ADCQ mul0, acc2
|
ADCQ mul0, acc2
|
||||||
ADCQ $0, DX
|
ADCQ $0, mul1
|
||||||
MOVQ DX, acc4
|
MOVQ mul1, acc4
|
||||||
|
|
||||||
MOVQ acc6, mul0
|
MOVQ acc6, mul0
|
||||||
MULQ mul0
|
MULQ mul0
|
||||||
ADDQ acc4, acc3
|
ADDQ acc4, acc3
|
||||||
ADCQ mul0, t0
|
ADCQ mul0, t0
|
||||||
ADCQ $0, DX
|
ADCQ $0, mul1
|
||||||
MOVQ DX, acc4
|
MOVQ mul1, acc4
|
||||||
|
|
||||||
MOVQ acc7, mul0
|
MOVQ acc7, mul0
|
||||||
MULQ mul0
|
MULQ mul0
|
||||||
ADDQ acc4, t1
|
ADDQ acc4, t1
|
||||||
ADCQ mul0, t2
|
ADCQ mul0, t2
|
||||||
ADCQ DX, t3
|
ADCQ mul1, t3
|
||||||
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||||
sm2P256SqrReductionInternal()
|
sm2P256SqrReductionInternal()
|
||||||
RET
|
RET
|
||||||
|
|
||||||
internalSqrBMI2:
|
internalSqrBMI2:
|
||||||
|
XORQ acc0, acc0
|
||||||
|
XORQ t2, t2
|
||||||
MOVQ acc4, mul1
|
MOVQ acc4, mul1
|
||||||
MULXQ acc5, acc1, acc2
|
MULXQ acc5, acc1, acc2
|
||||||
|
|
||||||
MULXQ acc6, mul0, acc3
|
MULXQ acc6, mul0, acc3
|
||||||
ADDQ mul0, acc2
|
ADOXQ mul0, acc2
|
||||||
|
|
||||||
MULXQ acc7, mul0, t0
|
MULXQ acc7, mul0, t0
|
||||||
ADCQ mul0, acc3
|
ADOXQ mul0, acc3
|
||||||
ADCQ $0, t0
|
ADOXQ t2, t0
|
||||||
|
|
||||||
MOVQ acc5, mul1
|
MOVQ acc5, mul1
|
||||||
MULXQ acc6, mul0, acc0
|
MULXQ acc6, mul0, t3
|
||||||
ADDQ mul0, acc3
|
ADOXQ mul0, acc3
|
||||||
ADCQ acc0, t0
|
|
||||||
|
|
||||||
MULXQ acc7, mul0, t1
|
MULXQ acc7, mul0, t1
|
||||||
ADCQ $0, t1
|
ADCXQ t3, mul0
|
||||||
ADDQ mul0, t0
|
ADOXQ mul0, t0
|
||||||
|
ADCXQ t2, t1
|
||||||
|
|
||||||
MOVQ acc6, mul1
|
MOVQ acc6, mul1
|
||||||
MULXQ acc7, mul0, t2
|
MULXQ acc7, mul0, t2
|
||||||
ADCQ mul0, t1
|
ADOXQ mul0, t1
|
||||||
ADCQ $0, t2
|
ADOXQ acc0, t2
|
||||||
|
|
||||||
XORQ t3, t3
|
XORQ t3, t3
|
||||||
|
|
||||||
// *2
|
// *2
|
||||||
ADDQ acc1, acc1
|
ADOXQ acc1, acc1
|
||||||
ADCQ acc2, acc2
|
ADOXQ acc2, acc2
|
||||||
ADCQ acc3, acc3
|
ADOXQ acc3, acc3
|
||||||
ADCQ t0, t0
|
ADOXQ t0, t0
|
||||||
ADCQ t1, t1
|
ADOXQ t1, t1
|
||||||
ADCQ t2, t2
|
ADOXQ t2, t2
|
||||||
ADCQ $0, t3
|
ADOXQ acc0, t3
|
||||||
|
|
||||||
// Missing products
|
// Missing products
|
||||||
MOVQ acc4, mul1
|
MOVQ acc4, mul1
|
||||||
@ -998,18 +1000,18 @@ internalSqrBMI2:
|
|||||||
|
|
||||||
MOVQ acc5, mul1
|
MOVQ acc5, mul1
|
||||||
MULXQ mul1, mul0, acc4
|
MULXQ mul1, mul0, acc4
|
||||||
ADCQ mul0, acc2
|
ADCXQ mul0, acc2
|
||||||
ADCQ acc4, acc3
|
ADCXQ acc4, acc3
|
||||||
|
|
||||||
MOVQ acc6, mul1
|
MOVQ acc6, mul1
|
||||||
MULXQ mul1, mul0, acc4
|
MULXQ mul1, mul0, acc4
|
||||||
ADCQ mul0, t0
|
ADCXQ mul0, t0
|
||||||
ADCQ acc4, t1
|
ADCXQ acc4, t1
|
||||||
|
|
||||||
MOVQ acc7, mul1
|
MOVQ acc7, mul1
|
||||||
MULXQ mul1, mul0, acc4
|
MULXQ mul1, mul0, acc4
|
||||||
ADCQ mul0, t2
|
ADCXQ mul0, t2
|
||||||
ADCQ acc4, t3
|
ADCXQ acc4, t3
|
||||||
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
|
||||||
sm2P256SqrReductionInternal()
|
sm2P256SqrReductionInternal()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user