internal/sm2ec: use ADX for mul/sqr and internal sqr

This commit is contained in:
Sun Yimin 2024-02-27 17:44:30 +08:00 committed by GitHub
parent 51fc24c704
commit ee7af1bda3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 185 additions and 184 deletions

View File

@ -109,65 +109,65 @@ sqrLoop:
RET RET
sqrBMI2: sqrBMI2:
// y[1:] * y[0] XORQ acc0, acc0
XORQ y_ptr, y_ptr
// x[1:] * x[0]
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2 MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2 ADOXQ AX, acc2
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADCQ AX, acc3 ADOXQ AX, acc3
ADCQ $0, acc4 ADOXQ y_ptr, acc4
// y[2:] * y[1] // x[2:] * x[1]
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, t1 MULXQ (8*2)(x_ptr), AX, t1
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ t1, acc4
MULXQ (8*3)(x_ptr), AX, acc5 MULXQ (8*3)(x_ptr), AX, acc5
ADCQ $0, acc5 ADCXQ t1, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCXQ y_ptr, acc5
// y[3] * y[2] // x[3] * x[2]
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr
ADOXQ AX, acc5
ADOXQ acc0, y_ptr
MULXQ (8*3)(x_ptr), AX, y_ptr
ADCQ AX, acc5
ADCQ $0, y_ptr
XORQ t1, t1 XORQ t1, t1
// *2 // *2
ADDQ acc1, acc1 ADOXQ acc1, acc1
ADCQ acc2, acc2 ADOXQ acc2, acc2
ADCQ acc3, acc3 ADOXQ acc3, acc3
ADCQ acc4, acc4 ADOXQ acc4, acc4
ADCQ acc5, acc5 ADOXQ acc5, acc5
ADCQ y_ptr, y_ptr ADOXQ y_ptr, y_ptr
ADCQ $0, t1 ADOXQ acc0, t1
// Missing products // Missing products
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0 MULXQ DX, acc0, t0
ADDQ t0, acc1 ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc2 ADCXQ AX, acc2
ADCQ t0, acc3 ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc4 ADCXQ AX, acc4
ADCQ t0, acc5 ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr MULXQ DX, AX, x_ptr
ADCQ AX, y_ptr ADCXQ AX, y_ptr
ADCQ t1, x_ptr ADCXQ t1, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduce() p256SqrMontReduce()
@ -901,11 +901,11 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
MULQ acc6 MULQ acc6
ADDQ mul0, acc3 ADDQ mul0, acc3
ADCQ $0, mul1 ADCQ $0, mul1
MOVQ mul1, hlp MOVQ mul1, acc0
MOVQ acc5, mul0 MOVQ acc5, mul0
MULQ acc7 MULQ acc7
ADDQ hlp, t0 ADDQ acc0, t0
ADCQ $0, mul1 ADCQ $0, mul1
ADDQ mul0, t0 ADDQ mul0, t0
ADCQ $0, mul1 ADCQ $0, mul1
@ -955,39 +955,42 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
RET RET
internalSqrBMI2: internalSqrBMI2:
XORQ acc0, acc0
XORQ t2, t2
MOVQ acc4, mul1 MOVQ acc4, mul1
MULXQ acc5, acc1, acc2 MULXQ acc5, acc1, acc2
MULXQ acc6, mul0, acc3 MULXQ acc6, mul0, acc3
ADDQ mul0, acc2 ADOXQ mul0, acc2
MULXQ acc7, mul0, t0 MULXQ acc7, mul0, t0
ADCQ mul0, acc3 ADOXQ mul0, acc3
ADCQ $0, t0 ADOXQ t2, t0
MOVQ acc5, mul1 MOVQ acc5, mul1
MULXQ acc6, mul0, hlp MULXQ acc6, mul0, t3
ADDQ mul0, acc3 ADOXQ mul0, acc3
ADCQ hlp, t0
MULXQ acc7, mul0, t1 MULXQ acc7, mul0, t1
ADCQ $0, t1 ADCXQ t3, mul0
ADDQ mul0, t0 ADOXQ mul0, t0
ADCXQ t2, t1
MOVQ acc6, mul1 MOVQ acc6, mul1
MULXQ acc7, mul0, t2 MULXQ acc7, mul0, t2
ADCQ mul0, t1 ADOXQ mul0, t1
ADCQ $0, t2 ADOXQ acc0, t2
XORQ t3, t3 XORQ t3, t3
// *2 // *2
ADDQ acc1, acc1 ADOXQ acc1, acc1
ADCQ acc2, acc2 ADOXQ acc2, acc2
ADCQ acc3, acc3 ADOXQ acc3, acc3
ADCQ t0, t0 ADOXQ t0, t0
ADCQ t1, t1 ADOXQ t1, t1
ADCQ t2, t2 ADOXQ t2, t2
ADCQ $0, t3 ADOXQ acc0, t3
// Missing products // Missing products
MOVQ acc4, mul1 MOVQ acc4, mul1
@ -996,18 +999,18 @@ internalSqrBMI2:
MOVQ acc5, mul1 MOVQ acc5, mul1
MULXQ mul1, mul0, acc4 MULXQ mul1, mul0, acc4
ADCQ mul0, acc2 ADCXQ mul0, acc2
ADCQ acc4, acc3 ADCXQ acc4, acc3
MOVQ acc6, mul1 MOVQ acc6, mul1
MULXQ mul1, mul0, acc4 MULXQ mul1, mul0, acc4
ADCQ mul0, t0 ADCXQ mul0, t0
ADCQ acc4, t1 ADCXQ acc4, t1
MOVQ acc7, mul1 MOVQ acc7, mul1
MULXQ mul1, mul0, acc4 MULXQ mul1, mul0, acc4
ADCQ mul0, t2 ADCXQ mul0, t2
ADCQ acc4, t3 ADCXQ acc4, t3
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal() sm2P256SqrReductionInternal()

View File

@ -168,7 +168,6 @@ TEXT ·p256NegCond(SB),NOSPLIT,$0
/* ---------------------------------------*/ /* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element) // func p256Mul(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB),NOSPLIT,$0 TEXT ·p256Mul(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in1+8(FP), x_ptr MOVQ in1+8(FP), x_ptr
MOVQ in2+16(FP), y_ptr MOVQ in2+16(FP), y_ptr
@ -372,25 +371,27 @@ TEXT ·p256Mul(SB),NOSPLIT,$0
ADCQ acc3, acc1 ADCQ acc3, acc1
ADCQ $0, acc2 ADCQ $0, acc2
MOVQ res+0(FP), res_ptr
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET RET
mulBMI2: mulBMI2:
XORQ acc5, acc5
XORQ res_ptr, res_ptr
// x * y[0] // x * y[0]
MOVQ (8*0)(y_ptr), DX MOVQ (8*0)(y_ptr), DX
MULXQ (8*0)(x_ptr), acc0, acc1 MULXQ (8*0)(x_ptr), acc0, acc1
MULXQ (8*1)(x_ptr), AX, acc2 MULXQ (8*1)(x_ptr), AX, acc2
ADDQ AX, acc1 ADCXQ AX, acc1
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADCQ AX, acc2 ADCXQ AX, acc2
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADCQ AX, acc3 ADCXQ AX, acc3
ADCQ $0, acc4 ADCXQ acc5, acc4
XORQ acc5, acc5
// First reduction step // First reduction step
MOVQ acc0, AX MOVQ acc0, AX
MOVQ acc0, DX MOVQ acc0, DX
@ -403,34 +404,32 @@ mulBMI2:
MOVQ acc0, AX MOVQ acc0, AX
SBBQ DX, acc0 SBBQ DX, acc0
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ $0, acc2 ADOXQ res_ptr, acc2
ADCQ $0, acc3 ADOXQ res_ptr, acc3
ADCQ acc0, acc4 ADOXQ acc0, acc4
ADCQ $0, acc5 ADOXQ res_ptr, acc5
XORQ acc0, acc0 XORQ acc0, acc0
// x * y[1] // x * y[1]
MOVQ (8*1)(y_ptr), DX MOVQ (8*1)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, BX MULXQ (8*0)(x_ptr), AX, t0
ADDQ AX, acc1 ADOXQ AX, acc1
ADCQ BX, acc2
MULXQ (8*1)(x_ptr), AX, BX MULXQ (8*1)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ BX, acc3
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, BX ADCXQ BX, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
MULXQ (8*3)(x_ptr), AX, BX MULXQ (8*3)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ BX, acc5 ADCXQ acc0, BX
ADCQ $0, acc0 ADOXQ BX, acc5
ADOXQ res_ptr, acc0
// Second reduction step // Second reduction step
MOVQ acc1, AX MOVQ acc1, AX
@ -444,35 +443,33 @@ mulBMI2:
MOVQ acc1, AX MOVQ acc1, AX
SBBQ DX, acc1 SBBQ DX, acc1
ADDQ AX, acc2 ADOXQ AX, acc2
ADCQ $0, acc3 ADOXQ res_ptr, acc3
ADCQ $0, acc4 ADOXQ res_ptr, acc4
ADCQ acc1, acc5 ADOXQ acc1, acc5
ADCQ $0, acc0 ADOXQ res_ptr, acc0
XORQ acc1, acc1 XORQ acc1, acc1
// x * y[2] // x * y[2]
MOVQ (8*2)(y_ptr), DX MOVQ (8*2)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t0
MULXQ (8*0)(x_ptr), AX, BX ADOXQ AX, acc2
ADDQ AX, acc2
ADCQ BX, acc3
MULXQ (8*1)(x_ptr), AX, BX MULXQ (8*1)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, BX ADCXQ BX, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ BX, acc5
MULXQ (8*3)(x_ptr), AX, BX MULXQ (8*3)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ BX, acc0 ADCXQ res_ptr, BX
ADCQ $0, acc1 ADOXQ BX, acc0
ADOXQ res_ptr, acc1
// Third reduction step // Third reduction step
MOVQ acc2, AX MOVQ acc2, AX
MOVQ acc2, DX MOVQ acc2, DX
@ -485,35 +482,33 @@ mulBMI2:
MOVQ acc2, AX MOVQ acc2, AX
SBBQ DX, acc2 SBBQ DX, acc2
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ $0, acc4 ADOXQ res_ptr, acc4
ADCQ $0, acc5 ADOXQ res_ptr, acc5
ADCQ acc2, acc0 ADOXQ acc2, acc0
ADCQ $0, acc1 ADOXQ res_ptr, acc1
XORQ acc2, acc2 XORQ acc2, acc2
// x * y[3] // x * y[3]
MOVQ (8*3)(y_ptr), DX MOVQ (8*3)(y_ptr), DX
MULXQ (8*0)(x_ptr), AX, t0
MULXQ (8*0)(x_ptr), AX, BX ADOXQ AX, acc3
ADDQ AX, acc3
ADCQ BX, acc4
MULXQ (8*1)(x_ptr), AX, BX MULXQ (8*1)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ BX, acc5
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, t0
ADCQ $0, BX ADCXQ BX, AX
ADDQ AX, acc5 ADOXQ AX, acc5
ADCQ BX, acc0
MULXQ (8*3)(x_ptr), AX, BX MULXQ (8*3)(x_ptr), AX, BX
ADCQ $0, BX ADCXQ t0, AX
ADDQ AX, acc0 ADOXQ AX, acc0
ADCQ BX, acc1 ADCXQ res_ptr, BX
ADCQ $0, acc2 ADOXQ BX, acc1
ADOXQ res_ptr, acc2
// Last reduction step // Last reduction step
MOVQ acc3, AX MOVQ acc3, AX
MOVQ acc3, DX MOVQ acc3, DX
@ -526,12 +521,13 @@ mulBMI2:
MOVQ acc3, AX MOVQ acc3, AX
SBBQ DX, acc3 SBBQ DX, acc3
ADDQ AX, acc4 ADOXQ AX, acc4
ADCQ $0, acc5 ADOXQ res_ptr, acc5
ADCQ $0, acc0 ADOXQ res_ptr, acc0
ADCQ acc3, acc1 ADOXQ acc3, acc1
ADCQ $0, acc2 ADOXQ res_ptr, acc2
MOVQ res+0(FP), res_ptr
p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr) p256PrimReduce(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, BX, res_ptr)
RET RET

View File

@ -109,65 +109,64 @@ sqrLoop:
RET RET
sqrBMI2: sqrBMI2:
// y[1:] * y[0] XORQ acc0, acc0
XORQ y_ptr, y_ptr
// x[1:] * x[0]
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ (8*1)(x_ptr), acc1, acc2 MULXQ (8*1)(x_ptr), acc1, acc2
MULXQ (8*2)(x_ptr), AX, acc3 MULXQ (8*2)(x_ptr), AX, acc3
ADDQ AX, acc2 ADOXQ AX, acc2
MULXQ (8*3)(x_ptr), AX, acc4 MULXQ (8*3)(x_ptr), AX, acc4
ADCQ AX, acc3 ADOXQ AX, acc3
ADCQ $0, acc4 ADOXQ y_ptr, acc4
// y[2:] * y[1] // x[2:] * x[1]
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ (8*2)(x_ptr), AX, BX MULXQ (8*2)(x_ptr), AX, BX
ADDQ AX, acc3 ADOXQ AX, acc3
ADCQ BX, acc4
MULXQ (8*3)(x_ptr), AX, acc5 MULXQ (8*3)(x_ptr), AX, acc5
ADCQ $0, acc5 ADCXQ BX, AX
ADDQ AX, acc4 ADOXQ AX, acc4
ADCXQ y_ptr, acc5
// y[3] * y[2] // x[3] * x[2]
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ (8*3)(x_ptr), AX, y_ptr MULXQ (8*3)(x_ptr), AX, y_ptr
ADCQ AX, acc5 ADOXQ AX, acc5
ADCQ $0, y_ptr ADOXQ acc0, y_ptr
XORQ BX, BX XORQ BX, BX
// *2 // *2
ADDQ acc1, acc1 ADOXQ acc1, acc1
ADCQ acc2, acc2 ADOXQ acc2, acc2
ADCQ acc3, acc3 ADOXQ acc3, acc3
ADCQ acc4, acc4 ADOXQ acc4, acc4
ADCQ acc5, acc5 ADOXQ acc5, acc5
ADCQ y_ptr, y_ptr ADOXQ y_ptr, y_ptr
ADCQ $0, BX ADOXQ acc0, BX
// Missing products // Missing products
MOVQ (8*0)(x_ptr), DX MOVQ (8*0)(x_ptr), DX
MULXQ DX, acc0, t0 MULXQ DX, acc0, t0
ADDQ t0, acc1 ADCXQ t0, acc1
MOVQ (8*1)(x_ptr), DX MOVQ (8*1)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc2 ADCXQ AX, acc2
ADCQ t0, acc3 ADCXQ t0, acc3
MOVQ (8*2)(x_ptr), DX MOVQ (8*2)(x_ptr), DX
MULXQ DX, AX, t0 MULXQ DX, AX, t0
ADCQ AX, acc4 ADCXQ AX, acc4
ADCQ t0, acc5 ADCXQ t0, acc5
MOVQ (8*3)(x_ptr), DX MOVQ (8*3)(x_ptr), DX
MULXQ DX, AX, x_ptr MULXQ DX, AX, x_ptr
ADCQ AX, y_ptr ADCXQ AX, y_ptr
ADCQ BX, x_ptr ADCXQ BX, x_ptr
// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0] // T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
p256SqrMontReduce() p256SqrMontReduce()
@ -931,65 +930,68 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
MOVQ acc4, mul0 MOVQ acc4, mul0
MULQ mul0 MULQ mul0
MOVQ mul0, acc0 MOVQ mul0, acc0
MOVQ DX, acc4 MOVQ mul1, acc4
MOVQ acc5, mul0 MOVQ acc5, mul0
MULQ mul0 MULQ mul0
ADDQ acc4, acc1 ADDQ acc4, acc1
ADCQ mul0, acc2 ADCQ mul0, acc2
ADCQ $0, DX ADCQ $0, mul1
MOVQ DX, acc4 MOVQ mul1, acc4
MOVQ acc6, mul0 MOVQ acc6, mul0
MULQ mul0 MULQ mul0
ADDQ acc4, acc3 ADDQ acc4, acc3
ADCQ mul0, t0 ADCQ mul0, t0
ADCQ $0, DX ADCQ $0, mul1
MOVQ DX, acc4 MOVQ mul1, acc4
MOVQ acc7, mul0 MOVQ acc7, mul0
MULQ mul0 MULQ mul0
ADDQ acc4, t1 ADDQ acc4, t1
ADCQ mul0, t2 ADCQ mul0, t2
ADCQ DX, t3 ADCQ mul1, t3
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal() sm2P256SqrReductionInternal()
RET RET
internalSqrBMI2: internalSqrBMI2:
XORQ acc0, acc0
XORQ t2, t2
MOVQ acc4, mul1 MOVQ acc4, mul1
MULXQ acc5, acc1, acc2 MULXQ acc5, acc1, acc2
MULXQ acc6, mul0, acc3 MULXQ acc6, mul0, acc3
ADDQ mul0, acc2 ADOXQ mul0, acc2
MULXQ acc7, mul0, t0 MULXQ acc7, mul0, t0
ADCQ mul0, acc3 ADOXQ mul0, acc3
ADCQ $0, t0 ADOXQ t2, t0
MOVQ acc5, mul1 MOVQ acc5, mul1
MULXQ acc6, mul0, acc0 MULXQ acc6, mul0, t3
ADDQ mul0, acc3 ADOXQ mul0, acc3
ADCQ acc0, t0
MULXQ acc7, mul0, t1 MULXQ acc7, mul0, t1
ADCQ $0, t1 ADCXQ t3, mul0
ADDQ mul0, t0 ADOXQ mul0, t0
ADCXQ t2, t1
MOVQ acc6, mul1 MOVQ acc6, mul1
MULXQ acc7, mul0, t2 MULXQ acc7, mul0, t2
ADCQ mul0, t1 ADOXQ mul0, t1
ADCQ $0, t2 ADOXQ acc0, t2
XORQ t3, t3 XORQ t3, t3
// *2 // *2
ADDQ acc1, acc1 ADOXQ acc1, acc1
ADCQ acc2, acc2 ADOXQ acc2, acc2
ADCQ acc3, acc3 ADOXQ acc3, acc3
ADCQ t0, t0 ADOXQ t0, t0
ADCQ t1, t1 ADOXQ t1, t1
ADCQ t2, t2 ADOXQ t2, t2
ADCQ $0, t3 ADOXQ acc0, t3
// Missing products // Missing products
MOVQ acc4, mul1 MOVQ acc4, mul1
@ -998,18 +1000,18 @@ internalSqrBMI2:
MOVQ acc5, mul1 MOVQ acc5, mul1
MULXQ mul1, mul0, acc4 MULXQ mul1, mul0, acc4
ADCQ mul0, acc2 ADCXQ mul0, acc2
ADCQ acc4, acc3 ADCXQ acc4, acc3
MOVQ acc6, mul1 MOVQ acc6, mul1
MULXQ mul1, mul0, acc4 MULXQ mul1, mul0, acc4
ADCQ mul0, t0 ADCXQ mul0, t0
ADCQ acc4, t1 ADCXQ acc4, t1
MOVQ acc7, mul1 MOVQ acc7, mul1
MULXQ mul1, mul0, acc4 MULXQ mul1, mul0, acc4
ADCQ mul0, t2 ADCXQ mul0, t2
ADCQ acc4, t3 ADCXQ acc4, t3
// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0] // T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
sm2P256SqrReductionInternal() sm2P256SqrReductionInternal()