diff --git a/internal/sm2ec/p256_plugin_amd64.s b/internal/sm2ec/p256_plugin_amd64.s
index f1731a8..b6df6e0 100644
--- a/internal/sm2ec/p256_plugin_amd64.s
+++ b/internal/sm2ec/p256_plugin_amd64.s
@@ -13,16 +13,15 @@
 
 /* ---------------------------------------*/
 // func p256Sqr(res, in *p256Element, n int)
-TEXT ·p256Sqr(SB),NOSPLIT,$8-24
+TEXT ·p256Sqr(SB),NOSPLIT,$8-24 // frame must stay non-zero: only then does the assembler save/restore BP
 	MOVQ res+0(FP), res_ptr
 	MOVQ in+8(FP), x_ptr
-	MOVQ n+16(FP), BX
+	MOVQ n+16(FP), BP // n = remaining squarings; kept in BP because BX is handed to the reduce macro
 	
 	CMPB ·supportBMI2+0(SB), $0x01
 	JEQ  sqrBMI2
 
 sqrLoop:
-	MOVQ BX, (SP)
 	// y[1:] * y[0]
 	MOVQ (8*0)(x_ptr), t0
 
@@ -106,13 +105,11 @@ sqrLoop:
 	p256SqrMontReduce()
 	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
 	MOVQ res_ptr, x_ptr
-	MOVQ (SP), BX  
-	DECQ BX
+	DECQ BP
 	JNE  sqrLoop
 	RET
 	
 sqrBMI2:
-	MOVQ BX, (SP)
 	// y[1:] * y[0]
 	MOVQ (8*0)(x_ptr), DX
 
@@ -177,23 +174,21 @@ sqrBMI2:
 	p256SqrMontReduce()
 	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
 	MOVQ res_ptr, x_ptr            
-	MOVQ (SP), BX  
-	DECQ BX
+	DECQ BP
 	JNE  sqrBMI2
 	RET
 
 /* ---------------------------------------*/
 // func p256OrdSqr(res, in *p256OrdElement, n int)
-TEXT ·p256OrdSqr(SB),NOSPLIT,$8-24
+TEXT ·p256OrdSqr(SB),NOSPLIT,$8-24 // frame must stay non-zero: only then does the assembler save/restore BP
 	MOVQ res+0(FP), res_ptr
 	MOVQ in+8(FP), x_ptr
-	MOVQ n+16(FP), BX
+	MOVQ n+16(FP), BP // n = remaining squarings; kept in BP because BX is handed to the reduce macro
 
 	CMPB ·supportBMI2+0(SB), $0x01
 	JEQ  ordSqrLoopBMI2
 
 ordSqrLoop:
-	MOVQ BX, (SP)
 	// y[1:] * y[0]
 	MOVQ (8*0)(x_ptr), t0
 
@@ -406,14 +401,12 @@ ordSqrLoop:
 
 	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
 	MOVQ res_ptr, x_ptr
-	MOVQ (SP), BX
-	DECQ BX
+	DECQ BP
 	JNE ordSqrLoop
 
 	RET
 
 ordSqrLoopBMI2:
-	MOVQ BX, (SP)
 	// y[1:] * y[0]
 	MOVQ (8*0)(x_ptr), DX
 	MULXQ (8*1)(x_ptr), acc1, acc2 
@@ -587,8 +580,7 @@ ordSqrLoopBMI2:
 
 	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, BX, res_ptr)
 	MOVQ res_ptr, x_ptr
-	MOVQ (SP), BX
-	DECQ BX
+	DECQ BP
 	JNE ordSqrLoopBMI2
 
 	RET
diff --git a/sm9/bn256/gfp_plugin_amd64.s b/sm9/bn256/gfp_plugin_amd64.s
index cce02b2..5ca3ed4 100644
--- a/sm9/bn256/gfp_plugin_amd64.s
+++ b/sm9/bn256/gfp_plugin_amd64.s
@@ -5,16 +5,15 @@
 #include "gfp_macros_amd64.s"
 
 // func gfpSqr(res, in *gfP, n int)
-TEXT ·gfpSqr(SB),NOSPLIT,$24-8
+TEXT ·gfpSqr(SB),NOSPLIT,$8-24 // 24 bytes of args; non-zero frame so the assembler saves/restores BP
 	MOVQ res+0(FP), res_ptr
 	MOVQ in+8(FP), x_ptr
-	MOVQ n+16(FP), BX
+	MOVQ n+16(FP), BP // n = remaining squarings; kept in BP because BX is handed to gfpCarry
 
 	CMPB ·supportADX(SB), $0
 	JE   gfpSqrLoop
 
 gfpSqrLoopAdx:
-	MOVQ BX, (SP)
 	XORQ acc0, acc0
 	XORQ y_ptr, y_ptr
 	// y[1:] * y[0]
@@ -174,14 +173,12 @@ gfpSqrLoopAdx:
 	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
 
 	MOVQ res_ptr, x_ptr
-	MOVQ (SP), BX
-	DECQ BX
+	DECQ BP
 	JNE gfpSqrLoopAdx
 
 	RET
 
 gfpSqrLoop:
-	MOVQ BX, (SP)
 	// y[1:] * y[0]
 	MOVQ (8*0)(x_ptr), t0
 
@@ -416,8 +413,7 @@ gfpSqrLoop:
 	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,BX,t0)
 	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
 	MOVQ res_ptr, x_ptr
-	MOVQ (SP), BX
-	DECQ BX
+	DECQ BP
 	JNE gfpSqrLoop
 
 	RET