diff --git a/sm9/bn256/gfp_decl.go b/sm9/bn256/gfp_decl.go index e91d7ba..330add5 100644 --- a/sm9/bn256/gfp_decl.go +++ b/sm9/bn256/gfp_decl.go @@ -1,4 +1,4 @@ -//go:build (amd64 || arm64) && !purego +//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego package bn256 diff --git a/sm9/bn256/gfp_generic.go b/sm9/bn256/gfp_generic.go index 5ea9c17..5c39a9c 100644 --- a/sm9/bn256/gfp_generic.go +++ b/sm9/bn256/gfp_generic.go @@ -1,4 +1,4 @@ -//go:build purego || !(amd64 || arm64) +//go:build purego || !(amd64 || arm64 || ppc64 || ppc64le) package bn256 diff --git a/sm9/bn256/gfp_ppc64x.go b/sm9/bn256/gfp_ppc64x.go deleted file mode 100644 index da2504f..0000000 --- a/sm9/bn256/gfp_ppc64x.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2024 Sun Yimin. All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -//go:build (ppc64 || ppc64le) && !purego - -package bn256 - -// Set c = p - a, if c == p, then c = 0 -// It seems this function's performance is worse than gfpSub with zero. -// -//go:noescape -func gfpNegAsm(c, a *gfP) - -// Set c = a + b, if c >= p, then c = c - p -// -//go:noescape -func gfpAddAsm(c, a, b *gfP) - -// Set c = a + a -// -//go:noescape -func gfpDoubleAsm(c, a *gfP) - -// Set c = a + a + a -// -//go:noescape -func gfpTripleAsm(c, a *gfP) - -// Set c = a - b, if c is negative, then c = c + p -// -//go:noescape -func gfpSubAsm(c, a, b *gfP) - -// Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p. -// -//go:noescape -func gfpMulAsm(c, a, b *gfP) - -// Montgomery square, repeated n times (n >= 1). -// -//go:noescape -func gfpSqrAsm(res, in *gfP, n int) - -// Marshal gfP into big endian form -// -//go:noescape -func gfpMarshalAsm(out *[32]byte, in *gfP) - -// Unmarshal the bytes into little endian form -// -//go:noescape -func gfpUnmarshalAsm(out *gfP, in *[32]byte) diff --git a/sm9/bn256/gfp_ppc64x.s b/sm9/bn256/gfp_ppc64x.s index ebbd0d4..86eed9c 100644 --- a/sm9/bn256/gfp_ppc64x.s +++ b/sm9/bn256/gfp_ppc64x.s @@ -7,19 +7,20 @@ #include "textflag.h" //func gfpUnmarshal(out *gfP, in *[32]byte) -TEXT ·gfpUnmarshalAsm(SB), NOSPLIT, $0-16 +TEXT ·gfpUnmarshal(SB), NOSPLIT, $0-16 MOVD res+0(FP), R3 MOVD in+8(FP), R4 BR gfpInternalEndianSwap<>(SB) // func gfpMarshal(out *[32]byte, in *gfP) -TEXT ·gfpMarshalAsm(SB), NOSPLIT, $0-16 +TEXT ·gfpMarshal(SB), NOSPLIT, $0-16 MOVD res+0(FP), R3 MOVD in+8(FP), R4 BR gfpInternalEndianSwap<>(SB) TEXT gfpInternalEndianSwap<>(SB), NOSPLIT, $0-0 // Index registers needed for BR movs +#ifdef GOARCH_ppc64le MOVD $8, R9 MOVD $16, R10 MOVD $24, R14 @@ -33,7 +34,17 @@ TEXT gfpInternalEndianSwap<>(SB), NOSPLIT, $0-0 MOVD R7, 8(R3) MOVD R6, 16(R3) MOVD R5, 24(R3) +#else + MOVD $16, R10 + LXVD2X (R4)(R0), V0 + LXVD2X (R4)(R10), V1 + XXPERMDI V0, V0, $2, V0 + XXPERMDI V1, V1, $2, V1 + + STXVD2X V1, (R0+R3) + STXVD2X V0, (R10+R3) +#endif RET #define X1L V0 @@ -70,7 +81,7 @@ TEXT gfpInternalEndianSwap<>(SB), NOSPLIT, $0-0 VSEL TT0, T0, SEL1, T0 \ VSEL TT1, T1, SEL1, T1 \ -TEXT ·gfpNegAsm(SB),0,$0-16 +TEXT ·gfpNeg(SB),0,$0-16 MOVD c+0(FP), R3 MOVD a+8(FP), R4 @@ -98,7 +109,7 @@ TEXT ·gfpNegAsm(SB),0,$0-16 STXVD2X T1, (R5+R3) RET -TEXT ·gfpSubAsm(SB),0,$0-24 +TEXT ·gfpSub(SB),0,$0-24 MOVD c+0(FP), R3 MOVD a+8(FP), R4 MOVD b+16(FP), R5 @@ -144,7 +155,7 @@ TEXT ·gfpSubAsm(SB),0,$0-24 VSEL TT0, T0, SEL1, T0 \ VSEL TT1, T1, SEL1, T1 -TEXT ·gfpAddAsm(SB),0,$0-24 +TEXT ·gfpAdd(SB),0,$0-24 MOVD c+0(FP), R3 MOVD a+8(FP), R4 MOVD b+16(FP), R5 @@ -177,7 +188,7 @@ TEXT ·gfpAddAsm(SB),0,$0-24 STXVD2X T1, (R6+R3) RET -TEXT ·gfpDoubleAsm(SB),0,$0-16 +TEXT ·gfpDouble(SB),0,$0-16 MOVD c+0(FP), R3 MOVD a+8(FP), R4 @@ -204,7 +215,7 @@ TEXT ·gfpDoubleAsm(SB),0,$0-16 STXVD2X T1, (R6+R3) RET -TEXT ·gfpTripleAsm(SB),0,$0-16 +TEXT ·gfpTriple(SB),0,$0-16 MOVD c+0(FP), R3 MOVD a+8(FP), R4 @@ -726,7 +737,7 @@ TEXT gfpMulInternal<>(SB), NOSPLIT, $0 #define T1 V7 #define K0 V31 -TEXT ·gfpMulAsm(SB),NOSPLIT,$0 +TEXT ·gfpMul(SB),NOSPLIT,$0 MOVD c+0(FP), res_ptr MOVD a+8(FP), x_ptr MOVD b+16(FP), y_ptr @@ -766,7 +777,7 @@ TEXT ·gfpMulAsm(SB),NOSPLIT,$0 RET // func gfpSqr(res, in *gfP, n int) -TEXT ·gfpSqrAsm(SB),NOSPLIT,$0 +TEXT ·gfpSqr(SB),NOSPLIT,$0 MOVD res+0(FP), res_ptr MOVD in+8(FP), x_ptr MOVD n+16(FP), N @@ -825,3 +836,102 @@ done: #undef T0 #undef T1 #undef K0 + + +/* ---------------------------------------*/ +#define res_ptr R3 +#define x_ptr R4 +#define CPOOL R7 + +#define M0 V5 +#define M1 V4 +#define T0 V6 +#define T1 V7 +#define T2 V8 + +#define ADD1 V16 +#define ADD1H V17 +#define ADD2 V18 +#define ADD2H V19 +#define RED1 V20 +#define RED1H V21 +#define RED2 V22 +#define RED2H V23 +#define CAR1 V24 +#define CAR1M V25 + +#define MK0 V30 +#define K0 V31 + +// TMP1, TMP2 used in +// VMULT macros +#define TMP1 V13 +#define TMP2 V27 +#define ONE V29 // 1s splatted by word +// func gfpFromMont(res, in *gfP) +TEXT ·gfpFromMont(SB),NOSPLIT,$0 + MOVD res+0(FP), res_ptr + MOVD in+8(FP), x_ptr + + MOVD $16, R16 + + LXVD2X (R0)(x_ptr), T0 + LXVD2X (R16)(x_ptr), T1 + + XXPERMDI T0, T0, $2, T0 + XXPERMDI T1, T1, $2, T1 + + MOVD $·p2+0(SB), CPOOL + LXVD2X (CPOOL)(R0), M0 + LXVD2X (CPOOL)(R16), M1 + + XXPERMDI M0, M0, $2, M0 + XXPERMDI M1, M1, $2, M1 + + MOVD $·np+0(SB), CPOOL + LXVD2X (CPOOL)(R0), K0 + VSPLTW $1, K0, K0 + + // ---------------------------------------------------------------------------/ + VSPLTISW $1, ONE + VSPLTISB $0, T2 // VZERO T2 + + MOVD $8, R5 + MOVD R5, CTR + +loop: + VMULUWM T0, K0, MK0 + VSPLTW $3, MK0, MK0 + + VMULT_ADD(M0, MK0, T0, ONE, RED1, RED1H) + VMULT_ADD(M1, MK0, T1, ONE, RED2, RED2H) + + VSLDOI $12, RED2, RED1, RED1 // VSLDB + VSLDOI $12, T2, RED2, RED2 // VSLDB + + VADDCUQ RED1H, RED1, CAR1M // VACCQ + VADDUQM RED1H, RED1, T0 // VAQ + + // << ready for next MK0 + + VADDECUQ RED2H, RED2, CAR1M, T2 // VACCCQ + VADDEUQM RED2H, RED2, CAR1M, T1 // VACQ + + BDNZ loop + // --------------------------------------------------- + VSPLTISB $0, RED1 // VZERO RED1 + VSUBCUQ T0, M0, CAR1 // VSCBIQ + VSUBUQM T0, M0, ADD1 // VSQ + VSUBECUQ T1, M1, CAR1, CAR1M // VSBCBIQ + VSUBEUQM T1, M1, CAR1, ADD2 // VSBIQ + VSUBEUQM T2, RED1, CAR1M, T2 // VSBIQ + + // what output to use, ADD2||ADD1 or T1||T0? + VSEL ADD1, T0, T2, T0 + VSEL ADD2, T1, T2, T1 + + XXPERMDI T0, T0, $2, T0 + XXPERMDI T1, T1, $2, T1 + STXVD2X T0, (R0)(res_ptr) + STXVD2X T1, (R16)(res_ptr) + RET diff --git a/sm9/bn256/gfp_ppc64x_test.go b/sm9/bn256/gfp_ppc64x_test.go deleted file mode 100644 index 947f93b..0000000 --- a/sm9/bn256/gfp_ppc64x_test.go +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2024 Sun Yimin. All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -//go:build (ppc64 || ppc64le) && !purego - -package bn256 - -import "testing" - -func TestGfpNegAsm(t *testing.T) { - x := fromBigInt(bigFromHex("9093a2b979e6186f43a9b28d41ba644d533377f2ede8c66b19774bf4a9c7a596")) - got := &gfP{} - gfpSubAsm(got, zero, x) - expected := &gfP{} - gfpNegAsm(expected, x) - if *expected != *got { - t.Errorf("got %v, expected %v", got, expected) - } - gfpSubAsm(got, zero, zero) - gfpNegAsm(expected, zero) - if *expected != *got { - t.Errorf("got %v, expected %v", got, expected) - } -} - -func TestGfpAsmBasicOperations(t *testing.T) { - x := fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141")) - y := fromBigInt(bigFromHex("3722755292130B08D2AAB97FD34EC120EE265948D19C17ABF9B7213BAF82D65B")) - expectedAdd := fromBigInt(bigFromHex("0691692307d370af56226e57920199fbbe10f216c67fbc9468c7f225a4b1f21f")) - expectedDouble := fromBigInt(bigFromHex("551de7a0ee24723edcf314ff72f478fac1c7c4e7044238acc3913cfbcdaf7d05")) - expectedSub := fromBigInt(bigFromHex("67b381821c52a5624f3304a8149be8461e3bc07adcb872c38aa65051ba53ba97")) - expectedNeg := fromBigInt(bigFromHex("7f1d8aad70909be90358f1d02240062433cc3a0248ded72febb879ec33ce6f22")) - expectedMul := fromBigInt(bigFromHex("3d08bbad376584e4f74bd31f78f716372b96ba8c3f939c12b8d54e79b6489e76")) - expectedMul2 := fromBigInt(bigFromHex("1df94a9e05a559ff38e0ab50cece734dc058d33738ceacaa15986a67cbff1ef6")) - - t.Parallel() - t.Run("add", func(t *testing.T) { - ret := &gfP{} - gfpAddAsm(ret, x, y) - if *expectedAdd != *ret { - t.Errorf("add not same") - } - x1 := &gfP{} - x1.Set(x) - gfpAddAsm(x1, x1, y) - if *expectedAdd != *x1 { - t.Errorf("add not same when add self") - } - }) - - t.Run("double", func(t *testing.T) { - ret := &gfP{} - gfpDoubleAsm(ret, x) - if ret.Equal(expectedDouble) != 1 { - t.Errorf("double not same, got %v, expected %v", ret, expectedDouble) - } - ret.Set(x) - gfpDoubleAsm(ret, ret) - if ret.Equal(expectedDouble) != 1 { - t.Errorf("double not same, got %v, expected %v", ret, expectedDouble) - } - }) - - t.Run("triple", func(t *testing.T) { - expected := &gfP{} - gfpAddAsm(expected, x, expectedDouble) - ret := &gfP{} - ret.Set(x) - gfpTripleAsm(ret, ret) - if ret.Equal(expected) != 1 { - t.Errorf("expected %v, got %v", expected, ret) - } - }) - - t.Run("sub", func(t *testing.T) { - ret := &gfP{} - gfpSubAsm(ret, y, x) - if *expectedSub != *ret { - t.Errorf("sub not same") - } - x1 := &gfP{} - x1.Set(x) - gfpSubAsm(x1, y, x1) - if *expectedSub != *x1 { - t.Errorf("sub not same when sub self") - } - gfpSubAsm(ret, x, x) - if *ret != *zero { - t.Errorf("expected zero") - } - }) - - t.Run("neg", func(t *testing.T) { - ret := &gfP{} - gfpNegAsm(ret, y) - if *expectedNeg != *ret { - t.Errorf("neg not same") - } - ret.Set(y) - gfpNegAsm(ret, ret) - if *expectedNeg != *ret { - t.Errorf("neg not same when neg self") - } - }) - - t.Run("mul", func(t *testing.T) { - ret := &gfP{} - gfpMulAsm(ret, x, y) - if *expectedMul != *ret { - t.Errorf("mul not same") - } - ret.Set(x) - gfpMulAsm(ret, ret, y) - if *expectedMul != *ret { - t.Errorf("mul not same when mul self") - } - }) - - t.Run("square", func(t *testing.T) { - ret, ret1, ret2 := &gfP{}, &gfP{}, &gfP{} - gfpMulAsm(ret, x, y) - gfpMulAsm(ret1, ret, ret) - if *ret1 != *expectedMul2 { - t.Errorf("mul not same") - } - gfpMulAsm(ret1, ret1, ret1) - gfpSqrAsm(ret2, ret, 2) - if *ret1 != *ret2 { - t.Errorf("mul/sqr not same") - } - ret2.Set(ret) - gfpSqrAsm(ret2, ret2, 2) - if *ret1 != *ret2 { - t.Errorf("mul/sqr not same when square self") - } - }) -}