diff --git a/sm3/kdf_amd64.go b/sm3/kdf_amd64.go
new file mode 100644
index 0000000..a7f5a7b
--- /dev/null
+++ b/sm3/kdf_amd64.go
@@ -0,0 +1,13 @@
+//go:build !purego
+
+package sm3
+
+// kdf derives keyLen bytes from the absorbed input held in baseMD. Requests
+// shorter than four digests go to kdfGeneric; the rest use the 4-way kdfBy4.
+func kdf(baseMD *digest, keyLen int, limit int) []byte {
+	if limit < 4 {
+		return kdfGeneric(baseMD, keyLen, limit)
+	}
+
+	return kdfBy4(baseMD, keyLen, limit)
+}
diff --git a/sm3/kdf_arm64.go b/sm3/kdf_arm64.go
new file mode 100644
index 0000000..fd62aa4
--- /dev/null
+++ b/sm3/kdf_arm64.go
@@ -0,0 +1,13 @@
+//go:build !purego
+
+package sm3
+
+// kdf derives keyLen bytes from the absorbed input held in baseMD. It falls
+// back to kdfGeneric when useSM3NI is set or fewer than four digests are
+// needed, and otherwise uses the 4-way kdfBy4.
+func kdf(baseMD *digest, keyLen int, limit int) []byte {
+	if useSM3NI || limit < 4 {
+		return kdfGeneric(baseMD, keyLen, limit)
+	}
+	return kdfBy4(baseMD, keyLen, limit)
+}
diff --git a/sm3/kdf_generic.go b/sm3/kdf_generic.go
new file mode 100644
index 0000000..f6cd786
--- /dev/null
+++ b/sm3/kdf_generic.go
@@ -0,0 +1,8 @@
+//go:build purego || !(amd64 || arm64)
+
+package sm3
+
+// kdf derives keyLen bytes from baseMD using the portable kdfGeneric.
+func kdf(baseMD *digest, keyLen int, limit int) []byte {
+	return kdfGeneric(baseMD, keyLen, limit)
+}
diff --git a/sm3/kdf_mult_asm.go b/sm3/kdf_mult_asm.go
new file mode 100644
index 0000000..b289e46
--- /dev/null
+++ b/sm3/kdf_mult_asm.go
@@ -0,0 +1,109 @@
+//go:build (amd64 || arm64) && !purego
+
+package sm3
+
+import "encoding/binary"
+
+// prepareData writes one complete padded SM3 tail message into p: the
+// baseMD.nx bytes still buffered in baseMD.x, the 4-byte big-endian counter
+// ct, the 0x80 marker, t-1 zero bytes and the 8-byte big-endian value len.
+// p must hold at least baseMD.nx+4+t+8 bytes and t must be in [1, 64].
+func prepareData(baseMD *digest, p []byte, ct uint32, len, t uint64) {
+	if baseMD.nx > 0 {
+		copy(p, baseMD.x[:baseMD.nx])
+	}
+	binary.BigEndian.PutUint32(p[baseMD.nx:], ct)
+	// Padding. Add a 1 bit and 0 bits until 56 bytes mod 64.
+	var tmp [64 + 8]byte // padding + length buffer
+	tmp[0] = 0x80
+	padlen := tmp[:t+8]
+	binary.BigEndian.PutUint64(padlen[t:], len)
+	copy(p[baseMD.nx+4:], padlen)
+}
+
+// copyResult serializes the eight state words of dig into the first 32 bytes
+// of result in big-endian order. result must be at least 32 bytes long.
+func copyResult(result []byte, dig *[8]uint32) {
+	for i, v := range dig {
+		binary.BigEndian.PutUint32(result[4*i:], v)
+	}
+}
+
+// preallocSize is the size of the scratch buffer allocated by kdfBy4:
+// 64 * 2 * 4 = 512 bytes for four padded messages of up to two blocks each,
+// plus 1216 = 68 * 4 * 4 + 8 * 4 * 4 = 76 * 16 bytes of working space for
+// blockMultBy4.
+const preallocSize = 1728
+
+// kdfBy4 derives keyLen bytes of key material from the state in baseMD,
+// hashing four counter values at a time with blockMultBy4. limit is the number
+// of Size-byte digests required to cover keyLen; the limit%4 digests left over
+// after the 4-way batches are computed one at a time.
+func kdfBy4(baseMD *digest, keyLen int, limit int) []byte {
+	if limit < 4 {
+		return kdfGeneric(baseMD, keyLen, limit)
+	}
+
+	// Every derived message is the absorbed input plus a 4-byte counter, so all
+	// of them share the same padding length t and block count.
+	var t uint64
+	blocks := 1
+	msgLen := baseMD.len + 4
+	remainlen := msgLen % 64
+	if remainlen < 56 {
+		t = 56 - remainlen
+	} else {
+		t = 64 + 56 - remainlen
+		blocks = 2
+	}
+	bitLen := msgLen << 3
+	// prepare temporary buffer
+	tmpStart := 4 * blocks * BlockSize
+	buffer := make([]byte, preallocSize)
+	tmp := buffer[tmpStart:]
+	// prepare processing data
+	var data [4]*byte
+	var digs [4]*[8]uint32
+	var states [4][8]uint32
+	for j := 0; j < 4; j++ {
+		digs[j] = &states[j]
+	}
+
+	var ct uint32 = 1
+	// Allocate whole digests: copyResult always writes Size bytes, so a buffer
+	// of only keyLen bytes would be overrun by the last 4-way batch whenever
+	// keyLen is not a multiple of Size. The surplus is trimmed on return.
+	k := make([]byte, limit*Size)
+	ret := k
+	times := limit / 4
+	for i := 0; i < times; i++ {
+		for j := 0; j < 4; j++ {
+			// prepare states
+			states[j] = baseMD.h
+			// prepare data
+			p := buffer[blocks*BlockSize*j:]
+			data[j] = &p[0]
+			prepareData(baseMD, p, ct, bitLen, t)
+			ct++
+		}
+		blockMultBy4(&digs[0], &data[0], &tmp[0], blocks)
+		for j := 0; j < 4; j++ {
+			copyResult(ret, digs[j])
+			ret = ret[Size:]
+		}
+	}
+	remain := limit % 4
+	for i := 0; i < remain; i++ {
+		binary.BigEndian.PutUint32(tmp[:], ct)
+		md := *baseMD
+		md.Write(tmp[:4])
+		h := md.checkSum()
+		copy(ret[i*Size:], h[:])
+		ct++
+	}
+
+	// Cap the slice as well so the unused tail of the last digest is unreachable.
+	return k[:keyLen:keyLen]
+}
+
+//go:noescape
+func blockMultBy4(dig
**[8]uint32, p **byte, buffer *byte, blocks int) diff --git a/sm3/sm3.go b/sm3/sm3.go index c014689..409487b 100644 --- a/sm3/sm3.go +++ b/sm3/sm3.go @@ -218,12 +218,16 @@ func (baseMD *digest) Kdf(z []byte, keyLen int) []byte { if limit >= uint64(1<<32)-1 { panic("sm3: key length too long") } + baseMD.Reset() + baseMD.Write(z) + return kdf(baseMD, keyLen, int(limit)) +} + +func kdfGeneric(baseMD *digest, keyLen int, limit int) []byte { var countBytes [4]byte var ct uint32 = 1 k := make([]byte, keyLen) - baseMD.Reset() - baseMD.Write(z) - for i := 0; i < int(limit); i++ { + for i := 0; i < limit; i++ { binary.BigEndian.PutUint32(countBytes[:], ct) md := *baseMD md.Write(countBytes[:]) diff --git a/sm3/sm3_test.go b/sm3/sm3_test.go index f442735..89e7a35 100644 --- a/sm3/sm3_test.go +++ b/sm3/sm3_test.go @@ -420,12 +420,14 @@ func TestKdf(t *testing.T) { {"sm3 case 3", args{[]byte("emmansun"), 48}, "708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"}, {"sm3 case 4", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 48}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f"}, {"sm3 case 5", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 128}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb"}, + {"sm3 case 6", 
args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 159}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e8"}, + {"sm3 case 7", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 300}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a9"}, } for _, tt := range tests { wantBytes, _ := hex.DecodeString(tt.want) t.Run(tt.name, func(t *testing.T) { if got := Kdf(tt.args.z, tt.args.len); !reflect.DeepEqual(got, wantBytes) { - t.Errorf("Kdf(%v) = %x, want %v", tt.name, got, tt.want) + t.Errorf("Kdf(%v,kLen=%v,zLen=%v) = %x, want %v", tt.name, tt.args.len, len(tt.args.z,), got, tt.want) } }) } diff --git a/sm3/sm3blocks_arm64.s b/sm3/sm3blocks_arm64.s new file mode 100644 index 0000000..d4bf642 --- /dev/null +++ b/sm3/sm3blocks_arm64.s @@ -0,0 +1,309 @@ +//go:build !purego + +#include "textflag.h" +#include "sm3_const_asm.s" + +#define a V0 +#define b V1 
+#define c V2
+#define d V3
+#define e V4
+#define f V5
+#define g V6
+#define h V7
+
+#define tmp1 V8
+#define tmp2 V9
+#define tmp3 V10
+#define tmp4 V11
+
+// input: from high to low
+// t0 = t0.S3, t0.S2, t0.S1, t0.S0
+// t1 = t1.S3, t1.S2, t1.S1, t1.S0
+// t2 = t2.S3, t2.S2, t2.S1, t2.S0
+// t3 = t3.S3, t3.S2, t3.S1, t3.S0
+// output: from high to low
+// t0 = t3.S0, t2.S0, t1.S0, t0.S0
+// t1 = t3.S1, t2.S1, t1.S1, t0.S1
+// t2 = t3.S2, t2.S2, t1.S2, t0.S2
+// t3 = t3.S3, t2.S3, t1.S3, t0.S3
+#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
+	VZIP1 t1.S4, t0.S4, RTMP0.S4 \
+	VZIP1 t3.S4, t2.S4, RTMP1.S4 \
+	VZIP2 t1.S4, t0.S4, RTMP2.S4 \
+	VZIP2 t3.S4, t2.S4, RTMP3.S4 \
+	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2 \
+	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2 \
+	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2 \
+	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2
+
+// d = s <<< n for every 32-bit lane; s and d must be different registers.
+#define PROLD(s, d, n) \
+	VSHL $(n), s.S4, d.S4 \
+	VSRI $(32-n), s.S4, d.S4
+
+#define loadWordByIndex(W, i) \
+	ADD $(16*(i)), wordStart, R20 \ // R20 = address of schedule word i (16 bytes per word); i is parenthesized because callers pass expressions such as index+4
+	VLD1 (R20), [W.S4]
+
+#define prepare4Words \
+	VLD1.P 16(srcPtr1), [V12.B16] \ // next 16 message bytes of each of the four inputs
+	VLD1.P 16(srcPtr2), [V13.B16] \
+	VLD1.P 16(srcPtr3), [V14.B16] \
+	VLD1.P 16(srcPtr4), [V15.B16] \
+	TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4); \
+	VREV32 V12.B16, V12.B16; \ // byte-swap every 32-bit word
+	VREV32 V13.B16, V13.B16; \
+	VREV32 V14.B16, V14.B16; \
+	VREV32 V15.B16, V15.B16; \
+	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr)
+
+#define LOAD_T(const, T) \
+	MOVD $const, R20 \ // broadcast the round constant into all four lanes of T
+	VDUP R20, T.S4
+
+#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
+	PROLD(a, V12, 12) \
+	VMOV V12.B16, V13.B16 \
+	LOAD_T(const, tmp1) \
+	VADD tmp1.S4, V12.S4, V12.S4 \
+	VADD e.S4, V12.S4, V12.S4 \
+	PROLD(V12, V14, 7) \ // V14 = SS1
+	VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2
+	VEOR a.B16, b.B16, V13.B16 \
+	VEOR c.B16, V13.B16, V13.B16 \
+	VADD V13.S4, d.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d
+	loadWordByIndex(V10, index) \
+	loadWordByIndex(V11, index+4) \
+	VEOR V10.B16, V11.B16, V11.B16 \
+	VADD V11.S4, V13.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
+	VADD V13.S4, V12.S4, V13.S4 \ // TT1
+	VADD h.S4, V10.S4, V10.S4 \
+	VADD V14.S4, V10.S4, V10.S4 \ // Wt + h + SS1
+	VEOR e.B16, f.B16, V11.B16 \
+	VEOR g.B16, V11.B16, V11.B16 \
+	VADD V11.S4, V10.S4, V10.S4 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
+	VMOV b.B16, V11.B16 \
+	PROLD(V11, b, 9) \ // b = b <<< 9
+	VMOV V13.B16, h.B16 \ // h = TT1
+	VMOV f.B16, V11.B16 \
+	PROLD(V11, f, 19) \ // f = f <<< 19
+	PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9
+	PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17
+	VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)
+	VEOR V11.B16, V12.B16, d.B16 // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
+
+#define MESSAGE_SCHEDULE(index) \
+	loadWordByIndex(V10, index+1) \ // Wj-3
+	PROLD(V10, V11, 15) \
+	loadWordByIndex(V10, index-12) \ // Wj-16
+	VEOR V10.B16, V11.B16, V10.B16 \
+	loadWordByIndex(V11, index-5) \ // Wj-9
+	VEOR V10.B16, V11.B16, V10.B16 \
+	PROLD(V10, V11, 15) \
+	PROLD(V11, V12, 8) \
+	VEOR V11.B16, V10.B16, V10.B16 \
+	VEOR V12.B16, V10.B16, V10.B16 \ // P1
+	loadWordByIndex(V11, index-9) \ // Wj-13
+	PROLD(V11, V12, 7) \
+	VEOR V12.B16, V10.B16, V10.B16 \
+	loadWordByIndex(V11, index-2) \ // Wj-6
+	VEOR V11.B16, V10.B16, V11.B16 \
+	VST1.P [V11.S4], 16(wordPtr) // append W[index+4]; V11 still holds it afterwards
+
+#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index) \
+	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)
+
+#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index); \ // V11 is Wt+4 now, Pls do not use it
+	PROLD(a, V12, 12) \
+	VMOV V12.B16, V13.B16 \
+	LOAD_T(const, tmp1) \
+	VADD tmp1.S4, V12.S4, V12.S4 \
+	VADD e.S4, V12.S4, V12.S4 \
+	PROLD(V12, V14, 7) \ // V14 = SS1
+	VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2
+	VORR a.B16, b.B16, V10.B16 \
+	VAND a.B16, b.B16, V13.B16 \
+	VAND c.B16, V10.B16, V10.B16 \
+	VORR V13.B16, V10.B16, V13.B16 \ // (a AND b) OR (a AND c) OR (b AND c)
+	VADD V13.S4, d.S4, V13.S4 \ // (a AND b) OR (a AND c) OR (b AND c) + d
+	loadWordByIndex(V10, index) \ // Wj
+	VEOR V10.B16, V11.B16, V11.B16 \ // Wj XOR Wj+4
+	VADD V13.S4, V11.S4, V13.S4 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
+	VADD V13.S4, V12.S4, V13.S4 \ // TT1
+	VADD h.S4, V10.S4, V10.S4 \ // Wt + h
+	VADD V14.S4, V10.S4, V10.S4 \ // Wt + h + SS1 (SS1 lives in V14; V12 is SS2)
+	VEOR f.B16, g.B16, V11.B16 \
+	VAND V11.B16, e.B16, V11.B16 \
+	VEOR g.B16, V11.B16, V11.B16 \ // (f XOR g) AND e XOR g
+	VADD V11.S4, V10.S4, V10.S4 \ // TT2 = ((f XOR g) AND e XOR g) + Wt + h + SS1
+	VMOV b.B16, V11.B16 \
+	PROLD(V11, b, 9) \ // b = b <<< 9
+	VMOV V13.B16, h.B16 \ // h = TT1
+	VMOV f.B16, V11.B16 \
+	PROLD(V11, f, 19) \ // f = f <<< 19
+	PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9
+	PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17
+	VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)
+	VEOR V11.B16, V12.B16, d.B16 // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
+
+// func blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
+TEXT ·blockMultBy4(SB), NOSPLIT, $0-32
+#define digPtr R0 // walks the four state pointers passed in dig
+#define srcPtrPtr R1 // walks the four message pointers passed in p
+#define statePtr R2 // scratch buffer: 128 bytes of transposed state, then the schedule
+#define blockCount R3 // blocks still to process per message
+#define digSave R4 // copy of dig used to write the results back
+#define wordStart R5 // start of the message schedule inside the scratch buffer
+#define srcPtr1 R6
+#define srcPtr2 R7
+#define srcPtr3 R8
+#define srcPtr4 R9
+#define wordPtr R10 // append position inside the message schedule
+	MOVD dig+0(FP), digPtr
+	MOVD p+8(FP), srcPtrPtr
+	MOVD buffer+16(FP), statePtr
+	MOVD blocks+24(FP), blockCount
+
+	// load state
+	MOVD digPtr, digSave
+	MOVD.P 8(digPtr), R20
+	VLD1.P 16(R20), [a.S4]
+	VLD1 (R20), [e.S4]
+	MOVD.P 8(digPtr), R20
+	VLD1.P 16(R20), [b.S4]
+	VLD1 (R20), [f.S4]
+	MOVD.P 8(digPtr), R20
+	VLD1.P 16(R20), [c.S4]
+	VLD1 (R20), [g.S4]
+	MOVD (digPtr), R20
+	VLD1.P 16(R20), [d.S4]
+	VLD1 (R20), [h.S4]
+
+	// transpose state
+	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
+	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)
+
+	// store state to temporary buffer
+	MOVD statePtr, wordStart
+	VST1.P [a.S4, b.S4, c.S4, d.S4], 64(wordStart)
+	VST1.P [e.S4, f.S4, g.S4, h.S4], 64(wordStart) // wordStart now points just past the saved state
+
+	MOVD.P 8(srcPtrPtr), srcPtr1
+	MOVD.P 8(srcPtrPtr), srcPtr2
+	MOVD.P 8(srcPtrPtr), srcPtr3
+	MOVD (srcPtrPtr), srcPtr4
+
+loop:
+	MOVD wordStart, wordPtr // rewind the schedule cursor for every block, not only the first
+	// load message block
+	prepare4Words
+	prepare4Words
+	prepare4Words
+	prepare4Words
+
+	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
+	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
+	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
+	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
+	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
+	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
+	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
+	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
+	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
+	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
+	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
+	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)
+
+	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
+	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
+	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
+	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)
+
+	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
+	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
+	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
+	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
+	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
+	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
+	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
+	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
+	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
+	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
+	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
+	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
+	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
+	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
+	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
+	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
+	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
+	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
+
+	MOVD statePtr, R20 // XOR the round output with the state saved before this block
+	VLD1.P 64(R20), [V8.S4, V9.S4, V10.S4, V11.S4]
+	VLD1 (R20), [V12.S4, V13.S4, V14.S4, V15.S4]
+	VEOR a.B16, V8.B16, a.B16
+	VEOR b.B16, V9.B16, b.B16
+	VEOR c.B16, V10.B16, c.B16
+	VEOR d.B16, V11.B16, d.B16
+	VEOR e.B16, V12.B16, e.B16
+	VEOR f.B16, V13.B16, f.B16
+	VEOR g.B16, V14.B16, g.B16
+	VEOR h.B16, V15.B16, h.B16
+	MOVD statePtr, R20 // save the new state for the next block
+	VST1.P [a.S4, b.S4, c.S4, d.S4], 64(R20)
+	VST1 [e.S4, f.S4, g.S4, h.S4], (R20)
+
+	SUB $1, blockCount
+	CBNZ blockCount, loop
+
+	// transpose state
+	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
+	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)
+
+	MOVD.P 8(digSave), R20
+	VST1.P [a.S4], 16(R20)
+	VST1 [e.S4], (R20)
+	MOVD.P 8(digSave), R20
+	VST1.P [b.S4], 16(R20)
+	VST1 [f.S4], (R20)
+	MOVD.P 8(digSave), R20
+	VST1.P [c.S4], 16(R20)
+	VST1 [g.S4], (R20)
+
MOVD (digSave), R20 + VST1.P [d.S4], 16(R20) + VST1 [h.S4], (R20) + + RET diff --git a/sm3/sm3blocks_simd_amd64.s b/sm3/sm3blocks_simd_amd64.s new file mode 100644 index 0000000..47def0b --- /dev/null +++ b/sm3/sm3blocks_simd_amd64.s @@ -0,0 +1,736 @@ +//go:build !purego + +#include "textflag.h" + +// shuffle byte order from LE to BE +DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203 +DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b +GLOBL flip_mask<>(SB), RODATA, $16 + +DATA T256_4<>+0x00(SB)/8, $0x79cc451979cc4519 +DATA T256_4<>+0x08(SB)/8, $0x79cc451979cc4519 +DATA T256_4<>+0x10(SB)/8, $0xf3988a32f3988a32 +DATA T256_4<>+0x18(SB)/8, $0xf3988a32f3988a32 +DATA T256_4<>+0x20(SB)/8, $0xe7311465e7311465 +DATA T256_4<>+0x28(SB)/8, $0xe7311465e7311465 +DATA T256_4<>+0x30(SB)/8, $0xce6228cbce6228cb +DATA T256_4<>+0x38(SB)/8, $0xce6228cbce6228cb +DATA T256_4<>+0x40(SB)/8, $0x9cc451979cc45197 +DATA T256_4<>+0x48(SB)/8, $0x9cc451979cc45197 +DATA T256_4<>+0x50(SB)/8, $0x3988a32f3988a32f +DATA T256_4<>+0x58(SB)/8, $0x3988a32f3988a32f +DATA T256_4<>+0x60(SB)/8, $0x7311465e7311465e +DATA T256_4<>+0x68(SB)/8, $0x7311465e7311465e +DATA T256_4<>+0x70(SB)/8, $0xe6228cbce6228cbc +DATA T256_4<>+0x78(SB)/8, $0xe6228cbce6228cbc +DATA T256_4<>+0x80(SB)/8, $0xcc451979cc451979 +DATA T256_4<>+0x88(SB)/8, $0xcc451979cc451979 +DATA T256_4<>+0x90(SB)/8, $0x988a32f3988a32f3 +DATA T256_4<>+0x98(SB)/8, $0x988a32f3988a32f3 +DATA T256_4<>+0xa0(SB)/8, $0x311465e7311465e7 +DATA T256_4<>+0xa8(SB)/8, $0x311465e7311465e7 +DATA T256_4<>+0xb0(SB)/8, $0x6228cbce6228cbce +DATA T256_4<>+0xb8(SB)/8, $0x6228cbce6228cbce +DATA T256_4<>+0xc0(SB)/8, $0xc451979cc451979c +DATA T256_4<>+0xc8(SB)/8, $0xc451979cc451979c +DATA T256_4<>+0xd0(SB)/8, $0x88a32f3988a32f39 +DATA T256_4<>+0xd8(SB)/8, $0x88a32f3988a32f39 +DATA T256_4<>+0xe0(SB)/8, $0x11465e7311465e73 +DATA T256_4<>+0xe8(SB)/8, $0x11465e7311465e73 +DATA T256_4<>+0xf0(SB)/8, $0x228cbce6228cbce6 +DATA T256_4<>+0xf8(SB)/8, $0x228cbce6228cbce6 +DATA 
T256_4<>+0x0100(SB)/8, $0x9d8a7a879d8a7a87 +DATA T256_4<>+0x0108(SB)/8, $0x9d8a7a879d8a7a87 +DATA T256_4<>+0x0110(SB)/8, $0x3b14f50f3b14f50f +DATA T256_4<>+0x0118(SB)/8, $0x3b14f50f3b14f50f +DATA T256_4<>+0x0120(SB)/8, $0x7629ea1e7629ea1e +DATA T256_4<>+0x0128(SB)/8, $0x7629ea1e7629ea1e +DATA T256_4<>+0x0130(SB)/8, $0xec53d43cec53d43c +DATA T256_4<>+0x0138(SB)/8, $0xec53d43cec53d43c +DATA T256_4<>+0x0140(SB)/8, $0xd8a7a879d8a7a879 +DATA T256_4<>+0x0148(SB)/8, $0xd8a7a879d8a7a879 +DATA T256_4<>+0x0150(SB)/8, $0xb14f50f3b14f50f3 +DATA T256_4<>+0x0158(SB)/8, $0xb14f50f3b14f50f3 +DATA T256_4<>+0x0160(SB)/8, $0x629ea1e7629ea1e7 +DATA T256_4<>+0x0168(SB)/8, $0x629ea1e7629ea1e7 +DATA T256_4<>+0x0170(SB)/8, $0xc53d43cec53d43ce +DATA T256_4<>+0x0178(SB)/8, $0xc53d43cec53d43ce +DATA T256_4<>+0x0180(SB)/8, $0x8a7a879d8a7a879d +DATA T256_4<>+0x0188(SB)/8, $0x8a7a879d8a7a879d +DATA T256_4<>+0x0190(SB)/8, $0x14f50f3b14f50f3b +DATA T256_4<>+0x0198(SB)/8, $0x14f50f3b14f50f3b +DATA T256_4<>+0x01a0(SB)/8, $0x29ea1e7629ea1e76 +DATA T256_4<>+0x01a8(SB)/8, $0x29ea1e7629ea1e76 +DATA T256_4<>+0x01b0(SB)/8, $0x53d43cec53d43cec +DATA T256_4<>+0x01b8(SB)/8, $0x53d43cec53d43cec +DATA T256_4<>+0x01c0(SB)/8, $0xa7a879d8a7a879d8 +DATA T256_4<>+0x01c8(SB)/8, $0xa7a879d8a7a879d8 +DATA T256_4<>+0x01d0(SB)/8, $0x4f50f3b14f50f3b1 +DATA T256_4<>+0x01d8(SB)/8, $0x4f50f3b14f50f3b1 +DATA T256_4<>+0x01e0(SB)/8, $0x9ea1e7629ea1e762 +DATA T256_4<>+0x01e8(SB)/8, $0x9ea1e7629ea1e762 +DATA T256_4<>+0x01f0(SB)/8, $0x3d43cec53d43cec5 +DATA T256_4<>+0x01f8(SB)/8, $0x3d43cec53d43cec5 +DATA T256_4<>+0x0200(SB)/8, $0x7a879d8a7a879d8a +DATA T256_4<>+0x0208(SB)/8, $0x7a879d8a7a879d8a +DATA T256_4<>+0x0210(SB)/8, $0xf50f3b14f50f3b14 +DATA T256_4<>+0x0218(SB)/8, $0xf50f3b14f50f3b14 +DATA T256_4<>+0x0220(SB)/8, $0xea1e7629ea1e7629 +DATA T256_4<>+0x0228(SB)/8, $0xea1e7629ea1e7629 +DATA T256_4<>+0x0230(SB)/8, $0xd43cec53d43cec53 +DATA T256_4<>+0x0238(SB)/8, $0xd43cec53d43cec53 +DATA T256_4<>+0x0240(SB)/8, 
$0xa879d8a7a879d8a7 +DATA T256_4<>+0x0248(SB)/8, $0xa879d8a7a879d8a7 +DATA T256_4<>+0x0250(SB)/8, $0x50f3b14f50f3b14f +DATA T256_4<>+0x0258(SB)/8, $0x50f3b14f50f3b14f +DATA T256_4<>+0x0260(SB)/8, $0xa1e7629ea1e7629e +DATA T256_4<>+0x0268(SB)/8, $0xa1e7629ea1e7629e +DATA T256_4<>+0x0270(SB)/8, $0x43cec53d43cec53d +DATA T256_4<>+0x0278(SB)/8, $0x43cec53d43cec53d +DATA T256_4<>+0x0280(SB)/8, $0x879d8a7a879d8a7a +DATA T256_4<>+0x0288(SB)/8, $0x879d8a7a879d8a7a +DATA T256_4<>+0x0290(SB)/8, $0x0f3b14f50f3b14f5 +DATA T256_4<>+0x0298(SB)/8, $0x0f3b14f50f3b14f5 +DATA T256_4<>+0x02a0(SB)/8, $0x1e7629ea1e7629ea +DATA T256_4<>+0x02a8(SB)/8, $0x1e7629ea1e7629ea +DATA T256_4<>+0x02b0(SB)/8, $0x3cec53d43cec53d4 +DATA T256_4<>+0x02b8(SB)/8, $0x3cec53d43cec53d4 +DATA T256_4<>+0x02c0(SB)/8, $0x79d8a7a879d8a7a8 +DATA T256_4<>+0x02c8(SB)/8, $0x79d8a7a879d8a7a8 +DATA T256_4<>+0x02d0(SB)/8, $0xf3b14f50f3b14f50 +DATA T256_4<>+0x02d8(SB)/8, $0xf3b14f50f3b14f50 +DATA T256_4<>+0x02e0(SB)/8, $0xe7629ea1e7629ea1 +DATA T256_4<>+0x02e8(SB)/8, $0xe7629ea1e7629ea1 +DATA T256_4<>+0x02f0(SB)/8, $0xcec53d43cec53d43 +DATA T256_4<>+0x02f8(SB)/8, $0xcec53d43cec53d43 +GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16 + +// Transpose matrix without PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions, bad performance! 
+// input: from high to low +// r0 = [w3, w2, w1, w0] +// r1 = [w7, w6, w5, w4] +// r2 = [w11, w10, w9, w8] +// r3 = [w15, w14, w13, w12] +// r: 32/64 temp register +// tmp1: 128 bits temp register +// tmp2: 128 bits temp register +// +// output: from high to low +// r0 = [w12, w8, w4, w0] +// r1 = [w13, w9, w5, w1] +// r2 = [w14, w10, w6, w2] +// r3 = [w15, w11, w7, w3] +// +// SSE2/MMX instructions: +// MOVOU r0, tmp2; +// PUNPCKHDQ r1, tmp2; +// PUNPCKLDQ r1, r0; +// MOVOU r2, tmp1; +// PUNPCKLDQ r3, tmp1; +// PUNPCKHDQ r3, r2; +// MOVOU r0, r1; +// PUNPCKHQDQ tmp1, r1; +// PUNPCKLQDQ tmp1, r0; +// MOVOU tmp2, r3; +// PUNPCKHQDQ r2, r3; +// PUNPCKLQDQ r2, tmp2; +// MOVOU tmp2, r2 +#define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ + MOVOU r0, tmp2; \ + PUNPCKHLQ r1, tmp2; \ + PUNPCKLLQ r1, r0; \ + MOVOU r2, tmp1; \ + PUNPCKLLQ r3, tmp1; \ + PUNPCKHLQ r3, r2; \ + MOVOU r0, r1; \ + PUNPCKHQDQ tmp1, r1; \ + PUNPCKLQDQ tmp1, r0; \ + MOVOU tmp2, r3; \ + PUNPCKHQDQ r2, r3; \ + PUNPCKLQDQ r2, tmp2; \ + MOVOU tmp2, r2 + +#define a X0 +#define b X1 +#define c X2 +#define d X3 +#define e X4 +#define f X5 +#define g X6 +#define h X7 + +#define tmp1 X8 +#define tmp2 X9 + +#define storeState \ + MOVOU a, (BX) \ + MOVOU b, 16(BX) \ + MOVOU c, 32(BX) \ + MOVOU d, 48(BX) \ + MOVOU e, 64(BX) \ + MOVOU f, 80(BX) \ + MOVOU g, 96(BX) \ + MOVOU h, 112(BX) + +// xorm (mem), reg +// Xor reg to mem using reg-mem xor and store +#define xorm(P1, P2) \ + MOVOU P1, tmp1; \ + PXOR tmp1, P2; \ + MOVOU P2, P1 + +#define store4Words(W, j) MOVOU W, (128+(j)*16)(BX) +#define load4Words(W, i) MOVOU (128+(i)*16)(BX), W + +#define prepareFirst16Words(i) \ + MOVOU (i*16)(R8), X10; \ + MOVOU (i*16)(R9), X11; \ + MOVOU (i*16)(R10), X12; \ + MOVOU (i*16)(R11), X13; \ + ; \ + SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \ + MOVOU flip_mask<>(SB), tmp1; \ + PSHUFB tmp1, X10; \ + PSHUFB tmp1, X11; \ + PSHUFB tmp1, X12; \ + PSHUFB tmp1, X13; \ + ; \ + store4Words(X10, 4*i+0); \ + 
store4Words(X11, 4*i+1); \ + store4Words(X12, 4*i+2); \ + store4Words(X13, 4*i+3) + +// r <<< n, SSE version +#define PROLD(r, n) \ + MOVOU r, tmp1; \ + PSLLL $n, r; \ + PSRLL $(32-n), tmp1; \ + POR tmp1, r + +#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \ + MOVOU a, X12; \ + PROLD(X12, 12); \ + MOVOU X12, X13; \ // a <<< 12 + MOVOU (index*16)(AX), tmp2; \ + PADDL tmp2, X12; \ + PADDL e, X12; \ + PROLD(X12, 7); \ // SS1 + PXOR X12, X13; \ // SS2 + MOVOU b, X14; \ + PXOR a, X14; \ + PXOR c, X14; \ // (a XOR b XOR c) + PADDL d, X14; \ // (a XOR b XOR c) + d + load4Words(X10, index); \ + load4Words(X11, index+4); \ + PXOR X10, X11; \ //Wt XOR Wt+4 + PADDL X11, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4 + PADDL X14, X13; \ // TT1 + PADDL h, X10; \ // Wt + h + PADDL X12, X10; \ // Wt + h + SS1 + MOVOU e, X11; \ + PXOR f, X11; \ + PXOR g, X11; \ // (e XOR f XOR g) + PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1 + ; \ // copy result + PROLD(b, 9); \ + MOVOU X13, h; \ + PROLD(f, 19); \ + MOVOU X10, X13; \ + PROLD(X13, 9); \ + PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) + PROLD(X10, 17); \ + PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2) + MOVOU X13, d + +#define MESSAGE_SCHEDULE(index) \ + load4Words(X10, index+1); \ // Wj-3 + PROLD(X10, 15); \ + load4Words(X11, index-12); \ // Wj-16 + PXOR X11, X10; \ + load4Words(X11, index-5); \ // Wj-9 + PXOR X11, X10; \ + MOVOU X10, X11; \ + PROLD(X11, 15); \ + PXOR X11, X10; \ + PROLD(X11, 8); \ + PXOR X11, X10; \ // P1 + load4Words(X11, index-9); \ // Wj-13 + PROLD(X11, 7); \ + PXOR X11, X10; \ + load4Words(X11, index-2); \ // Wj-6 + PXOR X10, X11; \ + store4Words(X11, index+4) + +#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \ + MESSAGE_SCHEDULE(index); \ + ROUND_00_11(index, a, b, c, d, e, f, g, h) + +#define ROUND_16_63(index, cIndex, a, b, c, d, e, f, g, h) \ + MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it + MOVOU a, X12; \ + PROLD(X12, 12); \ + MOVOU X12, X13; \ // a <<< 12 + 
MOVOU (cIndex*16)(AX), tmp2; \ + PADDL tmp2, X12; \ + PADDL e, X12; \ + PROLD(X12, 7); \ // SS1 + PXOR X12, X13; \ // SS2 + ; \ + MOVOU a, X14; \ + POR b, X14; \ + MOVOU a, X10; \ + PAND b, X10; \ + PAND c, X14; \ + POR X10, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + load4Words(X10, index); \ + PXOR X10, X11; \ //Wt XOR Wt+4 + PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4 + PADDL X14, X13; \ // TT1 + ; \ + PADDL h, X10; \ // Wt + h + PADDL X12, X10; \ // Wt + h + SS1 + MOVOU f, X11; \ + PXOR g, X11; \ + PAND e, X11; \ // (f XOR g) AND e XOR g + PXOR g, X11; \ + PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1 + ; \ // copy result + PROLD(b, 9); \ + MOVOU X13, h; \ + PROLD(f, 19); \ + MOVOU X10, X13; \ + PROLD(X13, 9); \ + PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) + PROLD(X10, 17); \ + PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2) + MOVOU X13, d + +// transpose matrix function, AVX/AVX2 version +// parameters: +// - r0: 128/256 bits register as input/output data +// - r1: 128/256 bits register as input/output data +// - r2: 128/256 bits register as input/output data +// - r3: 128/256 bits register as input/output data +// - tmp1: 128/256 bits temp register +// - tmp2: 128/256 bits temp register +#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \ + VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2] + VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0] + VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8] + VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10] + VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1] + VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0] + VPUNPCKHQDQ 
r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] + VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] + +// avxXorm (mem), reg +// Xor reg to mem using reg-mem xor and store +#define avxXorm(P1, P2) \ + VPXOR P1, P2, P2; \ + VMOVDQU P2, P1 + +#define avxStore4Words(W, j) VMOVDQU W, (128+(j)*16)(BX) +#define avxLoad4Words(W, i) VMOVDQU (128+(i)*16)(BX), W + +#define avxPrepareFirst16Words(i) \ + VMOVDQU (i*16)(R8), X10; \ + VMOVDQU (i*16)(R9), X11; \ + VMOVDQU (i*16)(R10), X12; \ + VMOVDQU (i*16)(R11), X13; \ + ; \ + TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \ + VPSHUFB flip_mask<>(SB), X10, X10; \ + VPSHUFB flip_mask<>(SB), X11, X11; \ + VPSHUFB flip_mask<>(SB), X12, X12; \ + VPSHUFB flip_mask<>(SB), X13, X13; \ + ; \ + avxStore4Words(X10, 4*i+0); \ + avxStore4Words(X11, 4*i+1); \ + avxStore4Words(X12, 4*i+2); \ + avxStore4Words(X13, 4*i+3) + +// r <<< n +#define VPROLD(r, n) \ + VPSLLD $(n), r, tmp1; \ + VPSRLD $(32-n), r, r; \ + VPOR tmp1, r, r + +// d = r <<< n +#define VPROLD2(r, d, n) \ + VPSLLD $(n), r, tmp1; \ + VPSRLD $(32-n), r, d; \ + VPOR tmp1, d, d + +#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \ + VPROLD2(a, X13, 12); \ // a <<< 12 + VPADDD (index*16)(AX), X13, X12; \ + VPADDD e, X12, X12; \ + VPROLD(X12, 7); \ // SS1 + VPXOR X12, X13, X13; \ // SS2 + ; \ + VPXOR a, b, X14; \ + VPXOR c, X14, X14; \ // (a XOR b XOR c) + VPADDD d, X14, X14; \ // (a XOR b XOR c) + d + avxLoad4Words(X10, index); \ + avxLoad4Words(X11, index+4); \ + VPXOR X10, X11, X11; \ //Wt XOR Wt+4 + VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4 + VPADDD X14, X13, X13; \ // TT1 + VPADDD h, X10, X10; \ // Wt + h + VPADDD X12, X10, X10; \ // Wt + h + SS1 + VPXOR e, f, X11; \ + VPXOR g, X11, X11; \ // (e XOR f XOR g) + VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1 + ; \ // copy result + VPROLD(b, 9); \ + VMOVDQU X13, h; \ + VPROLD(f, 19); \ + VPROLD2(X10, 
X13, 9); \ // tt2 <<< 9 + VPXOR X10, X13, X13; \ // tt2 XOR ROTL(9, tt2) + VPROLD(X10, 17); \ // tt2 <<< 17 + VPXOR X10, X13, d + +#define AVX_MESSAGE_SCHEDULE(index) \ + avxLoad4Words(X10, index+1); \ // Wj-3 + VPROLD(X10, 15); \ + VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16 + VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9 + ; \ // P1 + VPROLD2(X10, X11, 15); \ + VPXOR X11, X10, X10; \ + VPROLD(X11, 8); \ + VPXOR X11, X10, X10; \ // P1 + avxLoad4Words(X11, index-9); \ // Wj-13 + VPROLD(X11, 7); \ + VPXOR X11, X10, X10; \ + VPXOR (128+(index-2)*16)(BX), X10, X11; \ + avxStore4Words(X11, index+4) + +#define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \ + AVX_MESSAGE_SCHEDULE(index); \ + AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) + +#define AVX_ROUND_16_63(index, cIndex, a, b, c, d, e, f, g, h) \ + AVX_MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it + VPROLD2(a, X13, 12); \ // a <<< 12 + VPADDD (cIndex*16)(AX), X13, X12; \ + VPADDD e, X12, X12; \ + VPROLD(X12, 7); \ // SS1 + VPXOR X12, X13, X13; \ // SS2 + ; \ + VPOR a, b, X14; \ + VPAND a, b, X10; \ + VPAND c, X14, X14; \ + VPOR X10, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + avxLoad4Words(X10, index); \ + VPXOR X10, X11, X11; \ //Wt XOR Wt+4 + VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4 + VPADDD X14, X13, X13; \ // TT1 + ; \ + VPADDD h, X10, X10; \ // Wt + h + VPADDD X12, X10, X10; \ // Wt + h + SS1 + VPXOR f, g, X11; \ + VPAND e, X11, X11; \ + VPXOR g, X11, X11; \ // (f XOR g) AND e XOR g + VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1 + ; \ // copy result + VPROLD(b, 9); \ + VMOVDQU X13, h; \ + VPROLD(f, 19); \ + VPROLD2(X10, X13, 9); \ // tt2 <<< 9 + VPXOR X10, X13, X13; \ // tt2 XOR ROTL(9, tt2) + VPROLD(X10, 17); \ // tt2 <<< 17 + VPXOR X10, X13, d + +// blockMultBy4(dig **[8]uint32, p *[]byte, buffer *byte, blocks int) +TEXT 
·blockMultBy4(SB),NOSPLIT,$0 + MOVQ dig+0(FP), DI + MOVQ p+8(FP), SI + MOVQ buffer+16(FP), BX + MOVQ blocks+24(FP), DX + + CMPB ·useAVX(SB), $1 + JE avx + + // load state: digest 0 -> a (words 0-3) and e (words 4-7), digest 1 -> b/f, digest 2 -> c/g, digest 3 -> d/h + MOVQ (DI), R8 + MOVOU (0*16)(R8), a + MOVOU (1*16)(R8), e + MOVQ 8(DI), R8 + MOVOU (0*16)(R8), b + MOVOU (1*16)(R8), f + MOVQ 16(DI), R8 + MOVOU (0*16)(R8), c + MOVOU (1*16)(R8), g + MOVQ 24(DI), R8 + MOVOU (0*16)(R8), d + MOVOU (1*16)(R8), h + + // transpose state (one 4x4 word matrix per call); afterwards each of a..h should hold one state word for all 4 digests + SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) + SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2) + + // store state to temporary buffer (storeState is defined outside this view; presumably it fills 0..127(BX), which xorm reads back after the rounds) + storeState + + MOVQ $T256_4<>(SB), AX // AX = address of T256_4 (table defined outside this view) + MOVQ (SI), R8 // R8..R11 = current read position in each of the 4 inputs + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + +loop: + // load message block: each iteration consumes 64 bytes from every input (pointers advance by 64 at the end of the loop) + prepareFirst16Words(0) + prepareFirst16Words(1) + prepareFirst16Words(2) + prepareFirst16Words(3) + + ROUND_00_11(0, a, b, c, d, e, f, g, h) + ROUND_00_11(1, h, a, b, c, d, e, f, g) + ROUND_00_11(2, g, h, a, b, c, d, e, f) + ROUND_00_11(3, f, g, h, a, b, c, d, e) + ROUND_00_11(4, e, f, g, h, a, b, c, d) + ROUND_00_11(5, d, e, f, g, h, a, b, c) + ROUND_00_11(6, c, d, e, f, g, h, a, b) + ROUND_00_11(7, b, c, d, e, f, g, h, a) + ROUND_00_11(8, a, b, c, d, e, f, g, h) + ROUND_00_11(9, h, a, b, c, d, e, f, g) + ROUND_00_11(10, g, h, a, b, c, d, e, f) + ROUND_00_11(11, f, g, h, a, b, c, d, e) + + ROUND_12_15(12, e, f, g, h, a, b, c, d) + ROUND_12_15(13, d, e, f, g, h, a, b, c) + ROUND_12_15(14, c, d, e, f, g, h, a, b) + ROUND_12_15(15, b, c, d, e, f, g, h, a) + + ROUND_16_63(16, 16, a, b, c, d, e, f, g, h) + ROUND_16_63(17, 17, h, a, b, c, d, e, f, g) + ROUND_16_63(18, 18, g, h, a, b, c, d, e, f) + ROUND_16_63(19, 19, f, g, h, a, b, c, d, e) + ROUND_16_63(20, 20, e, f, g, h, a, b, c, d) + ROUND_16_63(21, 21, d, e, f, g, h, a, b, c) + ROUND_16_63(22, 22, c, d, e, f, g, h, a, b) + ROUND_16_63(23, 23, b, c, d, e, f, g, h, a) + ROUND_16_63(24, 24, a, b, c, d, e, f, g, h) + ROUND_16_63(25, 25, h, a, b, c, d, e, f, g) + ROUND_16_63(26, 26, g, h, a, b, c, d, e, f) +
ROUND_16_63(27, 27, f, g, h, a, b, c, d, e) + ROUND_16_63(28, 28, e, f, g, h, a, b, c, d) + ROUND_16_63(29, 29, d, e, f, g, h, a, b, c) + ROUND_16_63(30, 30, c, d, e, f, g, h, a, b) + ROUND_16_63(31, 31, b, c, d, e, f, g, h, a) + ROUND_16_63(32, 32, a, b, c, d, e, f, g, h) + ROUND_16_63(33, 33, h, a, b, c, d, e, f, g) + ROUND_16_63(34, 34, g, h, a, b, c, d, e, f) + ROUND_16_63(35, 35, f, g, h, a, b, c, d, e) + ROUND_16_63(36, 36, e, f, g, h, a, b, c, d) + ROUND_16_63(37, 37, d, e, f, g, h, a, b, c) + ROUND_16_63(38, 38, c, d, e, f, g, h, a, b) + ROUND_16_63(39, 39, b, c, d, e, f, g, h, a) + ROUND_16_63(40, 40, a, b, c, d, e, f, g, h) + ROUND_16_63(41, 41, h, a, b, c, d, e, f, g) + ROUND_16_63(42, 42, g, h, a, b, c, d, e, f) + ROUND_16_63(43, 43, f, g, h, a, b, c, d, e) + ROUND_16_63(44, 44, e, f, g, h, a, b, c, d) + ROUND_16_63(45, 45, d, e, f, g, h, a, b, c) + ROUND_16_63(46, 46, c, d, e, f, g, h, a, b) + ROUND_16_63(47, 47, b, c, d, e, f, g, h, a) + ROUND_16_63(48, 16, a, b, c, d, e, f, g, h) + ROUND_16_63(49, 17, h, a, b, c, d, e, f, g) + ROUND_16_63(50, 18, g, h, a, b, c, d, e, f) + ROUND_16_63(51, 19, f, g, h, a, b, c, d, e) + ROUND_16_63(52, 20, e, f, g, h, a, b, c, d) + ROUND_16_63(53, 21, d, e, f, g, h, a, b, c) + ROUND_16_63(54, 22, c, d, e, f, g, h, a, b) + ROUND_16_63(55, 23, b, c, d, e, f, g, h, a) + ROUND_16_63(56, 24, a, b, c, d, e, f, g, h) + ROUND_16_63(57, 25, h, a, b, c, d, e, f, g) + ROUND_16_63(58, 26, g, h, a, b, c, d, e, f) + ROUND_16_63(59, 27, f, g, h, a, b, c, d, e) + ROUND_16_63(60, 28, e, f, g, h, a, b, c, d) + ROUND_16_63(61, 29, d, e, f, g, h, a, b, c) + ROUND_16_63(62, 30, c, d, e, f, g, h, a, b) + ROUND_16_63(63, 31, b, c, d, e, f, g, h, a) + + xorm( 0(BX), a) + xorm( 16(BX), b) + xorm( 32(BX), c) + xorm( 48(BX), d) + xorm( 64(BX), e) + xorm( 80(BX), f) + xorm( 96(BX), g) + xorm(112(BX), h) + + LEAQ 64(R8), R8 + LEAQ 64(R9), R9 + LEAQ 64(R10), R10 + LEAQ 64(R11), R11 + + DECQ DX + JNZ loop + + // transpose state back so that a/e, b/f, c/g and d/h each hold one digest again, matching the stores below +
SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) + SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2) + + MOVQ (DI), R8 + MOVOU a, (0*16)(R8) + MOVOU e, (1*16)(R8) + MOVQ 8(DI), R8 + MOVOU b, (0*16)(R8) + MOVOU f, (1*16)(R8) + MOVQ 16(DI), R8 + MOVOU c, (0*16)(R8) + MOVOU g, (1*16)(R8) + MOVQ 24(DI), R8 + MOVOU d, (0*16)(R8) + MOVOU h, (1*16)(R8) + + RET + +avx: + // load state: digest 0 -> a (words 0-3) and e (words 4-7), digest 1 -> b/f, digest 2 -> c/g, digest 3 -> d/h + MOVQ (DI), R8 + VMOVDQU (0*16)(R8), a + VMOVDQU (1*16)(R8), e + MOVQ 8(DI), R8 + VMOVDQU (0*16)(R8), b + VMOVDQU (1*16)(R8), f + MOVQ 16(DI), R8 + VMOVDQU (0*16)(R8), c + VMOVDQU (1*16)(R8), g + MOVQ 24(DI), R8 + VMOVDQU (0*16)(R8), d + VMOVDQU (1*16)(R8), h + + // transpose state so that each of a..h holds one state word for all 4 digests + TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) + TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2) + + VMOVDQU a, (BX) // save the transposed state in bytes 0..127 of the scratch buffer; avxXorm combines it with the round output after each block + VMOVDQU b, 16(BX) + VMOVDQU c, 32(BX) + VMOVDQU d, 48(BX) + VMOVDQU e, 64(BX) + VMOVDQU f, 80(BX) + VMOVDQU g, 96(BX) + VMOVDQU h, 112(BX) + + MOVQ $T256_4<>(SB), AX // AX = address of T256_4, indexed by the round macros as (index*16)(AX) + MOVQ (SI), R8 // R8..R11 = current read position in each of the 4 inputs + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + +avxLoop: + // load message block: 64 bytes from every input, stored transposed as word slots 0..15 + avxPrepareFirst16Words(0) + avxPrepareFirst16Words(1) + avxPrepareFirst16Words(2) + avxPrepareFirst16Words(3) + + AVX_ROUND_00_11(0, a, b, c, d, e, f, g, h) + AVX_ROUND_00_11(1, h, a, b, c, d, e, f, g) + AVX_ROUND_00_11(2, g, h, a, b, c, d, e, f) + AVX_ROUND_00_11(3, f, g, h, a, b, c, d, e) + AVX_ROUND_00_11(4, e, f, g, h, a, b, c, d) + AVX_ROUND_00_11(5, d, e, f, g, h, a, b, c) + AVX_ROUND_00_11(6, c, d, e, f, g, h, a, b) + AVX_ROUND_00_11(7, b, c, d, e, f, g, h, a) + AVX_ROUND_00_11(8, a, b, c, d, e, f, g, h) + AVX_ROUND_00_11(9, h, a, b, c, d, e, f, g) + AVX_ROUND_00_11(10, g, h, a, b, c, d, e, f) + AVX_ROUND_00_11(11, f, g, h, a, b, c, d, e) + + AVX_ROUND_12_15(12, e, f, g, h, a, b, c, d) + AVX_ROUND_12_15(13, d, e, f, g, h, a, b, c) + AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b) + AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a) + + AVX_ROUND_16_63(16, 16, a, b, c, d, e, f, g, h) + AVX_ROUND_16_63(17, 17, h, a, b, c, d, e, f, g) +
AVX_ROUND_16_63(18, 18, g, h, a, b, c, d, e, f) + AVX_ROUND_16_63(19, 19, f, g, h, a, b, c, d, e) + AVX_ROUND_16_63(20, 20, e, f, g, h, a, b, c, d) + AVX_ROUND_16_63(21, 21, d, e, f, g, h, a, b, c) + AVX_ROUND_16_63(22, 22, c, d, e, f, g, h, a, b) + AVX_ROUND_16_63(23, 23, b, c, d, e, f, g, h, a) + AVX_ROUND_16_63(24, 24, a, b, c, d, e, f, g, h) + AVX_ROUND_16_63(25, 25, h, a, b, c, d, e, f, g) + AVX_ROUND_16_63(26, 26, g, h, a, b, c, d, e, f) + AVX_ROUND_16_63(27, 27, f, g, h, a, b, c, d, e) + AVX_ROUND_16_63(28, 28, e, f, g, h, a, b, c, d) + AVX_ROUND_16_63(29, 29, d, e, f, g, h, a, b, c) + AVX_ROUND_16_63(30, 30, c, d, e, f, g, h, a, b) + AVX_ROUND_16_63(31, 31, b, c, d, e, f, g, h, a) + AVX_ROUND_16_63(32, 32, a, b, c, d, e, f, g, h) + AVX_ROUND_16_63(33, 33, h, a, b, c, d, e, f, g) + AVX_ROUND_16_63(34, 34, g, h, a, b, c, d, e, f) + AVX_ROUND_16_63(35, 35, f, g, h, a, b, c, d, e) + AVX_ROUND_16_63(36, 36, e, f, g, h, a, b, c, d) + AVX_ROUND_16_63(37, 37, d, e, f, g, h, a, b, c) + AVX_ROUND_16_63(38, 38, c, d, e, f, g, h, a, b) + AVX_ROUND_16_63(39, 39, b, c, d, e, f, g, h, a) + AVX_ROUND_16_63(40, 40, a, b, c, d, e, f, g, h) + AVX_ROUND_16_63(41, 41, h, a, b, c, d, e, f, g) + AVX_ROUND_16_63(42, 42, g, h, a, b, c, d, e, f) + AVX_ROUND_16_63(43, 43, f, g, h, a, b, c, d, e) + AVX_ROUND_16_63(44, 44, e, f, g, h, a, b, c, d) + AVX_ROUND_16_63(45, 45, d, e, f, g, h, a, b, c) + AVX_ROUND_16_63(46, 46, c, d, e, f, g, h, a, b) + AVX_ROUND_16_63(47, 47, b, c, d, e, f, g, h, a) + AVX_ROUND_16_63(48, 16, a, b, c, d, e, f, g, h) + AVX_ROUND_16_63(49, 17, h, a, b, c, d, e, f, g) + AVX_ROUND_16_63(50, 18, g, h, a, b, c, d, e, f) + AVX_ROUND_16_63(51, 19, f, g, h, a, b, c, d, e) + AVX_ROUND_16_63(52, 20, e, f, g, h, a, b, c, d) + AVX_ROUND_16_63(53, 21, d, e, f, g, h, a, b, c) + AVX_ROUND_16_63(54, 22, c, d, e, f, g, h, a, b) + AVX_ROUND_16_63(55, 23, b, c, d, e, f, g, h, a) + AVX_ROUND_16_63(56, 24, a, b, c, d, e, f, g, h) + AVX_ROUND_16_63(57, 25, h, a, b, c, d, e, f, g) +
AVX_ROUND_16_63(58, 26, g, h, a, b, c, d, e, f) + AVX_ROUND_16_63(59, 27, f, g, h, a, b, c, d, e) + AVX_ROUND_16_63(60, 28, e, f, g, h, a, b, c, d) + AVX_ROUND_16_63(61, 29, d, e, f, g, h, a, b, c) + AVX_ROUND_16_63(62, 30, c, d, e, f, g, h, a, b) + AVX_ROUND_16_63(63, 31, b, c, d, e, f, g, h, a) + + avxXorm( 0(BX), a) + avxXorm( 16(BX), b) + avxXorm( 32(BX), c) + avxXorm( 48(BX), d) + avxXorm( 64(BX), e) + avxXorm( 80(BX), f) + avxXorm( 96(BX), g) + avxXorm(112(BX), h) + + LEAQ 64(R8), R8 + LEAQ 64(R9), R9 + LEAQ 64(R10), R10 + LEAQ 64(R11), R11 + + DECQ DX + JNZ avxLoop + + // transpose state back so that a/e, b/f, c/g and d/h each hold one digest again + TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2) + TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2) + + MOVQ (DI), R8 // write the 8 state words back into each of the 4 digests in place + VMOVDQU a, (0*16)(R8) + VMOVDQU e, (1*16)(R8) + MOVQ 8(DI), R8 + VMOVDQU b, (0*16)(R8) + VMOVDQU f, (1*16)(R8) + MOVQ 16(DI), R8 + VMOVDQU c, (0*16)(R8) + VMOVDQU g, (1*16)(R8) + MOVQ 24(DI), R8 + VMOVDQU d, (0*16)(R8) + VMOVDQU h, (1*16)(R8) + + RET diff --git a/sm3/sm3blocks_test.go b/sm3/sm3blocks_test.go new file mode 100644 index 0000000..46fbe19 --- /dev/null +++ b/sm3/sm3blocks_test.go @@ -0,0 +1,117 @@ +//go:build (amd64 || arm64) && !purego + +package sm3 + +import ( + "fmt" + "testing" +) + +func initState4() [4]*[8]uint32 { // returns pointers to four independent copies of the state left by digest.Reset + d := new(digest) + d.Reset() + var dig1 = d.h + var dig2 = d.h + var dig3 = d.h + return [4]*[8]uint32{&d.h, &dig1, &dig2, &dig3} +} + +func createOneBlockBy4() [4]*byte { // returns pointers to four separate copies of one hand-padded 64-byte block holding "abc" + var p1 [64]byte + p1[0] = 0x61 + p1[1] = 0x62 + p1[2] = 0x63 + p1[3] = 0x80 // first padding byte, directly after the 3 message bytes + p1[63] = 0x18 // message length in bits (3 bytes = 24) in the last byte of the block + var p2 = p1 + var p3 = p1 + var p4 = p1 + return [4]*byte{&p1[0], &p2[0], &p3[0], &p4[0]} +} + +func createTwoBlocksBy4() [4]*byte { // returns pointers to four separate copies of two hand-padded blocks: 64 bytes of repeated "abcd" followed by a padding-only block + var p1 [128]byte + p1[0] = 0x61 + p1[1] = 0x62 + p1[2] = 0x63 + p1[3] = 0x64 + copy(p1[4:], p1[:4]) + copy(p1[8:], p1[:8]) + copy(p1[16:], p1[:16]) + copy(p1[32:], p1[:32]) // the doubling copies have now filled p1[0:64] with "abcd" repeated 16 times + p1[64] = 0x80 // first padding byte, at the start of the second block + p1[126] = 0x02 // message length in bits (64 bytes = 512 = 0x0200) in the last two bytes + var p2 = p1 + var p3 = p1 + var p4 = p1 + return [4]*byte{&p1[0], &p2[0], &p3[0], &p4[0]} +} + +func TestBlockMultBy4(t *testing.T) { // runs blockMultBy4 on a one-block and a two-block input and checks all 4 resulting states against the expected digest words +
digs := initState4() + p := createOneBlockBy4() + buffer := make([]byte, 1216) + blockMultBy4(&digs[0], &p[0], &buffer[0], 1) + expected := "[66c7f0f4 62eeedd9 d1f2d46b dc10e4e2 4167c487 5cf2f7a2 297da02b 8f4ba8e0]" + s := fmt.Sprintf("%x", digs[0][:]) + if s != expected { + t.Errorf("digs[0] got %s", s) + } + s = fmt.Sprintf("%x", digs[1][:]) + if s != expected { + t.Errorf("digs[1] got %s", s) + } + s = fmt.Sprintf("%x", digs[2][:]) + if s != expected { + t.Errorf("digs[2] got %s", s) + } + s = fmt.Sprintf("%x", digs[3][:]) + if s != expected { + t.Errorf("digs[3] got %s", s) + } + + digs = initState4() + p = createTwoBlocksBy4() + blockMultBy4(&digs[0], &p[0], &buffer[0], 2) + expected = "[debe9ff9 2275b8a1 38604889 c18e5a4d 6fdb70e5 387e5765 293dcba3 9c0c5732]" + s = fmt.Sprintf("%x", digs[0][:]) + if s != expected { + t.Errorf("digs[0] got %s", s) + } + s = fmt.Sprintf("%x", digs[1][:]) + if s != expected { + t.Errorf("digs[1] got %s", s) + } + s = fmt.Sprintf("%x", digs[2][:]) + if s != expected { + t.Errorf("digs[2] got %s", s) + } + s = fmt.Sprintf("%x", digs[3][:]) + if s != expected { + t.Errorf("digs[3] got %s", s) + } +} + +func BenchmarkOneBlockBy4(b *testing.B) { + digs := initState4() + p := createOneBlockBy4() + buffer := make([]byte, 1216) + b.SetBytes(64 * 4) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + blockMultBy4(&digs[0], &p[0], &buffer[0], 1) + } +} + +func BenchmarkTwoBlocksBy4(b *testing.B) { + digs := initState4() + p := createTwoBlocksBy4() + buffer := make([]byte, 1216) + b.SetBytes(64 * 2 * 4) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + blockMultBy4(&digs[0], &p[0], &buffer[0], 2) + } +}