diff --git a/sm3/kdf_amd64.go b/sm3/kdf_amd64.go index 08af870..33d2a08 100644 --- a/sm3/kdf_amd64.go +++ b/sm3/kdf_amd64.go @@ -1,3 +1,7 @@ +// Copyright 2024 Sun Yimin. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + //go:build !purego package sm3 diff --git a/sm3/kdf_arm64.go b/sm3/kdf_arm64.go index 83c51c1..dfd2349 100644 --- a/sm3/kdf_arm64.go +++ b/sm3/kdf_arm64.go @@ -1,3 +1,7 @@ +// Copyright 2024 Sun Yimin. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + //go:build !purego package sm3 diff --git a/sm3/kdf_generic.go b/sm3/kdf_generic.go index 3f07bd5..6bad4b2 100644 --- a/sm3/kdf_generic.go +++ b/sm3/kdf_generic.go @@ -1,4 +1,8 @@ -//go:build purego || !(amd64 || arm64) +// Copyright 2024 Sun Yimin. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +//go:build purego || !(amd64 || arm64 || s390x) package sm3 diff --git a/sm3/kdf_mult4_asm.go b/sm3/kdf_mult4_asm.go index 1ede296..9edc15c 100644 --- a/sm3/kdf_mult4_asm.go +++ b/sm3/kdf_mult4_asm.go @@ -1,4 +1,4 @@ -//go:build (amd64 || arm64) && !purego +//go:build (amd64 || arm64 || s390x) && !purego package sm3 diff --git a/sm3/kdf_mult8_amd64.go b/sm3/kdf_mult8_amd64.go index c91e7ac..1e2fd49 100644 --- a/sm3/kdf_mult8_amd64.go +++ b/sm3/kdf_mult8_amd64.go @@ -1,3 +1,7 @@ +// Copyright 2024 Sun Yimin. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+
 //go:build !purego
 
 package sm3
diff --git a/sm3/sm3blocks_s390x.go b/sm3/kdf_s390x.go
similarity index 54%
rename from sm3/sm3blocks_s390x.go
rename to sm3/kdf_s390x.go
index 1a61e36..088e51c 100644
--- a/sm3/sm3blocks_s390x.go
+++ b/sm3/kdf_s390x.go
@@ -6,8 +6,12 @@
 
 package sm3
 
-//go:noescape
-func transposeMatrix(dig **[8]uint32)
+// kdf routes the request to kdfBy4 when limit is 4 or more and to
+// kdfGeneric for anything smaller.
+func kdf(baseMD *digest, keyLen int, limit int) []byte {
+	if limit >= 4 {
+		return kdfBy4(baseMD, keyLen, limit)
+	}
 
-//go:noescape
-func copyResultsBy4(dig *uint32, p *byte)
+	return kdfGeneric(baseMD, keyLen, limit)
+}
diff --git a/sm3/sm3blocks_arm64.s b/sm3/sm3blocks_arm64.s
index 1babae3..9647598 100644
--- a/sm3/sm3blocks_arm64.s
+++ b/sm3/sm3blocks_arm64.s
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 #include "textflag.h"
diff --git a/sm3/sm3blocks_avx2_amd64.s b/sm3/sm3blocks_avx2_amd64.s
index a6f3342..1ccb7b7 100644
--- a/sm3/sm3blocks_avx2_amd64.s
+++ b/sm3/sm3blocks_avx2_amd64.s
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 #include "textflag.h"
diff --git a/sm3/sm3blocks_avx2_test.go b/sm3/sm3blocks_avx2_test.go
index 0c6676d..1e52d17 100644
--- a/sm3/sm3blocks_avx2_test.go
+++ b/sm3/sm3blocks_avx2_test.go
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build amd64 && !purego
 
 package sm3
diff --git a/sm3/sm3blocks_s390x.s b/sm3/sm3blocks_s390x.s
index e560774..9f459a8 100644
--- a/sm3/sm3blocks_s390x.s
+++ b/sm3/sm3blocks_s390x.s
@@ -17,6 +17,34 @@ DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f
 DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f
 GLOBL mask<>(SB), 8, $64
 
+// Register allocation for blockMultBy4: a-h carry the working state, M0-M3 the
+// VPERM masks, TMP0-TMP4 are scratch and aSave-hSave keep the state at block entry.
+#define a V0
+#define e V1
+#define b V2
+#define f V3
+#define c V4
+#define g V5
+#define d V6
+#define h V7
+#define M0 V8
+#define M1 V9
+#define M2 V10
+#define M3 V11
+#define TMP0 V12
+#define TMP1 V13
+#define TMP2 V14
+#define TMP3 V15
+#define TMP4 V16
+#define aSave V24
+#define bSave V25
+#define cSave V26
+#define dSave V27
+#define eSave V28
+#define fSave V29
+#define gSave V30
+#define hSave V31
+
 #define TRANSPOSE_MATRIX(T0, T1, T2, T3, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) \
 	VPERM T0, T1, M0, TMP0; \
 	VPERM T2, T3, M0, TMP1; \
@@ -27,6 +55,127 @@ GLOBL mask<>(SB), 8, $64
 	VPERM TMP2, TMP3, M2, T2; \
 	VPERM TMP2, TMP3, M3, T3
 
+// r = s <<< n
+#define PROLD(s, r, n) \
+	VERLLF $n, s, r
+
+// Loads schedule word i into W. Every word slot is 16 bytes wide (one 32-bit
+// word from each of the four messages) and the slots start at statePtr.
+#define loadWordByIndex(W, i) \
+	VL (16*(i))(statePtr), W
+
+// Reads 16 bytes from each of the four messages, transposes them and stores the
+// resulting four words at wordPtr; one word is 16 bytes.
+#define prepare4Words \
+	VL 0(srcPtr1)(srcPtrPtr*1), V16; \
+	VL 0(srcPtr2)(srcPtrPtr*1), V17; \
+	VL 0(srcPtr3)(srcPtrPtr*1), V18; \
+	VL 0(srcPtr4)(srcPtrPtr*1), V19; \
+	TRANSPOSE_MATRIX(V16, V17, V18, V19, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3); \
+	VSTM V16, V19, (wordPtr); \
+	LAY 16(srcPtrPtr), srcPtrPtr; \
+	ADD $64, wordPtr
+
+// NOTE(review): VREPIF replicates a 16-bit signed immediate - confirm how the 32-bit
+// round constants are meant to be loaded (T0, T1, ... are defined outside this patch).
+#define LOAD_T(const, T) \
+	VREPIF $const, T
+
+// One of rounds 0-11 for four lanes: both boolean functions are three-way XORs and
+// Wt / Wt+4 are read from the schedule buffer. The results land in h, b, f and d.
+#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
+	PROLD(a, TMP0, 12) \
+	VLR TMP0, TMP1 \
+	LOAD_T(const, TMP2) \
+	VAF TMP2, TMP1, TMP1 \
+	VAF e, TMP1, TMP1 \
+	PROLD(TMP1, TMP2, 7) \ // TMP2 = SS1
+	VX TMP2, TMP0, TMP0 \ // TMP0 = SS2 = SS1 XOR (a <<< 12)
+	VX a, b, TMP1 \
+	VX c, TMP1, TMP1 \
+	VAF TMP1, d, TMP1 \ // TMP1 = (a XOR b XOR c) + d
+	loadWordByIndex(TMP3, index) \
+	loadWordByIndex(TMP4, index+4) \
+	VX TMP3, TMP4, TMP4 \
+	VAF TMP4, TMP1, TMP1 \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
+	VAF TMP1, TMP0, TMP1 \ // TMP1 = TT1
+	VAF h, TMP3, TMP3 \
+	VAF TMP3, TMP2, TMP3 \ // Wt + h + SS1
+	VX e, f, TMP4 \
+	VX g, TMP4, TMP4 \
+	VAF TMP4, TMP3, TMP3 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
+	VLR b, TMP4 \
+	PROLD(TMP4, b, 9) \ // b = b <<< 9
+	VLR TMP1, h \ // h = TT1
+	VLR f, TMP4 \
+	PROLD(TMP4, f, 19) \ // f = f <<< 19
+	PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9
+	PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17
+	VX TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
+	VX TMP4, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
+
+// Derives schedule word index+4 from words index-12, index-9, index-5, index-2 and
+// index+1, leaves it in TMP4 and appends it to the buffer at wordPtr.
+#define MESSAGE_SCHEDULE(index) \
+	loadWordByIndex(TMP0, index+1) \ // Wj-3
+	PROLD(TMP0, TMP1, 15) \
+	loadWordByIndex(TMP0, index-12) \ // Wj-16
+	VX TMP0, TMP1, TMP0 \
+	loadWordByIndex(TMP1, index-5) \ // Wj-9
+	VX TMP0, TMP1, TMP0 \
+	PROLD(TMP0, TMP1, 15) \
+	PROLD(TMP1, TMP2, 8) \
+	VX TMP1, TMP0, TMP0 \
+	VX TMP2, TMP0, TMP0 \ // P1
+	loadWordByIndex(TMP1, index-9) \ // Wj-13
+	PROLD(TMP1, TMP2, 7) \
+	VX TMP2, TMP0, TMP0 \
+	loadWordByIndex(TMP1, index-2) \ // Wj-6
+	VX TMP1, TMP0, TMP4 \ // TMP4 = Wj, i.e. word index+4
+	VST TMP4, (wordPtr) \
+	ADD $16, wordPtr \
+
+// Rounds 12-15: extend the schedule by one word, then run a ROUND_00_11 round.
+#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index) \
+	ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
+
+// One of rounds 16-63: FF is the majority function, GG selects f or g by e, and Wt+4
+// is taken from TMP4 as left by MESSAGE_SCHEDULE instead of being reloaded.
+#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index) \ // TMP4 is Wt+4 now, do not clobber it before the XOR below
+	PROLD(a, TMP0, 12) \
+	VLR TMP0, TMP1 \
+	LOAD_T(const, TMP2) \
+	VAF TMP2, TMP0, TMP0 \
+	VAF e, TMP0, TMP0 \
+	PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
+	VX TMP2, TMP1, TMP0 \ // TMP0 = SS2
+	VO a, b, TMP3 \
+	VN a, b, TMP1 \
+	VN c, TMP3, TMP3 \
+	VO TMP1, TMP3, TMP1 \ // (a AND b) OR (a AND c) OR (b AND c)
+	VAF TMP1, d, TMP1 \ // (a AND b) OR (a AND c) OR (b AND c) + d
+	loadWordByIndex(TMP3, index) \ // Wj
+	VX TMP3, TMP4, TMP4 \ // Wj XOR Wj+4
+	VAF TMP1, TMP4, TMP1 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
+	VAF TMP1, TMP0, TMP1 \ // TT1
+	VAF h, TMP3, TMP3 \ // Wt + h
+	VAF TMP2, TMP3, TMP3 \ // Wt + h + SS1
+	VX f, g, TMP4 \
+	VN TMP4, e, TMP4 \
+	VX g, TMP4, TMP4 \ // (f XOR g) AND e XOR g
+	VAF TMP3, TMP4, TMP3 \ // TT2
+	VLR b, TMP4 \
+	PROLD(TMP4, b, 9) \ // b = b <<< 9
+	VLR TMP1, h \ // h = TT1
+	VLR f, TMP4 \
+	PROLD(TMP4, f, 19) \ // f = f <<< 19
+	PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9
+	PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17
+	VX TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
+	VX TMP4, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
+
 // transposeMatrix(dig **[8]uint32)
 TEXT ·transposeMatrix(SB),NOSPLIT,$0
 	MOVD dig+0(FP), R1
@@ -69,3 +218,161 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0
 	VSTM V0, V7, (dstPtr)
 
 	RET
+
+
+// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
+//
+// Runs the SM3 compression rounds on four messages at once.
+//   dig:    array of four pointers, each to an 8-word state that is read and written back
+//   p:      array of four pointers to the message data, consumed 64 bytes per block
+//   buffer: scratch area for the interleaved message schedule (16 bytes per word)
+//   blocks: number of 64-byte blocks taken from every message
+TEXT ·blockMultBy4(SB), NOSPLIT, $0
+#define digPtr R3 // not R0: register 0 used as a base register means "no base" on s390x
+#define srcPtrPtr R1
+#define statePtr R2
+#define blockCount R5
+#define srcPtr1 R6
+#define srcPtr2 R7
+#define srcPtr3 R8
+#define srcPtr4 R9
+#define wordPtr R10
+	MOVD dig+0(FP), digPtr
+	MOVD p+8(FP), srcPtrPtr
+	MOVD buffer+16(FP), statePtr
+	MOVD blocks+24(FP), blockCount
+
+	// load state
+	MOVD 0(digPtr), R4
+	VLM (R4), a, e
+	MOVD 8(digPtr), R4
+	VLM (R4), b, f
+	MOVD 16(digPtr), R4
+	VLM (R4), c, g
+	MOVD 24(digPtr), R4
+	VLM (R4), d, h
+
+	MOVD $mask<>+0x00(SB), R4
+	VLM (R4), M0, M3
+
+	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+
+	MOVD (srcPtrPtr), srcPtr1
+	MOVD 8(srcPtrPtr), srcPtr2
+	MOVD 16(srcPtrPtr), srcPtr3
+	MOVD 24(srcPtrPtr), srcPtr4
+	MOVD $0, srcPtrPtr // from here on srcPtrPtr is the byte offset into all four messages
+
+loop:
+	// save state
+	VLR a, aSave
+	VLR b, bSave
+	VLR c, cSave
+	VLR d, dSave
+	VLR e, eSave
+	VLR f, fSave
+	VLR g, gSave
+	VLR h, hSave
+
+	// reset wordPtr
+	MOVD statePtr, wordPtr
+
+	// load message block
+	prepare4Words
+	prepare4Words
+	prepare4Words
+	prepare4Words
+
+	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
+	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
+	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
+	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
+	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
+	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
+	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
+	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
+	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
+	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
+	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
+	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)
+
+	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
+	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
+	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
+	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)
+
+	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
+	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
+	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
+	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
+	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
+	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
+	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
+	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
+	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
+	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
+	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
+	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
+	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
+	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
+	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
+	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
+	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
+	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
+
+	VX a, aSave, a
+	VX b, bSave, b
+	VX c, cSave, c
+	VX d, dSave, d
+	VX e, eSave, e
+	VX f, fSave, f
+	VX g, gSave, g
+	VX h, hSave, h
+
+	SUB $1, blockCount
+	CMPBGT blockCount, $0, loop
+
+	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
+
+	MOVD 0(digPtr), R4
+	VSTM a, e, (R4)
+	MOVD 8(digPtr), R4
+	VSTM b, f, (R4)
+	MOVD 16(digPtr), R4
+	VSTM c, g, (R4)
+	MOVD 24(digPtr), R4
+	VSTM d, h, (R4)
+
+	RET
diff --git a/sm3/sm3blocks_s390x_test.go b/sm3/sm3blocks_s390x_test.go
deleted file mode 100644
index 1467ab3..0000000
--- a/sm3/sm3blocks_s390x_test.go
+++ /dev/null
@@ -1,54 +0,0 @@
-//go:build s390x && !purego
-
-package sm3
-
-import (
-	"fmt"
-	"testing"
-)
-
-func TestTransposeMatrix(t *testing.T) {
-	var m [4][8]uint32
-	var k uint32 = 0
-	for i := 0; i < 4; i++ {
-		for j := 0; j < 8; j++ {
-			m[i][j] = k
-			k++
-			fmt.Printf("%04x ", m[i][j])
-		}
-		fmt.Println()
-	}
-	input := [4]*[8]uint32{&m[0], &m[1], &m[2], &m[3]}
-	transposeMatrix(&input[0])
-	fmt.Println()
-	fmt.Println()
-	for i := 0; i < 4; i++ {
-		for j := 0; j < 8; j++ {
-			fmt.Printf("%04x ", m[i][j])
-		}
-		fmt.Println()
-	}
-}
-
-func TestCopyResultsBy4(t *testing.T) {
-	var m [4][8]uint32
-	var k uint32 = 0
-	for i := 0; i < 4; i++ {
-		for j := 0; j < 8; j++ {
-			m[i][j] = k << 24
-			k++
-			fmt.Printf("%04x ", m[i][j])
-		}
-		fmt.Println()
-	}
-	var p [128]byte
-	copyResultsBy4(&m[0][0], &p[0])
-	fmt.Println()
-	fmt.Println()
-	for i := 0; i < 128; i++ {
-		fmt.Printf("%02x ", p[i])
-		if i%16 == 15 {
-			fmt.Println()
-		}
-	}
-}
diff --git a/sm3/sm3blocks_simd_amd64.s b/sm3/sm3blocks_simd_amd64.s
index dd8339f..5090353 100644
--- a/sm3/sm3blocks_simd_amd64.s
+++ b/sm3/sm3blocks_simd_amd64.s
@@ -1,3 +1,7 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
 //go:build !purego
 
 #include "textflag.h"
diff --git a/sm3/sm3blocks_test.go b/sm3/sm3blocks_test.go
index 4e97972..c1d8ee8 100644
--- a/sm3/sm3blocks_test.go
+++ b/sm3/sm3blocks_test.go
@@ -1,4 +1,8 @@
-//go:build (amd64 || arm64) && !purego
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build (amd64 || arm64 || s390x) && !purego
 
 package sm3
 