kdf-sm3: mult by 4 #221

This commit is contained in:
Sun Yimin 2024-05-21 09:58:38 +08:00 committed by GitHub
parent 9ef3fdc7d5
commit 97d28520b1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 1297 additions and 4 deletions

11
sm3/kdf_amd64.go Normal file
View File

@ -0,0 +1,11 @@
//go:build !purego
package sm3
// kdf picks the key-derivation implementation for amd64.
//
// limit is the number of digest outputs that have to be generated. With four
// or more of them the 4-way parallel implementation is used; anything shorter
// goes through the generic one-digest-at-a-time path.
func kdf(baseMD *digest, keyLen int, limit int) []byte {
	if limit >= 4 {
		return kdfBy4(baseMD, keyLen, limit)
	}
	return kdfGeneric(baseMD, keyLen, limit)
}

10
sm3/kdf_arm64.go Normal file
View File

@ -0,0 +1,10 @@
//go:build !purego
package sm3
// kdf picks the key-derivation implementation for arm64.
//
// The 4-way parallel implementation is only chosen when useSM3NI is false and
// at least four digest outputs are required; in every other case the generic
// path is used (presumably because the SM3 instructions already make the
// single-stream path fast enough; confirm with benchmarks).
func kdf(baseMD *digest, keyLen int, limit int) []byte {
	if !useSM3NI && limit >= 4 {
		return kdfBy4(baseMD, keyLen, limit)
	}
	return kdfGeneric(baseMD, keyLen, limit)
}

7
sm3/kdf_generic.go Normal file
View File

@ -0,0 +1,7 @@
//go:build purego || !(amd64 || arm64)
package sm3
// kdf is the key-derivation entry point for builds without the assembly
// implementation (purego, or any architecture other than amd64/arm64). It
// always delegates to the generic implementation.
func kdf(baseMD *digest, keyLen int, limit int) []byte {
	key := kdfGeneric(baseMD, keyLen, limit)
	return key
}

97
sm3/kdf_mult_asm.go Normal file
View File

@ -0,0 +1,97 @@
//go:build (amd64 || arm64) && !purego
package sm3
import "encoding/binary"
// prepareData lays out the final, padded part of one KDF message in p.
//
// The layout written to p is:
//
//	baseMD.x[:baseMD.nx] | ct (4 bytes, big endian) | 0x80 | zeros | len (8 bytes, big endian)
//
// where the 0x80 byte plus the zeros occupy t bytes in total. len is the value
// stored in the trailing length field (the caller passes the message length in
// bits) and t is the padding length computed by the caller; t must not exceed
// 64.
func prepareData(baseMD *digest, p []byte, ct uint32, len, t uint64) {
	// Bytes that baseMD has buffered but not yet compressed.
	pending := baseMD.nx
	copy(p, baseMD.x[:pending])

	// The 32-bit KDF counter directly follows the buffered bytes.
	counter := p[pending:]
	binary.BigEndian.PutUint32(counter, ct)

	// Padding: a single 1 bit, then zeros, then the 64-bit length.
	var pad [64 + 8]byte
	pad[0] = 0x80
	binary.BigEndian.PutUint64(pad[t:t+8], len)
	copy(p[pending+4:], pad[:t+8])
}
// copyResult serializes the eight 32-bit state words of dig into result as
// 32 big-endian bytes. result must be at least 32 bytes long.
func copyResult(result []byte, dig *[8]uint32) {
	for i, word := range dig {
		binary.BigEndian.PutUint32(result[4*i:], word)
	}
}
// preallocSize is the size in bytes of the scratch buffer allocated by kdfBy4.
//
// The buffer starts with the padded message tails of the four lanes
// (4 lanes * at most 2 blocks * 64 bytes = 512 bytes) and is followed by the
// work area handed to blockMultBy4: 8 saved state words plus 68 expanded
// message words, each stored as a 16-byte vector with one 32-bit word per
// lane ((8 + 68) * 4 * 4 = 1216 bytes).
const preallocSize = 4*2*64 + (8+68)*4*4
// kdfBy4 derives keyLen bytes of key material, computing four counter-mode
// digests at a time with blockMultBy4.
//
// baseMD must already have absorbed the shared input; it is not modified.
// limit is the number of 32-byte digest outputs needed to cover keyLen. When
// fewer than four are needed the work is delegated to kdfGeneric. Outputs that
// do not fill a whole group of four are computed one by one at the end.
func kdfBy4(baseMD *digest, keyLen int, limit int) []byte {
	if limit < 4 {
		return kdfGeneric(baseMD, keyLen, limit)
	}

	// Work out the shape of the padded message tail that follows the state
	// already compressed into baseMD.h: buffered bytes, 4 counter bytes,
	// t padding bytes and the 8-byte length.
	var t uint64
	blocks := 1
	msgLen := baseMD.len + 4
	remainlen := msgLen % 64
	if remainlen < 56 {
		t = 56 - remainlen
	} else {
		t = 64 + 56 - remainlen
	}
	// The tail needs a second block when the length field no longer fits
	// (remainlen >= 56) and also when the counter itself crossed a block
	// boundary (remainlen < 4, i.e. 60..63 bytes were buffered).
	if remainlen < 4 || remainlen >= 56 {
		blocks = 2
	}
	msgLen <<= 3 // length in bits, as stored in the padding

	// prepare temporary buffer: four lanes of message data first, then the
	// work area used by blockMultBy4
	tmpStart := 4 * blocks * BlockSize
	buffer := make([]byte, preallocSize)
	tmp := buffer[tmpStart:]

	// prepare processing data
	var data [4]*byte
	var digs [4]*[8]uint32
	var states [4][8]uint32
	for j := 0; j < 4; j++ {
		digs[j] = &states[j]
	}

	var ct uint32 = 1
	// Whole digests are written by the 4-way loop, so the backing array must
	// hold limit full digests even when keyLen is not a multiple of Size; the
	// result is trimmed to keyLen on return.
	outLen := limit * Size
	if outLen < keyLen {
		outLen = keyLen
	}
	k := make([]byte, outLen)
	ret := k
	times := limit / 4
	for i := 0; i < times; i++ {
		for j := 0; j < 4; j++ {
			// prepare states: every lane starts from the shared chaining value
			states[j] = baseMD.h
			// prepare data: same tail for each lane, only the counter differs
			p := buffer[blocks*BlockSize*j:]
			data[j] = &p[0]
			prepareData(baseMD, p, ct, msgLen, t)
			ct++
		}
		blockMultBy4(&digs[0], &data[0], &tmp[0], blocks)
		for j := 0; j < 4; j++ {
			copyResult(ret, digs[j])
			ret = ret[Size:]
		}
	}

	// Remaining (fewer than four) outputs use the ordinary digest.
	remain := limit % 4
	for i := 0; i < remain; i++ {
		binary.BigEndian.PutUint32(tmp[:], ct)
		md := *baseMD
		md.Write(tmp[:4])
		h := md.checkSum()
		copy(ret[i*Size:], h[:])
		ct++
	}
	return k[:keyLen]
}
// blockMultBy4 is implemented in assembly. It runs the SM3 compression
// function over four independent messages at once.
//
//   - dig points to the first of four pointers to 8-word chaining states; the
//     states are read on entry and overwritten with the results.
//   - p points to the first of four pointers to the message data of each lane
//     (blocks*64 bytes per lane are consumed).
//   - buffer points to scratch memory used for the transposed state and the
//     expanded message words (see preallocSize for the required size).
//   - blocks is the number of 64-byte blocks per lane; the assembly loop tests
//     the count only after processing a block, so it must be at least 1.
//
//go:noescape
func blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)

View File

@ -218,12 +218,16 @@ func (baseMD *digest) Kdf(z []byte, keyLen int) []byte {
if limit >= uint64(1<<32)-1 {
panic("sm3: key length too long")
}
baseMD.Reset()
baseMD.Write(z)
return kdf(baseMD, keyLen, int(limit))
}
func kdfGeneric(baseMD *digest, keyLen int, limit int) []byte {
var countBytes [4]byte
var ct uint32 = 1
k := make([]byte, keyLen)
baseMD.Reset()
baseMD.Write(z)
for i := 0; i < int(limit); i++ {
for i := 0; i < limit; i++ {
binary.BigEndian.PutUint32(countBytes[:], ct)
md := *baseMD
md.Write(countBytes[:])

View File

@ -420,12 +420,14 @@ func TestKdf(t *testing.T) {
{"sm3 case 3", args{[]byte("emmansun"), 48}, "708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"},
{"sm3 case 4", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 48}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f"},
{"sm3 case 5", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 128}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb"},
{"sm3 case 6", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 159}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e8"},
{"sm3 case 7", args{[]byte("708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493708993ef1388a0ae4245a19bb6c02554c632633e356ddb989beb804fda96cfd47eba4fa460e7b277bc6b4ce4d07ed493"), 300}, "49cf14649f324a07e0d5bb2a00f7f05d5f5bdd6d14dff028e071327ec031104590eddb18f98b763e18bf382ff7c3875f30277f3179baebd795e7853fa643fdf280d8d7b81a2ab7829f615e132ab376d32194cd315908d27090e1180ce442d9be99322523db5bfac40ac5acb03550f5c93e5b01b1d71f2630868909a6a1250edb9abb2c6b0673e349f64c6577d4ba1b0a9c360016bae9478f8a80d5426327e84ea915c10ef39a016618b00aaae8735a8a1405180746ddd7ccd05dc890c5e5d07f49c40afdbc09267859ac5967b8c1163dc6defab955604e45e349a51df11d81b298424b84472607249a05b481ae88d98a9273ecdee009add0619641bd7d9f0b13a502e36e67b5836d0480a518a01046fa2738698fbe5e5008de11704b45531532667896158158ea08847a55a9"},
}
for _, tt := range tests {
wantBytes, _ := hex.DecodeString(tt.want)
t.Run(tt.name, func(t *testing.T) {
if got := Kdf(tt.args.z, tt.args.len); !reflect.DeepEqual(got, wantBytes) {
t.Errorf("Kdf(%v) = %x, want %v", tt.name, got, tt.want)
t.Errorf("Kdf(%v,kLen=%v,zLen=%v) = %x, want %v", tt.name, tt.args.len, len(tt.args.z,), got, tt.want)
}
})
}

309
sm3/sm3blocks_arm64.s Normal file
View File

@ -0,0 +1,309 @@
//go:build !purego
#include "textflag.h"
#include "sm3_const_asm.s"
#define a V0
#define b V1
#define c V2
#define d V3
#define e V4
#define f V5
#define g V6
#define h V7
#define tmp1 V8
#define tmp2 V9
#define tmp3 V10
#define tmp4 V11
// TRANSPOSE_MATRIX transposes the 4x4 matrix of 32-bit words held in t0..t3
// in place. RTMP0..RTMP3 are scratch vector registers and are clobbered.
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
VZIP1 t1.S4, t0.S4, RTMP0.S4 \
VZIP1 t3.S4, t2.S4, RTMP1.S4 \
VZIP2 t1.S4, t0.S4, RTMP2.S4 \
VZIP2 t3.S4, t2.S4, RTMP3.S4 \
VZIP1 RTMP1.D2, RTMP0.D2, t0.D2 \
VZIP2 RTMP1.D2, RTMP0.D2, t1.D2 \
VZIP1 RTMP3.D2, RTMP2.D2, t2.D2 \
VZIP2 RTMP3.D2, RTMP2.D2, t3.D2
// d = s <<< n (rotate every 32-bit lane of s left by n bits).
// VSHL writes s << n to d, then VSRI inserts s >> (32-n) into the low n bits
// of d while keeping the upper bits, so s and d must be different registers.
#define PROLD(s, d, n) \
	VSHL $(n), s.S4, d.S4 \
	VSRI $(32-n), s.S4, d.S4
// loadWordByIndex loads the i-th 16-byte message word vector (word i of all
// four lanes) from the word area starting at wordStart into W.
// i is parenthesized because callers pass expressions such as index+4.
// Clobbers R20.
#define loadWordByIndex(W, i) \
	ADD $(16*(i)), wordStart, R20 \
	VLD1 (R20), [W.S4]
// prepare4Words reads the next 16 bytes of each of the four messages
// (advancing srcPtr1..srcPtr4), transposes them so that every vector holds the
// same word index of all four lanes, converts each 32-bit word from big endian
// and appends the four resulting vectors to the word area (advancing wordPtr
// by 64). Clobbers V12-V15 and tmp1-tmp4.
#define prepare4Words \
	VLD1.P 16(srcPtr1), [V12.B16] \
	VLD1.P 16(srcPtr2), [V13.B16] \
	VLD1.P 16(srcPtr3), [V14.B16] \
	VLD1.P 16(srcPtr4), [V15.B16] \
	TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4); \
	VREV32 V12.B16, V12.B16; \
	VREV32 V13.B16, V13.B16; \
	VREV32 V14.B16, V14.B16; \
	VREV32 V15.B16, V15.B16; \
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr)
// LOAD_T broadcasts the 32-bit round constant const to all four lanes of T.
// Clobbers R20.
#define LOAD_T(const, T) \
	MOVD $const, R20 \
	VDUP R20, T.S4
// ROUND_00_11 performs one SM3 round using the XOR forms of the boolean
// functions (FF = a XOR b XOR c, GG = e XOR f XOR g) on four lanes at once.
// index selects the message word Wt; Wt and Wt+4 must already be present in
// the word area. const is the round constant.
// On exit b and f are rotated in place, h holds TT1 and d holds P0(TT2); the
// caller renames the registers for the next round.
// Clobbers tmp1 (V8), V10-V14 and R20.
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, V12, 12) \
	VMOV V12.B16, V13.B16 \
	LOAD_T(const, tmp1) \
	VADD tmp1.S4, V12.S4, V12.S4 \
	VADD e.S4, V12.S4, V12.S4 \
	PROLD(V12, V14, 7) \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2
	VEOR a.B16, b.B16, V13.B16 \
	VEOR c.B16, V13.B16, V13.B16 \
	VADD V13.S4, d.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d
	loadWordByIndex(V10, index) \
	loadWordByIndex(V11, index+4) \
	VEOR V10.B16, V11.B16, V11.B16 \
	VADD V11.S4, V13.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4 \ // TT1
	VADD h.S4, V10.S4, V10.S4 \
	VADD V14.S4, V10.S4, V10.S4 \ // Wt + h + SS1
	VEOR e.B16, f.B16, V11.B16 \
	VEOR g.B16, V11.B16, V11.B16 \
	VADD V11.S4, V10.S4, V10.S4 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VMOV b.B16, V11.B16 \
	PROLD(V11, b, 9) \ // b = b <<< 9
	VMOV V13.B16, h.B16 \ // h = TT1
	VMOV f.B16, V11.B16 \
	PROLD(V11, f, 19) \ // f = f <<< 19
	PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16 // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
// MESSAGE_SCHEDULE expands the message word W[index+4] for four lanes:
//
//	W[j] = P1(W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)) XOR (W[j-13] <<< 7) XOR W[j-6]
//
// with j = index+4 and P1(X) = X XOR (X <<< 15) XOR (X <<< 23). The new word
// is appended to the word area at wordPtr (which advances by 16) and is also
// left in V11. Clobbers V10-V12 and R20.
#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(V10, index+1) \ // Wj-3
	PROLD(V10, V11, 15) \
	loadWordByIndex(V10, index-12) \ // Wj-16
	VEOR V10.B16, V11.B16, V10.B16 \
	loadWordByIndex(V11, index-5) \ // Wj-9
	VEOR V10.B16, V11.B16, V10.B16 \
	PROLD(V10, V11, 15) \
	PROLD(V11, V12, 8) \ // V12 = X <<< 23
	VEOR V11.B16, V10.B16, V10.B16 \
	VEOR V12.B16, V10.B16, V10.B16 \ // P1
	loadWordByIndex(V11, index-9) \ // Wj-13
	PROLD(V11, V12, 7) \
	VEOR V12.B16, V10.B16, V10.B16 \
	loadWordByIndex(V11, index-2) \ // Wj-6
	VEOR V11.B16, V10.B16, V11.B16 \
	VST1.P [V11.S4], 16(wordPtr)
// ROUND_12_15 first expands the message word W[index+4], which is not part of
// the 16 words loaded from the input block, and then performs the same round
// as ROUND_00_11. const is the round constant and is forwarded unchanged.
#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index) \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)
// ROUND_16_63 expands W[index+4] and performs one SM3 round using the
// majority/choose forms of the boolean functions:
//
//	FF = (a AND b) OR (a AND c) OR (b AND c)
//	GG = ((f XOR g) AND e) XOR g
//
// const is the round constant. On exit b and f are rotated in place, h holds
// TT1 and d holds P0(TT2); the caller renames the registers for the next
// round. Clobbers tmp1 (V8), V10-V14 and R20.
#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \ // V11 is Wt+4 now, do not overwrite it before it is used below
	PROLD(a, V12, 12) \
	VMOV V12.B16, V13.B16 \
	LOAD_T(const, tmp1) \
	VADD tmp1.S4, V12.S4, V12.S4 \
	VADD e.S4, V12.S4, V12.S4 \
	PROLD(V12, V14, 7) \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2
	VORR a.B16, b.B16, V10.B16 \
	VAND a.B16, b.B16, V13.B16 \
	VAND c.B16, V10.B16, V10.B16 \
	VORR V13.B16, V10.B16, V13.B16 \ // (a AND b) OR (a AND c) OR (b AND c)
	VADD V13.S4, d.S4, V13.S4 \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(V10, index) \ // Wj
	VEOR V10.B16, V11.B16, V11.B16 \ // Wj XOR Wj+4
	VADD V13.S4, V11.S4, V13.S4 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4 \ // TT1
	VADD h.S4, V10.S4, V10.S4 \ // Wt + h
	VADD V14.S4, V10.S4, V10.S4 \ // Wt + h + SS1 (SS1 is in V14, V12 holds SS2)
	VEOR f.B16, g.B16, V11.B16 \
	VAND V11.B16, e.B16, V11.B16 \
	VEOR g.B16, V11.B16, V11.B16 \ // (f XOR g) AND e XOR g
	VADD V11.S4, V10.S4, V10.S4 \ // TT2 = GG + Wt + h + SS1
	VMOV b.B16, V11.B16 \
	PROLD(V11, b, 9) \ // b = b <<< 9
	VMOV V13.B16, h.B16 \ // h = TT1
	VMOV f.B16, V11.B16 \
	PROLD(V11, f, 19) \ // f = f <<< 19
	PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16 // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
// func blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Compresses `blocks` 64-byte blocks of four independent messages in parallel.
// dig points to four state pointers, p to four message pointers. buffer is
// scratch memory: the first 128 bytes hold the transposed chaining state, the
// message words of the current block follow (16 bytes per word index).
TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R0
#define srcPtrPtr R1
#define statePtr R2
#define blockCount R3
#define digSave R4
#define wordStart R5
#define srcPtr1 R6
#define srcPtr2 R7
#define srcPtr3 R8
#define srcPtr4 R9
#define wordPtr R10
	MOVD dig+0(FP), digPtr
	MOVD p+8(FP), srcPtrPtr
	MOVD buffer+16(FP), statePtr
	MOVD blocks+24(FP), blockCount

	// load state: lane j's words 0-3 go to a/b/c/d, words 4-7 to e/f/g/h
	MOVD digPtr, digSave
	MOVD.P 8(digPtr), R20
	VLD1.P 16(R20), [a.S4]
	VLD1 (R20), [e.S4]
	MOVD.P 8(digPtr), R20
	VLD1.P 16(R20), [b.S4]
	VLD1 (R20), [f.S4]
	MOVD.P 8(digPtr), R20
	VLD1.P 16(R20), [c.S4]
	VLD1 (R20), [g.S4]
	MOVD (digPtr), R20
	VLD1.P 16(R20), [d.S4]
	VLD1 (R20), [h.S4]

	// transpose state so that each register holds one state word of all lanes
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// store state to temporary buffer; wordStart ends up just past it
	MOVD statePtr, wordStart
	VST1.P [a.S4, b.S4, c.S4, d.S4], 64(wordStart)
	VST1.P [e.S4, f.S4, g.S4, h.S4], 64(wordStart)

	MOVD.P 8(srcPtrPtr), srcPtr1
	MOVD.P 8(srcPtrPtr), srcPtr2
	MOVD.P 8(srcPtrPtr), srcPtr3
	MOVD (srcPtrPtr), srcPtr4

loop:
	// every block rebuilds its message words from the start of the word area
	MOVD wordStart, wordPtr

	// load message block
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	// the rotated round constant repeats with period 32, so rounds 48-63
	// reuse T16-T31
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)

	// feed-forward: XOR the state saved before this block into the result
	MOVD statePtr, R20
	VLD1.P 64(R20), [V8.S4, V9.S4, V10.S4, V11.S4]
	VLD1 (R20), [V12.S4, V13.S4, V14.S4, V15.S4]
	VEOR a.B16, V8.B16, a.B16
	VEOR b.B16, V9.B16, b.B16
	VEOR c.B16, V10.B16, c.B16
	VEOR d.B16, V11.B16, d.B16
	VEOR e.B16, V12.B16, e.B16
	VEOR f.B16, V13.B16, f.B16
	VEOR g.B16, V14.B16, g.B16
	VEOR h.B16, V15.B16, h.B16

	// save the new chaining value for the next block
	MOVD statePtr, R20
	VST1.P [a.S4, b.S4, c.S4, d.S4], 64(R20)
	VST1 [e.S4, f.S4, g.S4, h.S4], (R20)

	SUB $1, blockCount
	CBNZ blockCount, loop

	// transpose state back to one digest per lane
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// write the four digests back through the pointers in dig
	MOVD.P 8(digSave), R20
	VST1.P [a.S4], 16(R20)
	VST1 [e.S4], (R20)
	MOVD.P 8(digSave), R20
	VST1.P [b.S4], 16(R20)
	VST1 [f.S4], (R20)
	MOVD.P 8(digSave), R20
	VST1.P [c.S4], 16(R20)
	VST1 [g.S4], (R20)
	MOVD (digSave), R20
	VST1.P [d.S4], 16(R20)
	VST1 [h.S4], (R20)
	RET

736
sm3/sm3blocks_simd_amd64.s Normal file
View File

@ -0,0 +1,736 @@
//go:build !purego
#include "textflag.h"
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16
DATA T256_4<>+0x00(SB)/8, $0x79cc451979cc4519
DATA T256_4<>+0x08(SB)/8, $0x79cc451979cc4519
DATA T256_4<>+0x10(SB)/8, $0xf3988a32f3988a32
DATA T256_4<>+0x18(SB)/8, $0xf3988a32f3988a32
DATA T256_4<>+0x20(SB)/8, $0xe7311465e7311465
DATA T256_4<>+0x28(SB)/8, $0xe7311465e7311465
DATA T256_4<>+0x30(SB)/8, $0xce6228cbce6228cb
DATA T256_4<>+0x38(SB)/8, $0xce6228cbce6228cb
DATA T256_4<>+0x40(SB)/8, $0x9cc451979cc45197
DATA T256_4<>+0x48(SB)/8, $0x9cc451979cc45197
DATA T256_4<>+0x50(SB)/8, $0x3988a32f3988a32f
DATA T256_4<>+0x58(SB)/8, $0x3988a32f3988a32f
DATA T256_4<>+0x60(SB)/8, $0x7311465e7311465e
DATA T256_4<>+0x68(SB)/8, $0x7311465e7311465e
DATA T256_4<>+0x70(SB)/8, $0xe6228cbce6228cbc
DATA T256_4<>+0x78(SB)/8, $0xe6228cbce6228cbc
DATA T256_4<>+0x80(SB)/8, $0xcc451979cc451979
DATA T256_4<>+0x88(SB)/8, $0xcc451979cc451979
DATA T256_4<>+0x90(SB)/8, $0x988a32f3988a32f3
DATA T256_4<>+0x98(SB)/8, $0x988a32f3988a32f3
DATA T256_4<>+0xa0(SB)/8, $0x311465e7311465e7
DATA T256_4<>+0xa8(SB)/8, $0x311465e7311465e7
DATA T256_4<>+0xb0(SB)/8, $0x6228cbce6228cbce
DATA T256_4<>+0xb8(SB)/8, $0x6228cbce6228cbce
DATA T256_4<>+0xc0(SB)/8, $0xc451979cc451979c
DATA T256_4<>+0xc8(SB)/8, $0xc451979cc451979c
DATA T256_4<>+0xd0(SB)/8, $0x88a32f3988a32f39
DATA T256_4<>+0xd8(SB)/8, $0x88a32f3988a32f39
DATA T256_4<>+0xe0(SB)/8, $0x11465e7311465e73
DATA T256_4<>+0xe8(SB)/8, $0x11465e7311465e73
DATA T256_4<>+0xf0(SB)/8, $0x228cbce6228cbce6
DATA T256_4<>+0xf8(SB)/8, $0x228cbce6228cbce6
DATA T256_4<>+0x0100(SB)/8, $0x9d8a7a879d8a7a87
DATA T256_4<>+0x0108(SB)/8, $0x9d8a7a879d8a7a87
DATA T256_4<>+0x0110(SB)/8, $0x3b14f50f3b14f50f
DATA T256_4<>+0x0118(SB)/8, $0x3b14f50f3b14f50f
DATA T256_4<>+0x0120(SB)/8, $0x7629ea1e7629ea1e
DATA T256_4<>+0x0128(SB)/8, $0x7629ea1e7629ea1e
DATA T256_4<>+0x0130(SB)/8, $0xec53d43cec53d43c
DATA T256_4<>+0x0138(SB)/8, $0xec53d43cec53d43c
DATA T256_4<>+0x0140(SB)/8, $0xd8a7a879d8a7a879
DATA T256_4<>+0x0148(SB)/8, $0xd8a7a879d8a7a879
DATA T256_4<>+0x0150(SB)/8, $0xb14f50f3b14f50f3
DATA T256_4<>+0x0158(SB)/8, $0xb14f50f3b14f50f3
DATA T256_4<>+0x0160(SB)/8, $0x629ea1e7629ea1e7
DATA T256_4<>+0x0168(SB)/8, $0x629ea1e7629ea1e7
DATA T256_4<>+0x0170(SB)/8, $0xc53d43cec53d43ce
DATA T256_4<>+0x0178(SB)/8, $0xc53d43cec53d43ce
DATA T256_4<>+0x0180(SB)/8, $0x8a7a879d8a7a879d
DATA T256_4<>+0x0188(SB)/8, $0x8a7a879d8a7a879d
DATA T256_4<>+0x0190(SB)/8, $0x14f50f3b14f50f3b
DATA T256_4<>+0x0198(SB)/8, $0x14f50f3b14f50f3b
DATA T256_4<>+0x01a0(SB)/8, $0x29ea1e7629ea1e76
DATA T256_4<>+0x01a8(SB)/8, $0x29ea1e7629ea1e76
DATA T256_4<>+0x01b0(SB)/8, $0x53d43cec53d43cec
DATA T256_4<>+0x01b8(SB)/8, $0x53d43cec53d43cec
DATA T256_4<>+0x01c0(SB)/8, $0xa7a879d8a7a879d8
DATA T256_4<>+0x01c8(SB)/8, $0xa7a879d8a7a879d8
DATA T256_4<>+0x01d0(SB)/8, $0x4f50f3b14f50f3b1
DATA T256_4<>+0x01d8(SB)/8, $0x4f50f3b14f50f3b1
DATA T256_4<>+0x01e0(SB)/8, $0x9ea1e7629ea1e762
DATA T256_4<>+0x01e8(SB)/8, $0x9ea1e7629ea1e762
DATA T256_4<>+0x01f0(SB)/8, $0x3d43cec53d43cec5
DATA T256_4<>+0x01f8(SB)/8, $0x3d43cec53d43cec5
DATA T256_4<>+0x0200(SB)/8, $0x7a879d8a7a879d8a
DATA T256_4<>+0x0208(SB)/8, $0x7a879d8a7a879d8a
DATA T256_4<>+0x0210(SB)/8, $0xf50f3b14f50f3b14
DATA T256_4<>+0x0218(SB)/8, $0xf50f3b14f50f3b14
DATA T256_4<>+0x0220(SB)/8, $0xea1e7629ea1e7629
DATA T256_4<>+0x0228(SB)/8, $0xea1e7629ea1e7629
DATA T256_4<>+0x0230(SB)/8, $0xd43cec53d43cec53
DATA T256_4<>+0x0238(SB)/8, $0xd43cec53d43cec53
DATA T256_4<>+0x0240(SB)/8, $0xa879d8a7a879d8a7
DATA T256_4<>+0x0248(SB)/8, $0xa879d8a7a879d8a7
DATA T256_4<>+0x0250(SB)/8, $0x50f3b14f50f3b14f
DATA T256_4<>+0x0258(SB)/8, $0x50f3b14f50f3b14f
DATA T256_4<>+0x0260(SB)/8, $0xa1e7629ea1e7629e
DATA T256_4<>+0x0268(SB)/8, $0xa1e7629ea1e7629e
DATA T256_4<>+0x0270(SB)/8, $0x43cec53d43cec53d
DATA T256_4<>+0x0278(SB)/8, $0x43cec53d43cec53d
DATA T256_4<>+0x0280(SB)/8, $0x879d8a7a879d8a7a
DATA T256_4<>+0x0288(SB)/8, $0x879d8a7a879d8a7a
DATA T256_4<>+0x0290(SB)/8, $0x0f3b14f50f3b14f5
DATA T256_4<>+0x0298(SB)/8, $0x0f3b14f50f3b14f5
DATA T256_4<>+0x02a0(SB)/8, $0x1e7629ea1e7629ea
DATA T256_4<>+0x02a8(SB)/8, $0x1e7629ea1e7629ea
DATA T256_4<>+0x02b0(SB)/8, $0x3cec53d43cec53d4
DATA T256_4<>+0x02b8(SB)/8, $0x3cec53d43cec53d4
DATA T256_4<>+0x02c0(SB)/8, $0x79d8a7a879d8a7a8
DATA T256_4<>+0x02c8(SB)/8, $0x79d8a7a879d8a7a8
DATA T256_4<>+0x02d0(SB)/8, $0xf3b14f50f3b14f50
DATA T256_4<>+0x02d8(SB)/8, $0xf3b14f50f3b14f50
DATA T256_4<>+0x02e0(SB)/8, $0xe7629ea1e7629ea1
DATA T256_4<>+0x02e8(SB)/8, $0xe7629ea1e7629ea1
DATA T256_4<>+0x02f0(SB)/8, $0xcec53d43cec53d43
DATA T256_4<>+0x02f8(SB)/8, $0xcec53d43cec53d43
GLOBL T256_4<>(SB), RODATA, $3072 // 48 * 4 * 16
// Transpose a 4x4 matrix of 32-bit words with the SSE2 unpack instructions PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ (the Go assembler spells the first two PUNPCKHLQ/PUNPCKLLQ).
// input: from high to low
// r0 = [w3, w2, w1, w0]
// r1 = [w7, w6, w5, w4]
// r2 = [w11, w10, w9, w8]
// r3 = [w15, w14, w13, w12]
// r: 32/64 temp register
// tmp1: 128 bits temp register
// tmp2: 128 bits temp register
//
// output: from high to low
// r0 = [w12, w8, w4, w0]
// r1 = [w13, w9, w5, w1]
// r2 = [w14, w10, w6, w2]
// r3 = [w15, w11, w7, w3]
//
// SSE2/MMX instructions:
// MOVOU r0, tmp2;
// PUNPCKHDQ r1, tmp2;
// PUNPCKLDQ r1, r0;
// MOVOU r2, tmp1;
// PUNPCKLDQ r3, tmp1;
// PUNPCKHDQ r3, r2;
// MOVOU r0, r1;
// PUNPCKHQDQ tmp1, r1;
// PUNPCKLQDQ tmp1, r0;
// MOVOU tmp2, r3;
// PUNPCKHQDQ r2, r3;
// PUNPCKLQDQ r2, tmp2;
// MOVOU tmp2, r2
// SSE_TRANSPOSE_MATRIX transposes the 4x4 word matrix in r0..r3 in place.
// tmp1 and tmp2 are 128-bit scratch registers and are clobbered.
// PUNPCKHLQ/PUNPCKLLQ are the Go assembler names of PUNPCKHDQ/PUNPCKLDQ.
#define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
MOVOU r0, tmp2; \
PUNPCKHLQ r1, tmp2; \
PUNPCKLLQ r1, r0; \
MOVOU r2, tmp1; \
PUNPCKLLQ r3, tmp1; \
PUNPCKHLQ r3, r2; \
MOVOU r0, r1; \
PUNPCKHQDQ tmp1, r1; \
PUNPCKLQDQ tmp1, r0; \
MOVOU tmp2, r3; \
PUNPCKHQDQ r2, r3; \
PUNPCKLQDQ r2, tmp2; \
MOVOU tmp2, r2
#define a X0
#define b X1
#define c X2
#define d X3
#define e X4
#define f X5
#define g X6
#define h X7
#define tmp1 X8
#define tmp2 X9
// storeState saves the eight state registers a..h (16 bytes each) to the
// first 128 bytes of the scratch buffer addressed by BX.
#define storeState \
MOVOU a, (BX) \
MOVOU b, 16(BX) \
MOVOU c, 32(BX) \
MOVOU d, 48(BX) \
MOVOU e, 64(BX) \
MOVOU f, 80(BX) \
MOVOU g, 96(BX) \
MOVOU h, 112(BX)
// xorm (mem), reg
// XORs the 16 bytes at memory operand P1 into register P2 and writes the
// result back to P1, so both end up holding P1 XOR P2. Clobbers tmp1.
#define xorm(P1, P2) \
MOVOU P1, tmp1; \
PXOR tmp1, P2; \
MOVOU P2, P1
// store4Words / load4Words access slot j (resp. i) of the message word area,
// which starts 128 bytes into the scratch buffer at BX, right after the saved
// state. Each slot is one 16-byte vector.
#define store4Words(W, j) MOVOU W, (128+(j)*16)(BX)
#define load4Words(W, i) MOVOU (128+(i)*16)(BX), W
// prepareFirst16Words loads bytes [16*i, 16*i+16) of the current block of each
// of the four messages (pointers in R8..R11), transposes them so that each
// register holds one word index of all four messages, byte-swaps every 32-bit
// word with flip_mask and stores the result as word slots 4*i .. 4*i+3.
// Clobbers X10-X13, tmp1 and tmp2.
#define prepareFirst16Words(i) \
MOVOU (i*16)(R8), X10; \
MOVOU (i*16)(R9), X11; \
MOVOU (i*16)(R10), X12; \
MOVOU (i*16)(R11), X13; \
; \
SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
MOVOU flip_mask<>(SB), tmp1; \
PSHUFB tmp1, X10; \
PSHUFB tmp1, X11; \
PSHUFB tmp1, X12; \
PSHUFB tmp1, X13; \
; \
store4Words(X10, 4*i+0); \
store4Words(X11, 4*i+1); \
store4Words(X12, 4*i+2); \
store4Words(X13, 4*i+3)
// r <<< n, SSE version
// Rotates every 32-bit lane of r left by n bits in place. Clobbers tmp1, so
// r must not be tmp1.
#define PROLD(r, n) \
MOVOU r, tmp1; \
PSLLL $n, r; \
PSRLL $(32-n), tmp1; \
POR tmp1, r
// ROUND_00_11 performs one SM3 round (SSE version) with the XOR forms of the
// boolean functions on four lanes at once. index selects both the message
// word slot and the round constant, which is read from (index*16)(AX).
// Word slots index and index+4 must already be filled.
// On exit b and f are rotated in place, h holds TT1 and d holds P0(TT2).
// Clobbers tmp1, tmp2 and X10-X14.
#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
MOVOU a, X12; \
PROLD(X12, 12); \
MOVOU X12, X13; \ // a <<< 12
MOVOU (index*16)(AX), tmp2; \
PADDL tmp2, X12; \
PADDL e, X12; \
PROLD(X12, 7); \ // SS1
PXOR X12, X13; \ // SS2
MOVOU b, X14; \
PXOR a, X14; \
PXOR c, X14; \ // (a XOR b XOR c)
PADDL d, X14; \ // (a XOR b XOR c) + d
load4Words(X10, index); \
load4Words(X11, index+4); \
PXOR X10, X11; \ //Wt XOR Wt+4
PADDL X11, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
PADDL X14, X13; \ // TT1
PADDL h, X10; \ // Wt + h
PADDL X12, X10; \ // Wt + h + SS1
MOVOU e, X11; \
PXOR f, X11; \
PXOR g, X11; \ // (e XOR f XOR g)
PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
; \ // copy result
PROLD(b, 9); \
MOVOU X13, h; \
PROLD(f, 19); \
MOVOU X10, X13; \
PROLD(X13, 9); \
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2)
PROLD(X10, 17); \
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU X13, d
// MESSAGE_SCHEDULE computes message word W[index+4] (SSE version) from the
// earlier word slots, stores it in slot index+4 and leaves it in X11:
//   W[j] = P1(W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)) XOR (W[j-13] <<< 7) XOR W[j-6]
// with j = index+4 and P1(X) = X XOR (X <<< 15) XOR (X <<< 23).
// Clobbers X10, X11 and tmp1.
#define MESSAGE_SCHEDULE(index) \
load4Words(X10, index+1); \ // Wj-3
PROLD(X10, 15); \
load4Words(X11, index-12); \ // Wj-16
PXOR X11, X10; \
load4Words(X11, index-5); \ // Wj-9
PXOR X11, X10; \
MOVOU X10, X11; \
PROLD(X11, 15); \
PXOR X11, X10; \
PROLD(X11, 8); \
PXOR X11, X10; \ // P1
load4Words(X11, index-9); \ // Wj-13
PROLD(X11, 7); \
PXOR X11, X10; \
load4Words(X11, index-2); \ // Wj-6
PXOR X10, X11; \
store4Words(X11, index+4)
// ROUND_12_15 first expands message word W[index+4], which is not among the
// 16 words taken directly from the input block, then performs the same round
// as ROUND_00_11.
#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \
ROUND_00_11(index, a, b, c, d, e, f, g, h)
// ROUND_16_63 expands W[index+4] and performs one SM3 round (SSE version)
// with the majority/choose forms of the boolean functions:
//   FF = (a AND b) OR (a AND c) OR (b AND c)
//   GG = ((f XOR g) AND e) XOR g
// index selects the message word slot, cIndex the round constant read from
// (cIndex*16)(AX). On exit b and f are rotated in place, h holds TT1 and d
// holds P0(TT2). Clobbers tmp1, tmp2 and X10-X14.
#define ROUND_16_63(index, cIndex, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it
MOVOU a, X12; \
PROLD(X12, 12); \
MOVOU X12, X13; \ // a <<< 12
MOVOU (cIndex*16)(AX), tmp2; \
PADDL tmp2, X12; \
PADDL e, X12; \
PROLD(X12, 7); \ // SS1
PXOR X12, X13; \ // SS2
; \
MOVOU a, X14; \
POR b, X14; \
MOVOU a, X10; \
PAND b, X10; \
PAND c, X14; \
POR X10, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
load4Words(X10, index); \
PXOR X10, X11; \ //Wt XOR Wt+4
PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
PADDL X14, X13; \ // TT1
; \
PADDL h, X10; \ // Wt + h
PADDL X12, X10; \ // Wt + h + SS1
MOVOU f, X11; \
PXOR g, X11; \
PAND e, X11; \ // (f XOR g) AND e
PXOR g, X11; \ // ((f XOR g) AND e) XOR g
PADDL X11, X10; \ // TT2 = (((f XOR g) AND e) XOR g) + Wt + h + SS1
; \ // copy result
PROLD(b, 9); \
MOVOU X13, h; \
PROLD(f, 19); \
MOVOU X10, X13; \
PROLD(X13, 9); \
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2)
PROLD(X10, 17); \
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU X13, d
// transpose matrix function, AVX/AVX2 version
// Transposes the 4x4 matrix of 32-bit words in r0..r3 in place (for 256-bit
// registers each 128-bit half is transposed independently). The trailing
// comments show the register contents from high to low, first for 256-bit and
// then for 128-bit registers.
// parameters:
// - r0: 128/256 bits register as input/output data
// - r1: 128/256 bits register as input/output data
// - r2: 128/256 bits register as input/output data
// - r3: 128/256 bits register as input/output data
// - tmp1: 128/256 bits temp register
// - tmp2: 128/256 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w23, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
// avxXorm (mem), reg
// XORs memory operand P1 into register P2 and writes the result back to P1,
// so both end up holding P1 XOR P2. No scratch register is needed.
#define avxXorm(P1, P2) \
VPXOR P1, P2, P2; \
VMOVDQU P2, P1
// avxStore4Words / avxLoad4Words access slot j (resp. i) of the message word
// area, which starts 128 bytes into the scratch buffer at BX, right after the
// saved state. Each slot is one 16-byte vector.
#define avxStore4Words(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoad4Words(W, i) VMOVDQU (128+(i)*16)(BX), W
// avxPrepareFirst16Words is the AVX counterpart of prepareFirst16Words: it
// loads bytes [16*i, 16*i+16) of the current block of the four messages
// (pointers in R8..R11), transposes them, byte-swaps every 32-bit word with
// flip_mask and stores the result as word slots 4*i .. 4*i+3.
// Clobbers X10-X13, tmp1 and tmp2.
#define avxPrepareFirst16Words(i) \
VMOVDQU (i*16)(R8), X10; \
VMOVDQU (i*16)(R9), X11; \
VMOVDQU (i*16)(R10), X12; \
VMOVDQU (i*16)(R11), X13; \
; \
TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
VPSHUFB flip_mask<>(SB), X10, X10; \
VPSHUFB flip_mask<>(SB), X11, X11; \
VPSHUFB flip_mask<>(SB), X12, X12; \
VPSHUFB flip_mask<>(SB), X13, X13; \
; \
avxStore4Words(X10, 4*i+0); \
avxStore4Words(X11, 4*i+1); \
avxStore4Words(X12, 4*i+2); \
avxStore4Words(X13, 4*i+3)
// r <<< n
// Rotates every 32-bit lane of r left by n bits in place (AVX version).
// Clobbers tmp1, so r must not be tmp1.
#define VPROLD(r, n) \
VPSLLD $(n), r, tmp1; \
VPSRLD $(32-n), r, r; \
VPOR tmp1, r, r
// d = r <<< n
// Rotates every 32-bit lane of r left by n bits into d, leaving r unchanged
// (AVX version). Clobbers tmp1, so neither r nor d may be tmp1.
#define VPROLD2(r, d, n) \
VPSLLD $(n), r, tmp1; \
VPSRLD $(32-n), r, d; \
VPOR tmp1, d, d
// AVX_ROUND_00_11 performs one SM3 round (AVX version) with the XOR forms of
// the boolean functions on four lanes at once. index selects both the message
// word slot and the round constant, which is read from (index*16)(AX).
// Word slots index and index+4 must already be filled.
// On exit b and f are rotated in place, h holds TT1 and d holds P0(TT2).
// Clobbers tmp1 and X10-X14.
#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
VPROLD2(a, X13, 12); \ // a <<< 12
VPADDD (index*16)(AX), X13, X12; \
VPADDD e, X12, X12; \
VPROLD(X12, 7); \ // SS1
VPXOR X12, X13, X13; \ // SS2
; \
VPXOR a, b, X14; \
VPXOR c, X14, X14; \ // (a XOR b XOR c)
VPADDD d, X14, X14; \ // (a XOR b XOR c) + d
avxLoad4Words(X10, index); \
avxLoad4Words(X11, index+4); \
VPXOR X10, X11, X11; \ //Wt XOR Wt+4
VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
VPADDD X14, X13, X13; \ // TT1
VPADDD h, X10, X10; \ // Wt + h
VPADDD X12, X10, X10; \ // Wt + h + SS1
VPXOR e, f, X11; \
VPXOR g, X11, X11; \ // (e XOR f XOR g)
VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
; \ // copy result
VPROLD(b, 9); \
VMOVDQU X13, h; \
VPROLD(f, 19); \
VPROLD2(X10, X13, 9); \ // tt2 <<< 9
VPXOR X10, X13, X13; \ // tt2 XOR ROTL(9, tt2)
VPROLD(X10, 17); \ // tt2 <<< 17
VPXOR X10, X13, d
// AVX_MESSAGE_SCHEDULE computes message word W[index+4] (AVX version) from
// the earlier word slots, stores it in slot index+4 and leaves it in X11:
//   W[j] = P1(W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)) XOR (W[j-13] <<< 7) XOR W[j-6]
// with j = index+4 and P1(X) = X XOR (X <<< 15) XOR (X <<< 23).
// Clobbers X10, X11 and tmp1.
#define AVX_MESSAGE_SCHEDULE(index) \
avxLoad4Words(X10, index+1); \ // Wj-3
VPROLD(X10, 15); \
VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16
VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9
; \ // P1
VPROLD2(X10, X11, 15); \
VPXOR X11, X10, X10; \
VPROLD(X11, 8); \
VPXOR X11, X10, X10; \ // P1
avxLoad4Words(X11, index-9); \ // Wj-13
VPROLD(X11, 7); \
VPXOR X11, X10, X10; \
VPXOR (128+(index-2)*16)(BX), X10, X11; \
avxStore4Words(X11, index+4)
// AVX_ROUND_12_15 first expands message word W[index+4], which is not among
// the 16 words taken directly from the input block, then performs the same
// round as AVX_ROUND_00_11.
#define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \
AVX_MESSAGE_SCHEDULE(index); \
AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h)
// AVX_ROUND_16_63 expands W[index+4] and performs one SM3 round (AVX version)
// with the majority/choose forms of the boolean functions:
//   FF = (a AND b) OR (a AND c) OR (b AND c)
//   GG = ((f XOR g) AND e) XOR g
// index selects the message word slot, cIndex the round constant read from
// (cIndex*16)(AX). On exit b and f are rotated in place, h holds TT1 and d
// holds P0(TT2). Clobbers tmp1 and X10-X14.
#define AVX_ROUND_16_63(index, cIndex, a, b, c, d, e, f, g, h) \
AVX_MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, Pls do not use it
VPROLD2(a, X13, 12); \ // a <<< 12
VPADDD (cIndex*16)(AX), X13, X12; \
VPADDD e, X12, X12; \
VPROLD(X12, 7); \ // SS1
VPXOR X12, X13, X13; \ // SS2
; \
VPOR a, b, X14; \
VPAND a, b, X10; \
VPAND c, X14, X14; \
VPOR X10, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
avxLoad4Words(X10, index); \
VPXOR X10, X11, X11; \ //Wt XOR Wt+4
VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
VPADDD X14, X13, X13; \ // TT1
; \
VPADDD h, X10, X10; \ // Wt + h
VPADDD X12, X10, X10; \ // Wt + h + SS1
VPXOR f, g, X11; \
VPAND e, X11, X11; \
VPXOR g, X11, X11; \ // (f XOR g) AND e XOR g
VPADDD X11, X10, X10; \ // TT2 = (((f XOR g) AND e) XOR g) + Wt + h + SS1
; \ // copy result
VPROLD(b, 9); \
VMOVDQU X13, h; \
VPROLD(f, 19); \
VPROLD2(X10, X13, 9); \ // tt2 <<< 9
VPXOR X10, X13, X13; \ // tt2 XOR ROTL(9, tt2)
VPROLD(X10, 17); \ // tt2 <<< 17
VPXOR X10, X13, d
// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Runs the 64-round compression over four independent inputs at once.
//   dig    - address of four consecutive pointers; 32 bytes of state are
//            loaded from, and finally stored back to, each of them.
//   p      - address of four consecutive message pointers; every iteration
//            consumes 64 bytes from each and advances each by 64.
//   buffer - scratch area addressed through BX; offsets 0..127 hold the
//            transposed state (the tests allocate 1216 bytes in total).
//   blocks - number of 64-byte blocks per input.
//
// NOTE: both loops are do-while loops (DECQ/JNZ without a prior check), so
// blocks must be at least 1.
//
// Register use: DI = dig, SI = p, BX = buffer, DX = blocks, AX = address of
// T256_4, R8..R11 = the four message pointers. The code path is selected at
// run time by ·useAVX.
TEXT ·blockMultBy4(SB),NOSPLIT,$0
MOVQ dig+0(FP), DI
MOVQ p+8(FP), SI
MOVQ buffer+16(FP), BX
MOVQ blocks+24(FP), DX
// take the AVX path when useAVX is set, otherwise fall through to SSE
CMPB ·useAVX(SB), $1
JE avx
// load state
// digest 0 -> a/e, digest 1 -> b/f, digest 2 -> c/g, digest 3 -> d/h
MOVQ (DI), R8
MOVOU (0*16)(R8), a
MOVOU (1*16)(R8), e
MOVQ 8(DI), R8
MOVOU (0*16)(R8), b
MOVOU (1*16)(R8), f
MOVQ 16(DI), R8
MOVOU (0*16)(R8), c
MOVOU (1*16)(R8), g
MOVQ 24(DI), R8
MOVOU (0*16)(R8), d
MOVOU (1*16)(R8), h
// transpose state
SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
// store state to temporary buffer
storeState
// AX = constant table, R8..R11 = the four message pointers
MOVQ $T256_4<>(SB), AX
MOVQ (SI), R8
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
loop:
// load message block
prepareFirst16Words(0)
prepareFirst16Words(1)
prepareFirst16Words(2)
prepareFirst16Words(3)
// rounds 0..11; the argument order rotates by one register per round
ROUND_00_11(0, a, b, c, d, e, f, g, h)
ROUND_00_11(1, h, a, b, c, d, e, f, g)
ROUND_00_11(2, g, h, a, b, c, d, e, f)
ROUND_00_11(3, f, g, h, a, b, c, d, e)
ROUND_00_11(4, e, f, g, h, a, b, c, d)
ROUND_00_11(5, d, e, f, g, h, a, b, c)
ROUND_00_11(6, c, d, e, f, g, h, a, b)
ROUND_00_11(7, b, c, d, e, f, g, h, a)
ROUND_00_11(8, a, b, c, d, e, f, g, h)
ROUND_00_11(9, h, a, b, c, d, e, f, g)
ROUND_00_11(10, g, h, a, b, c, d, e, f)
ROUND_00_11(11, f, g, h, a, b, c, d, e)
// rounds 12..15
ROUND_12_15(12, e, f, g, h, a, b, c, d)
ROUND_12_15(13, d, e, f, g, h, a, b, c)
ROUND_12_15(14, c, d, e, f, g, h, a, b)
ROUND_12_15(15, b, c, d, e, f, g, h, a)
// rounds 16..47: constant index equals the round index
ROUND_16_63(16, 16, a, b, c, d, e, f, g, h)
ROUND_16_63(17, 17, h, a, b, c, d, e, f, g)
ROUND_16_63(18, 18, g, h, a, b, c, d, e, f)
ROUND_16_63(19, 19, f, g, h, a, b, c, d, e)
ROUND_16_63(20, 20, e, f, g, h, a, b, c, d)
ROUND_16_63(21, 21, d, e, f, g, h, a, b, c)
ROUND_16_63(22, 22, c, d, e, f, g, h, a, b)
ROUND_16_63(23, 23, b, c, d, e, f, g, h, a)
ROUND_16_63(24, 24, a, b, c, d, e, f, g, h)
ROUND_16_63(25, 25, h, a, b, c, d, e, f, g)
ROUND_16_63(26, 26, g, h, a, b, c, d, e, f)
ROUND_16_63(27, 27, f, g, h, a, b, c, d, e)
ROUND_16_63(28, 28, e, f, g, h, a, b, c, d)
ROUND_16_63(29, 29, d, e, f, g, h, a, b, c)
ROUND_16_63(30, 30, c, d, e, f, g, h, a, b)
ROUND_16_63(31, 31, b, c, d, e, f, g, h, a)
ROUND_16_63(32, 32, a, b, c, d, e, f, g, h)
ROUND_16_63(33, 33, h, a, b, c, d, e, f, g)
ROUND_16_63(34, 34, g, h, a, b, c, d, e, f)
ROUND_16_63(35, 35, f, g, h, a, b, c, d, e)
ROUND_16_63(36, 36, e, f, g, h, a, b, c, d)
ROUND_16_63(37, 37, d, e, f, g, h, a, b, c)
ROUND_16_63(38, 38, c, d, e, f, g, h, a, b)
ROUND_16_63(39, 39, b, c, d, e, f, g, h, a)
ROUND_16_63(40, 40, a, b, c, d, e, f, g, h)
ROUND_16_63(41, 41, h, a, b, c, d, e, f, g)
ROUND_16_63(42, 42, g, h, a, b, c, d, e, f)
ROUND_16_63(43, 43, f, g, h, a, b, c, d, e)
ROUND_16_63(44, 44, e, f, g, h, a, b, c, d)
ROUND_16_63(45, 45, d, e, f, g, h, a, b, c)
ROUND_16_63(46, 46, c, d, e, f, g, h, a, b)
ROUND_16_63(47, 47, b, c, d, e, f, g, h, a)
// rounds 48..63 reuse constant rows 16..31
ROUND_16_63(48, 16, a, b, c, d, e, f, g, h)
ROUND_16_63(49, 17, h, a, b, c, d, e, f, g)
ROUND_16_63(50, 18, g, h, a, b, c, d, e, f)
ROUND_16_63(51, 19, f, g, h, a, b, c, d, e)
ROUND_16_63(52, 20, e, f, g, h, a, b, c, d)
ROUND_16_63(53, 21, d, e, f, g, h, a, b, c)
ROUND_16_63(54, 22, c, d, e, f, g, h, a, b)
ROUND_16_63(55, 23, b, c, d, e, f, g, h, a)
ROUND_16_63(56, 24, a, b, c, d, e, f, g, h)
ROUND_16_63(57, 25, h, a, b, c, d, e, f, g)
ROUND_16_63(58, 26, g, h, a, b, c, d, e, f)
ROUND_16_63(59, 27, f, g, h, a, b, c, d, e)
ROUND_16_63(60, 28, e, f, g, h, a, b, c, d)
ROUND_16_63(61, 29, d, e, f, g, h, a, b, c)
ROUND_16_63(62, 30, c, d, e, f, g, h, a, b)
ROUND_16_63(63, 31, b, c, d, e, f, g, h, a)
// combine the working variables with the state saved at 0..112(BX)
// NOTE(review): xorm is defined outside this view; presumably it XORs and
// refreshes both the register and the buffer slot - confirm.
xorm( 0(BX), a)
xorm( 16(BX), b)
xorm( 32(BX), c)
xorm( 48(BX), d)
xorm( 64(BX), e)
xorm( 80(BX), f)
xorm( 96(BX), g)
xorm(112(BX), h)
// advance all four message pointers to the next 64-byte block
LEAQ 64(R8), R8
LEAQ 64(R9), R9
LEAQ 64(R10), R10
LEAQ 64(R11), R11
DECQ DX
JNZ loop
// transpose state
SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
// write the four resulting states back through the dig pointers
MOVQ (DI), R8
MOVOU a, (0*16)(R8)
MOVOU e, (1*16)(R8)
MOVQ 8(DI), R8
MOVOU b, (0*16)(R8)
MOVOU f, (1*16)(R8)
MOVQ 16(DI), R8
MOVOU c, (0*16)(R8)
MOVOU g, (1*16)(R8)
MOVQ 24(DI), R8
MOVOU d, (0*16)(R8)
MOVOU h, (1*16)(R8)
RET
// AVX variant of the same sequence
avx:
// load state
// digest 0 -> a/e, digest 1 -> b/f, digest 2 -> c/g, digest 3 -> d/h
MOVQ (DI), R8
VMOVDQU (0*16)(R8), a
VMOVDQU (1*16)(R8), e
MOVQ 8(DI), R8
VMOVDQU (0*16)(R8), b
VMOVDQU (1*16)(R8), f
MOVQ 16(DI), R8
VMOVDQU (0*16)(R8), c
VMOVDQU (1*16)(R8), g
MOVQ 24(DI), R8
VMOVDQU (0*16)(R8), d
VMOVDQU (1*16)(R8), h
// transpose state
TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
// store the transposed state to the first 128 bytes of the buffer
VMOVDQU a, (BX)
VMOVDQU b, 16(BX)
VMOVDQU c, 32(BX)
VMOVDQU d, 48(BX)
VMOVDQU e, 64(BX)
VMOVDQU f, 80(BX)
VMOVDQU g, 96(BX)
VMOVDQU h, 112(BX)
// AX = constant table, R8..R11 = the four message pointers
MOVQ $T256_4<>(SB), AX
MOVQ (SI), R8
MOVQ 8(SI), R9
MOVQ 16(SI), R10
MOVQ 24(SI), R11
avxLoop:
// load message block
avxPrepareFirst16Words(0)
avxPrepareFirst16Words(1)
avxPrepareFirst16Words(2)
avxPrepareFirst16Words(3)
// rounds 0..11; the argument order rotates by one register per round
AVX_ROUND_00_11(0, a, b, c, d, e, f, g, h)
AVX_ROUND_00_11(1, h, a, b, c, d, e, f, g)
AVX_ROUND_00_11(2, g, h, a, b, c, d, e, f)
AVX_ROUND_00_11(3, f, g, h, a, b, c, d, e)
AVX_ROUND_00_11(4, e, f, g, h, a, b, c, d)
AVX_ROUND_00_11(5, d, e, f, g, h, a, b, c)
AVX_ROUND_00_11(6, c, d, e, f, g, h, a, b)
AVX_ROUND_00_11(7, b, c, d, e, f, g, h, a)
AVX_ROUND_00_11(8, a, b, c, d, e, f, g, h)
AVX_ROUND_00_11(9, h, a, b, c, d, e, f, g)
AVX_ROUND_00_11(10, g, h, a, b, c, d, e, f)
AVX_ROUND_00_11(11, f, g, h, a, b, c, d, e)
// rounds 12..15
AVX_ROUND_12_15(12, e, f, g, h, a, b, c, d)
AVX_ROUND_12_15(13, d, e, f, g, h, a, b, c)
AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b)
AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a)
// rounds 16..47: constant index equals the round index
AVX_ROUND_16_63(16, 16, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(17, 17, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(18, 18, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(19, 19, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(20, 20, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(21, 21, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(22, 22, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(23, 23, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(24, 24, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(25, 25, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(26, 26, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(27, 27, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(28, 28, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(29, 29, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(30, 30, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(31, 31, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(32, 32, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(33, 33, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(34, 34, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(35, 35, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(36, 36, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(37, 37, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(38, 38, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(39, 39, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(40, 40, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(41, 41, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(42, 42, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(43, 43, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(44, 44, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(45, 45, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(46, 46, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(47, 47, b, c, d, e, f, g, h, a)
// rounds 48..63 reuse constant rows 16..31
AVX_ROUND_16_63(48, 16, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(49, 17, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(50, 18, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(51, 19, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(52, 20, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(53, 21, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(54, 22, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(55, 23, b, c, d, e, f, g, h, a)
AVX_ROUND_16_63(56, 24, a, b, c, d, e, f, g, h)
AVX_ROUND_16_63(57, 25, h, a, b, c, d, e, f, g)
AVX_ROUND_16_63(58, 26, g, h, a, b, c, d, e, f)
AVX_ROUND_16_63(59, 27, f, g, h, a, b, c, d, e)
AVX_ROUND_16_63(60, 28, e, f, g, h, a, b, c, d)
AVX_ROUND_16_63(61, 29, d, e, f, g, h, a, b, c)
AVX_ROUND_16_63(62, 30, c, d, e, f, g, h, a, b)
AVX_ROUND_16_63(63, 31, b, c, d, e, f, g, h, a)
// combine the working variables with the state saved at 0..112(BX)
// NOTE(review): avxXorm is defined outside this view; presumably it XORs and
// refreshes both the register and the buffer slot - confirm.
avxXorm( 0(BX), a)
avxXorm( 16(BX), b)
avxXorm( 32(BX), c)
avxXorm( 48(BX), d)
avxXorm( 64(BX), e)
avxXorm( 80(BX), f)
avxXorm( 96(BX), g)
avxXorm(112(BX), h)
// advance all four message pointers to the next 64-byte block
LEAQ 64(R8), R8
LEAQ 64(R9), R9
LEAQ 64(R10), R10
LEAQ 64(R11), R11
DECQ DX
JNZ avxLoop
// transpose state
TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
// write the four resulting states back through the dig pointers
MOVQ (DI), R8
VMOVDQU a, (0*16)(R8)
VMOVDQU e, (1*16)(R8)
MOVQ 8(DI), R8
VMOVDQU b, (0*16)(R8)
VMOVDQU f, (1*16)(R8)
MOVQ 16(DI), R8
VMOVDQU c, (0*16)(R8)
VMOVDQU g, (1*16)(R8)
MOVQ 24(DI), R8
VMOVDQU d, (0*16)(R8)
VMOVDQU h, (1*16)(R8)
RET

117
sm3/sm3blocks_test.go Normal file
View File

@ -0,0 +1,117 @@
//go:build (amd64 || arm64) && !purego
package sm3
import (
"fmt"
"testing"
)
// initState4 returns pointers to four independent chaining states, each
// holding the value that a freshly Reset digest keeps in its h field.
func initState4() [4]*[8]uint32 {
	base := new(digest)
	base.Reset()

	// Lane 0 aliases the digest's own state; the remaining lanes are copies.
	states := [4]*[8]uint32{&base.h}
	for i := 1; i < len(states); i++ {
		lane := base.h
		states[i] = &lane
	}
	return states
}
// createOneBlockBy4 returns pointers to four separate copies of one padded
// 64-byte block: the bytes "abc", the 0x80 padding marker, zero fill and the
// bit length 0x18 in the last byte.
func createOneBlockBy4() [4]*byte {
	block := [64]byte{0: 0x61, 1: 0x62, 2: 0x63, 3: 0x80, 63: 0x18}

	var lanes [4]*byte
	for i := range lanes {
		lane := block // fresh copy per lane so the buffers never alias
		lanes[i] = &lane[0]
	}
	return lanes
}
// createTwoBlocksBy4 returns pointers to four separate copies of a padded
// 128-byte input: "abcd" repeated to fill the first 64 bytes, followed by a
// second block with the 0x80 padding marker and 0x02 at offset 126.
func createTwoBlocksBy4() [4]*byte {
	const pattern = "abcd"

	var msg [128]byte
	for i := 0; i < 64; i++ {
		msg[i] = pattern[i%len(pattern)]
	}
	msg[64] = 0x80
	msg[126] = 0x02

	var lanes [4]*byte
	for i := range lanes {
		lane := msg // fresh copy per lane so the buffers never alias
		lanes[i] = &lane[0]
	}
	return lanes
}
// TestBlockMultBy4 feeds the same input to all four lanes of blockMultBy4,
// once with a single block and once with two blocks, and checks that every
// lane ends in the expected state.
func TestBlockMultBy4(t *testing.T) {
	// verify reports each lane whose state does not format to want.
	verify := func(states [4]*[8]uint32, want string) {
		for lane, st := range states {
			if got := fmt.Sprintf("%x", st[:]); got != want {
				t.Errorf("digs[%d] got %s", lane, got)
			}
		}
	}

	// Single block.
	states := initState4()
	msgs := createOneBlockBy4()
	scratch := make([]byte, 1216)
	blockMultBy4(&states[0], &msgs[0], &scratch[0], 1)
	verify(states, "[66c7f0f4 62eeedd9 d1f2d46b dc10e4e2 4167c487 5cf2f7a2 297da02b 8f4ba8e0]")

	// Two blocks, reusing the scratch buffer.
	states = initState4()
	msgs = createTwoBlocksBy4()
	blockMultBy4(&states[0], &msgs[0], &scratch[0], 2)
	verify(states, "[debe9ff9 2275b8a1 38604889 c18e5a4d 6fdb70e5 387e5765 293dcba3 9c0c5732]")
}
// BenchmarkOneBlockBy4 measures blockMultBy4 processing one 64-byte block in
// each of the four lanes per iteration.
func BenchmarkOneBlockBy4(b *testing.B) {
	const blocks = 1

	states := initState4()
	msgs := createOneBlockBy4()
	scratch := make([]byte, 1216)

	b.ReportAllocs()
	b.SetBytes(64 * blocks * 4)
	b.ResetTimer()
	for n := b.N; n > 0; n-- {
		blockMultBy4(&states[0], &msgs[0], &scratch[0], blocks)
	}
}
// BenchmarkTwoBlocksBy4 measures blockMultBy4 processing two 64-byte blocks
// in each of the four lanes per iteration.
func BenchmarkTwoBlocksBy4(b *testing.B) {
	const blocks = 2

	states := initState4()
	msgs := createTwoBlocksBy4()
	scratch := make([]byte, 1216)

	b.ReportAllocs()
	b.SetBytes(64 * blocks * 4)
	b.ResetTimer()
	for n := b.N; n > 0; n-- {
		blockMultBy4(&states[0], &msgs[0], &scratch[0], blocks)
	}
}