diff --git a/sm3/kdf_mult_asm.go b/sm3/kdf_mult_asm.go
index b289e46..520867c 100644
--- a/sm3/kdf_mult_asm.go
+++ b/sm3/kdf_mult_asm.go
@@ -28,9 +28,10 @@ func copyResult(result []byte, dig *[8]uint32) {
 	binary.BigEndian.PutUint32(result[28:], dig[7])
 }
 
+// state || words
 // 1216 = 68 * 4 * 4 + 8 * 4 * 4 = 76 * 16
-// 64 * 2 * 4 = 512
-const preallocSize = 1728
+// 8 * 16 = 128
+const preallocSize = 1344
 
 func kdfBy4(baseMD *digest, keyLen int, limit int) []byte {
 	if limit < 4 {
diff --git a/sm3/sm3blocks_arm64.s b/sm3/sm3blocks_arm64.s
index 09e7803..ec35b77 100644
--- a/sm3/sm3blocks_arm64.s
+++ b/sm3/sm3blocks_arm64.s
@@ -37,10 +37,10 @@
 	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2 \
 	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2 \
 
-// d = s <<< n
-#define PROLD(s, d, n) \
-	VSHL $(n), s.S4, d.S4 \
-	VSRI $(32-n), s.S4, d.S4 \
+// r = s <<< n
+#define PROLD(s, r, n) \
+	VSHL $(n), s.S4, r.S4 \
+	VSRI $(32-n), s.S4, r.S4 \
 
 #define loadWordByIndex(W, i) \
 	ADD $(16*(i)), wordStart, R20 \
@@ -191,7 +191,7 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
 	VST1.P [a.S4, b.S4, c.S4, d.S4], 64(wordStart)
 	VST1.P [e.S4, f.S4, g.S4, h.S4], 64(wordStart)
 	MOVD wordStart, wordPtr
-/*
+
 	MOVD.P 8(srcPtrPtr), srcPtr1
 	MOVD.P 8(srcPtrPtr), srcPtr2
 	MOVD.P 8(srcPtrPtr), srcPtr3
@@ -305,5 +305,5 @@ loop:
 	MOVD (digSave), R20
 	VST1.P [d.S4], 16(R20)
 	VST1 [h.S4], (R20)
-*/
+
 	RET