gmsm/sm3/sm3blocks_arm64.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
#include "sm3_const_asm.s"

// Working-state registers used by blockMultBy4. The mapping is interleaved
// (a/e, b/f, c/g, d/h share adjacent vector registers) so that the
// two-register VLD1/VST1 forms in blockMultBy4, e.g. [a.S4, e.S4], name
// neighbouring registers.
#define a V0
#define e V1
#define b V2
#define f V3
#define c V4
#define g V5
#define d V6
#define h V7

// Scratch registers passed to TRANSPOSE_MATRIX and LOAD_T.
// Note: tmp3/tmp4 are V10/V11, which the round and message-schedule macros
// also use directly by register name.
#define tmp1 V8
#define tmp2 V9
#define tmp3 V10
#define tmp4 V11

// Copies of a..h taken at the top of each loop iteration in blockMultBy4 and
// XORed back into the state after the 64 rounds.
#define aSave V24
#define bSave V25
#define cSave V26
#define dSave V27
#define eSave V28
#define fSave V29
#define gSave V30
#define hSave V31

// TRANSPOSE_MATRIX transposes, in place, the 4x4 matrix of 32-bit words held
// in t0..t3 (one row per register).
// The first four VZIP1/VZIP2 interleave 32-bit lanes of row pairs into
// RTMP0..RTMP3; the last four interleave 64-bit halves back into t0..t3.
// RTMP0..RTMP3 are overwritten and must not alias t0..t3.
//
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
	VZIP1 t1.S4, t0.S4, RTMP0.S4               \
	VZIP1 t3.S4, t2.S4, RTMP1.S4               \
	VZIP2 t1.S4, t0.S4, RTMP2.S4               \
	VZIP2 t3.S4, t2.S4, RTMP3.S4               \
	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2            \
	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2            \
	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2            \
	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2            \

// PROLD rotates every 32-bit lane of s left by n bits and writes the result
// to r: VSHL puts s << n into r, then VSRI inserts s >> (32-n) into the low
// n bits of r.
// r must be a different register from s, because r is overwritten by the
// first instruction while s is still needed by the second.
// r = s <<< n
#define PROLD(s, r, n) \
	VSHL $(n), s.S4, r.S4     \
	VSRI $(32-n), s.S4, r.S4  \

// loadWordByIndex loads the i-th 16-byte entry of the message-word buffer
// (address wordStart + 16*i) into vector register W.
// Clobbers R20, which is used to hold the computed address.
#define loadWordByIndex(W, i) \
	ADD $(16*(i)), wordStart, R20 \
	VLD1 (R20), [W.S4]          \ 

// prepare4Words reads the next 16 bytes from each of the four source pointers
// (advancing each by 16), transposes the resulting 4x4 word matrix so that
// each vector holds the same word position of all four inputs, byte-reverses
// every 32-bit word, and appends the four vectors (64 bytes) at wordPtr,
// advancing wordPtr by 64.
// Clobbers V12-V15 and tmp1-tmp4.
#define prepare4Words \
	VLD1.P 16(srcPtr1), [V12.B16] \
	VLD1.P 16(srcPtr2), [V13.B16] \
	VLD1.P 16(srcPtr3), [V14.B16] \
	VLD1.P 16(srcPtr4), [V15.B16] \	
	TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4); \
	VREV32 V12.B16, V12.B16; \
	VREV32 V13.B16, V13.B16; \
	VREV32 V14.B16, V14.B16; \
	VREV32 V15.B16, V15.B16; \
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr)

// LOAD_T broadcasts the 32-bit immediate `const` into all four 32-bit lanes
// of vector register T. Clobbers R20.
#define LOAD_T(const, T) \
	MOVW $const, R20     \
	VDUP R20, T.S4       \

// ROUND_00_11 performs one compression round on four lanes at once, using the
// XOR form of both boolean functions (a XOR b XOR c, e XOR f XOR g).
//
//   index - round number; words `index` and `index+4` are read from the
//           message-word buffer, so both must already be stored there.
//   const - round constant symbol, broadcast with LOAD_T.
//   a..h  - the eight working-state registers in their current roles.
//
// Only b, d, f and h are written: b = b <<< 9, f = f <<< 19, h = TT1 and
// d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17). The callers rotate the argument
// list by one position per round instead of moving registers.
// Clobbers V10-V14, tmp1 and R20.
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, V12, 12)                \ // V12 = a <<< 12
	VMOV V12.B16, V13.B16            \ // V13 = a <<< 12, kept for SS2
	LOAD_T(const, tmp1)              \ // tmp1 = round constant in every lane
	VADD tmp1.S4, V12.S4, V12.S4     \
	VADD e.S4, V12.S4, V12.S4        \ // V12 = (a <<< 12) + const + e
	PROLD(V12, V14, 7)               \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16   \ // V12 = SS2
	VEOR a.B16, b.B16, V13.B16       \
	VEOR c.B16, V13.B16, V13.B16     \
	VADD V13.S4, d.S4, V13.S4        \ // V13 = (a XOR b XOR c) + d
	loadWordByIndex(V10, index)      \ // V10 = Wt
	loadWordByIndex(V11, index+4)    \ // V11 = Wt+4
	VEOR V10.B16, V11.B16, V11.B16   \ // V11 = Wt XOR Wt+4
	VADD V11.S4, V13.S4, V13.S4      \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4      \ // TT1
	VADD h.S4, V10.S4, V10.S4        \ // Wt + h
	VADD V14.S4, V10.S4, V10.S4      \ // Wt + h + SS1
	VEOR e.B16, f.B16, V11.B16       \
	VEOR g.B16, V11.B16, V11.B16     \
	VADD V11.S4, V10.S4, V10.S4      \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VMOV b.B16, V11.B16              \ // copy first: PROLD needs distinct source and destination
	PROLD(V11, b, 9)                 \ // b = b <<< 9
	VMOV V13.B16, h.B16              \ // h = TT1
	VMOV f.B16, V11.B16              \ // copy first: PROLD needs distinct source and destination
	PROLD(V11, f, 19)                \ // f = f <<< 19
	PROLD(V10, V11, 9)               \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8)               \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16   \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16     \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// MESSAGE_SCHEDULE computes the expanded message word W[j] for j = index+4
// from earlier entries of the message-word buffer:
//
//   X    = W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)
//   P1   = X XOR (X <<< 15) XOR (X <<< 23)
//   W[j] = P1 XOR (W[j-13] <<< 7) XOR W[j-6]
//
// The result is stored at wordPtr (which is advanced by 16 bytes) and is also
// left in V11 on exit. Clobbers V10, V11, V12 and R20.
#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(V10, index+1)    \ // Wj-3
	PROLD(V10, V11, 15)              \ // V11 = Wj-3 <<< 15
	loadWordByIndex(V10, index-12)   \ // Wj-16
	VEOR V10.B16, V11.B16, V10.B16   \
	loadWordByIndex(V11, index-5)    \ // Wj-9
	VEOR V10.B16, V11.B16, V10.B16   \ // V10 = X
	PROLD(V10, V11, 15)              \ // V11 = X <<< 15
	PROLD(V11, V12, 8)               \ // V12 = X <<< 23
	VEOR V11.B16, V10.B16, V10.B16   \
	VEOR V12.B16, V10.B16, V10.B16   \ // P1
	loadWordByIndex(V11, index-9)    \ // Wj-13
	PROLD(V11, V12, 7)               \ // V12 = Wj-13 <<< 7
	VEOR V12.B16, V10.B16, V10.B16   \
	loadWordByIndex(V11, index-2)    \ // Wj-6
	VEOR V11.B16, V10.B16, V11.B16   \ // V11 = Wj
	VST1.P [V11.S4], 16(wordPtr)     \

// ROUND_12_15 is ROUND_00_11 preceded by MESSAGE_SCHEDULE(index), which
// stores word index+4 into the message-word buffer before the round reads it.
// Clobbers everything the two macros clobber.
#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)                               \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)     \

// ROUND_16_63 performs one compression round on four lanes at once, using the
// majority function on (a, b, c) and the choose function on (e, f, g).
//
//   index - round number; MESSAGE_SCHEDULE(index) first produces word index+4
//           (left in V11 and stored to the buffer), then word `index` is read
//           back from the buffer.
//   const - round constant symbol, broadcast with LOAD_T.
//   a..h  - the eight working-state registers in their current roles.
//
// Only b, d, f and h are written: b = b <<< 9, f = f <<< 19, h = TT1 and
// d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17). The callers rotate the argument
// list by one position per round instead of moving registers.
// Clobbers V10-V14, tmp1 and R20, and advances wordPtr by 16.
#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)          \ // V11 is Wt+4 now, Pls do not use it
	PROLD(a, V12, 12)                \ // V12 = a <<< 12
	VMOV V12.B16, V13.B16            \ // V13 = a <<< 12, kept for SS2
	LOAD_T(const, tmp1)              \ // tmp1 = round constant in every lane
	VADD tmp1.S4, V12.S4, V12.S4     \
	VADD e.S4, V12.S4, V12.S4        \ // V12 = (a <<< 12) + const + e
	PROLD(V12, V14, 7)               \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16   \ // V12 = SS2
	VORR a.B16, b.B16, V10.B16       \ // V10 = a OR b
	VAND a.B16, b.B16, V13.B16       \ // V13 = a AND b
	VAND c.B16, V10.B16, V10.B16     \ // V10 = (a OR b) AND c
	VORR V13.B16, V10.B16, V13.B16   \ // (a AND b) OR (a AND c) OR (b AND c)
	VADD V13.S4, d.S4, V13.S4        \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(V10, index)      \ // Wj
	VEOR V10.B16, V11.B16, V11.B16   \ // Wj XOR Wj+4
	VADD V13.S4, V11.S4, V13.S4      \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4      \ // TT1
	VADD h.S4, V10.S4, V10.S4        \ // Wt + h
	VADD V14.S4, V10.S4, V10.S4      \ // Wt + h + SS1
	VEOR f.B16, g.B16, V11.B16       \
	VAND V11.B16, e.B16, V11.B16     \
	VEOR g.B16, V11.B16, V11.B16     \ // (f XOR g) AND e XOR g
	VADD V10.S4, V11.S4, V10.S4      \ // TT2
	VMOV b.B16, V11.B16              \ // copy first: PROLD needs distinct source and destination
	PROLD(V11, b, 9)                 \ // b = b <<< 9
	VMOV V13.B16, h.B16              \ // h = TT1
	VMOV f.B16, V11.B16              \ // copy first: PROLD needs distinct source and destination
	PROLD(V11, f, 19)                \ // f = f <<< 19
	PROLD(V10, V11, 9)               \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8)               \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16   \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16     \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Runs the 64-round SM3 compression over four independent inputs in
// parallel, one input per 32-bit vector lane.
//
//   dig    - points to four pointers, each to an 8-word state that is read on
//            entry and written back on exit.
//   p      - points to four source pointers; 64 bytes are consumed from each
//            per block.
//   buffer - scratch area for the transposed/expanded message words. Each
//            block writes 68 entries of 16 bytes (16 by prepare4Words plus
//            one per MESSAGE_SCHEDULE in rounds 12..63), i.e. 1088 bytes.
//   blocks - number of 64-byte blocks to process per input. A value below 1
//            returns immediately without touching dig or buffer.
//
// Register use: R0, R1, R3-R10 and R20; V0-V15 and V24-V31.
TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R0
#define srcPtrPtr R1
#define blockCount R3
#define digSave R4
#define wordStart R5
#define srcPtr1 R6
#define srcPtr2 R7
#define srcPtr3 R8
#define srcPtr4 R9
#define wordPtr R10
	MOVD	dig+0(FP), digPtr
	MOVD	p+8(FP), srcPtrPtr
	MOVD	buffer+16(FP), wordStart
	MOVD	blocks+24(FP), blockCount

	// The loop below tests its counter only after the body has run, so a
	// zero or negative count would process a block and then wrap the counter.
	// Bail out before touching any memory in that case.
	CMP $1, blockCount
	BLT done

	// load state: each VLD1 reads one 8-word state into a register pair
	MOVD digPtr, digSave
	MOVD.P 8(digPtr), R20
	VLD1 (R20), [a.S4, e.S4]
	MOVD.P 8(digPtr), R20
	VLD1 (R20), [b.S4, f.S4]
	MOVD.P 8(digPtr), R20
	VLD1 (R20), [c.S4, g.S4]
	MOVD (digPtr), R20
	VLD1 (R20), [d.S4, h.S4]

	// transpose state: afterwards each of a..h holds one state word of all
	// four inputs
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// The source pointers are loaded once; prepare4Words advances them, so
	// each loop iteration continues with the next 64-byte block.
	MOVD.P 8(srcPtrPtr), srcPtr1
	MOVD.P 8(srcPtrPtr), srcPtr2
	MOVD.P 8(srcPtrPtr), srcPtr3
	MOVD (srcPtrPtr), srcPtr4

loop:
	// save state
	VMOV a.B16, aSave.B16
	VMOV b.B16, bSave.B16
	VMOV c.B16, cSave.B16
	VMOV d.B16, dSave.B16
	VMOV e.B16, eSave.B16
	VMOV f.B16, fSave.B16
	VMOV g.B16, gSave.B16
	VMOV h.B16, hSave.B16

	// reset wordPtr
	MOVD wordStart, wordPtr

	// load message block: 4 x 4 words = W0..W15 for all four inputs
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	// Each round rotates the argument list by one position instead of
	// moving the state between registers.
	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	// NOTE(review): rounds 48..63 reuse T16..T31, presumably because the
	// round constant repeats with period 32; confirm against sm3_const_asm.s.
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)

	// feed-forward: fold the state saved at the top of the loop back in
	VEOR a.B16, aSave.B16, a.B16
	VEOR b.B16, bSave.B16, b.B16
	VEOR c.B16, cSave.B16, c.B16
	VEOR d.B16, dSave.B16, d.B16
	VEOR e.B16, eSave.B16, e.B16
	VEOR f.B16, fSave.B16, f.B16
	VEOR g.B16, gSave.B16, g.B16
	VEOR h.B16, hSave.B16, h.B16

	SUB $1, blockCount
	CBNZ blockCount, loop

	// transpose state back to one 8-word state per register pair
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// store the four states through the pointers saved in digSave
	MOVD.P 8(digSave), R20
	VST1 [a.S4, e.S4], (R20)
	MOVD.P 8(digSave), R20
	VST1 [b.S4, f.S4], (R20)
	MOVD.P 8(digSave), R20
	VST1 [c.S4, g.S4], (R20)
	MOVD (digSave), R20
	VST1 [d.S4, h.S4], (R20)

done:
	RET

// Drop the blockMultBy4 names that are given a new meaning below.
#undef digPtr
#undef a
#undef b
#undef c
#undef d
#undef e
#undef f
#undef g
#undef h

// copyResultsBy4 uses a plain sequential mapping (a..h = V0..V7) so that its
// four-register VLD1/VST1 lists [a, b, c, d] and [e, f, g, h] name adjacent
// registers.
#define a V0
#define b V1
#define c V2
#define d V3
#define e V4
#define f V5
#define g V6
#define h V7
// func copyResultsBy4(dig *uint32, dst *byte)
//
// Copies 128 bytes (32 words of 32 bits) from dig to dst, reversing the byte
// order inside every 32-bit word on the way.
//
//   dig - source; 128 bytes are read.
//   dst - destination; 128 bytes are written.
//
// Both loads complete before the first store.
// Register use: R0, R1 and V0-V7.
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
#define digPtr R0
#define dstPtr R1
	MOVD	dig+0(FP), digPtr
	MOVD	dst+8(FP), dstPtr

	// load state: 2 x 64 bytes into V0-V3 and V4-V7
	VLD1.P 64(digPtr), [a.S4, b.S4, c.S4, d.S4]
	VLD1 (digPtr), [e.S4, f.S4, g.S4, h.S4]

	// byte-swap every 32-bit word
	VREV32 a.B16, a.B16
	VREV32 b.B16, b.B16
	VREV32 c.B16, c.B16
	VREV32 d.B16, d.B16
	VREV32 e.B16, e.B16
	VREV32 f.B16, f.B16
	VREV32 g.B16, g.B16
	VREV32 h.B16, h.B16

	// store result: 2 x 64 bytes
	VST1.P [a.B16, b.B16, c.B16, d.B16], 64(dstPtr)
	VST1 [e.B16, f.B16, g.B16, h.B16], (dstPtr)

	RET
sm3: s390x kdf v1 2024-09-04 11:24:56 +08:00			`// Copyright 2024 Sun Yimin. All rights reserved.`
			`// Use of this source code is governed by a MIT-style`
			`// license that can be found in the LICENSE file.`

internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`//go:build !purego`

			`#include "textflag.h"`
			`#include "sm3_const_asm.s"`

			`#define a V0`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`#define e V1`
			`#define b V2`
			`#define f V3`
			`#define c V4`
			`#define g V5`
			`#define d V6`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`#define h V7`

			`#define tmp1 V8`
			`#define tmp2 V9`
			`#define tmp3 V10`
			`#define tmp4 V11`

sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`#define aSave V24`
			`#define bSave V25`
			`#define cSave V26`
			`#define dSave V27`
			`#define eSave V28`
			`#define fSave V29`
			`#define gSave V30`
			`#define hSave V31`

internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`// input: from high to low`
			`// t0 = t0.S3, t0.S2, t0.S1, t0.S0`
			`// t1 = t1.S3, t1.S2, t1.S1, t1.S0`
			`// t2 = t2.S3, t2.S2, t2.S1, t2.S0`
			`// t3 = t3.S3, t3.S2, t3.S1, t3.S0`
			`// output: from high to low`
			`// t0 = t3.S0, t2.S0, t1.S0, t0.S0`
			`// t1 = t3.S1, t2.S1, t1.S1, t0.S1`
			`// t2 = t3.S2, t2.S2, t1.S2, t0.S2`
			`// t3 = t3.S3, t2.S3, t1.S3, t0.S3`
			`#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \`
			`VZIP1 t1.S4, t0.S4, RTMP0.S4 \`
			`VZIP1 t3.S4, t2.S4, RTMP1.S4 \`
			`VZIP2 t1.S4, t0.S4, RTMP2.S4 \`
			`VZIP2 t3.S4, t2.S4, RTMP3.S4 \`
			`VZIP1 RTMP1.D2, RTMP0.D2, t0.D2 \`
			`VZIP2 RTMP1.D2, RTMP0.D2, t1.D2 \`
			`VZIP1 RTMP3.D2, RTMP2.D2, t2.D2 \`
			`VZIP2 RTMP3.D2, RTMP2.D2, t3.D2 \`

			`// r = s <<< n`
			`#define PROLD(s, r, n) \`
			`VSHL $(n), s.S4, r.S4 \`
			`VSRI $(32-n), s.S4, r.S4 \`

			`#define loadWordByIndex(W, i) \`
			`ADD $(16*(i)), wordStart, R20 \`
			`VLD1 (R20), [W.S4] \`

			`#define prepare4Words \`
			`VLD1.P 16(srcPtr1), [V12.B16] \`
			`VLD1.P 16(srcPtr2), [V13.B16] \`
			`VLD1.P 16(srcPtr3), [V14.B16] \`
			`VLD1.P 16(srcPtr4), [V15.B16] \`
			`TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4); \`
			`VREV32 V12.B16, V12.B16; \`
			`VREV32 V13.B16, V13.B16; \`
			`VREV32 V14.B16, V14.B16; \`
			`VREV32 V15.B16, V15.B16; \`
			`VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr)`

			`#define LOAD_T(const, T) \`
			`MOVW $const, R20 \`
			`VDUP R20, T.S4 \`

			`#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \`
			`PROLD(a, V12, 12) \`
			`VMOV V12.B16, V13.B16 \`
			`LOAD_T(const, tmp1) \`
			`VADD tmp1.S4, V12.S4, V12.S4 \`
			`VADD e.S4, V12.S4, V12.S4 \`
			`PROLD(V12, V14, 7) \ // V14 = SS1`
			`VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2`
			`VEOR a.B16, b.B16, V13.B16 \`
			`VEOR c.B16, V13.B16, V13.B16 \`
			`VADD V13.S4, d.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d`
			`loadWordByIndex(V10, index) \`
			`loadWordByIndex(V11, index+4) \`
			`VEOR V10.B16, V11.B16, V11.B16 \`
			`VADD V11.S4, V13.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)`
			`VADD V13.S4, V12.S4, V13.S4 \ // TT1`
			`VADD h.S4, V10.S4, V10.S4 \`
			`VADD V14.S4, V10.S4, V10.S4 \ // Wt + h + SS1`
			`VEOR e.B16, f.B16, V11.B16 \`
			`VEOR g.B16, V11.B16, V11.B16 \`
			`VADD V11.S4, V10.S4, V10.S4 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1`
			`VMOV b.B16, V11.B16 \`
			`PROLD(V11, b, 9) \ // b = b <<< 9`
			`VMOV V13.B16, h.B16 \ // h = TT1`
			`VMOV f.B16, V11.B16 \`
			`PROLD(V11, f, 19) \ // f = f <<< 19`
			`PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9`
			`PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17`
			`VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)`
			`VEOR V11.B16, V12.B16, d.B16 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)`

			`#define MESSAGE_SCHEDULE(index) \`
			`loadWordByIndex(V10, index+1) \ // Wj-3`
			`PROLD(V10, V11, 15) \`
			`loadWordByIndex(V10, index-12) \ // Wj-16`
			`VEOR V10.B16, V11.B16, V10.B16 \`
			`loadWordByIndex(V11, index-5) \ // Wj-9`
			`VEOR V10.B16, V11.B16, V10.B16 \`
			`PROLD(V10, V11, 15) \`
			`PROLD(V11, V12, 8) \`
			`VEOR V11.B16, V10.B16, V10.B16 \`
			`VEOR V12.B16, V10.B16, V10.B16 \ // P1`
			`loadWordByIndex(V11, index-9) \ // Wj-13`
			`PROLD(V11, V12, 7) \`
			`VEOR V12.B16, V10.B16, V10.B16 \`
			`loadWordByIndex(V11, index-2) \ // Wj-6`
			`VEOR V11.B16, V10.B16, V11.B16 \`
			`VST1.P [V11.S4], 16(wordPtr) \`

			`#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \`
			`MESSAGE_SCHEDULE(index) \`
			`ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \`

			`#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \`
			`MESSAGE_SCHEDULE(index) \ // V11 is Wt+4 now, Pls do not use it`
			`PROLD(a, V12, 12) \`
			`VMOV V12.B16, V13.B16 \`
			`LOAD_T(const, tmp1) \`
			`VADD tmp1.S4, V12.S4, V12.S4 \`
			`VADD e.S4, V12.S4, V12.S4 \`
			`PROLD(V12, V14, 7) \ // V14 = SS1`
			`VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2`
			`VORR a.B16, b.B16, V10.B16 \`
			`VAND a.B16, b.B16, V13.B16 \`
			`VAND c.B16, V10.B16, V10.B16 \`
			`VORR V13.B16, V10.B16, V13.B16 \ // (a AND b) OR (a AND c) OR (b AND c)`
			`VADD V13.S4, d.S4, V13.S4 \ // (a AND b) OR (a AND c) OR (b AND c) + d`
			`loadWordByIndex(V10, index) \ // Wj`
			`VEOR V10.B16, V11.B16, V11.B16 \ // Wj XOR Wj+4`
			`VADD V13.S4, V11.S4, V13.S4 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)`
			`VADD V13.S4, V12.S4, V13.S4 \ // TT1`
			`VADD h.S4, V10.S4, V10.S4 \ // Wt + h`
			`VADD V14.S4, V10.S4, V10.S4 \ // Wt + h + SS1`
			`VEOR f.B16, g.B16, V11.B16 \`
			`VAND V11.B16, e.B16, V11.B16 \`
			`VEOR g.B16, V11.B16, V11.B16 \ // (f XOR g) AND e XOR g`
			`VADD V10.S4, V11.S4, V10.S4 \ // TT2`
			`VMOV b.B16, V11.B16 \`
			`PROLD(V11, b, 9) \ // b = b <<< 9`
			`VMOV V13.B16, h.B16 \ // h = TT1`
			`VMOV f.B16, V11.B16 \`
			`PROLD(V11, f, 19) \ // f = f <<< 19`
			`PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9`
			`PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17`
			`VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)`
			`VEOR V11.B16, V12.B16, d.B16 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)`

			`// blockMultBy4(dig [8]uint32, p byte, buffer *byte, blocks int)`
			`TEXT ·blockMultBy4(SB), NOSPLIT, $0`
			`#define digPtr R0`
			`#define srcPtrPtr R1`
			`#define blockCount R3`
			`#define digSave R4`
			`#define wordStart R5`
			`#define srcPtr1 R6`
			`#define srcPtr2 R7`
			`#define srcPtr3 R8`
			`#define srcPtr4 R9`
			`#define wordPtr R10`
			`MOVD dig+0(FP), digPtr`
			`MOVD p+8(FP), srcPtrPtr`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`MOVD buffer+16(FP), wordStart`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`MOVD blocks+24(FP), blockCount`

			`// load state`
			`MOVD digPtr, digSave`
			`MOVD.P 8(digPtr), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VLD1 (R20), [a.S4, e.S4]`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`MOVD.P 8(digPtr), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VLD1 (R20), [b.S4, f.S4]`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`MOVD.P 8(digPtr), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VLD1 (R20), [c.S4, g.S4]`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`MOVD (digPtr), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VLD1 (R20), [d.S4, h.S4]`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00
			`// transpose state`
			`TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)`
			`TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)`

			`MOVD.P 8(srcPtrPtr), srcPtr1`
			`MOVD.P 8(srcPtrPtr), srcPtr2`
			`MOVD.P 8(srcPtrPtr), srcPtr3`
			`MOVD (srcPtrPtr), srcPtr4`

			`loop:`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`// save state`
			`VMOV a.B16, aSave.B16`
			`VMOV b.B16, bSave.B16`
			`VMOV c.B16, cSave.B16`
			`VMOV d.B16, dSave.B16`
			`VMOV e.B16, eSave.B16`
			`VMOV f.B16, fSave.B16`
			`VMOV g.B16, gSave.B16`
			`VMOV h.B16, hSave.B16`

internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`// reset wordPtr`
			`MOVD wordStart, wordPtr`

			`// load message block`
			`prepare4Words`
			`prepare4Words`
			`prepare4Words`
			`prepare4Words`

			`ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)`
			`ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)`
			`ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)`
			`ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)`
			`ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)`
			`ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)`
			`ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)`
			`ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)`
			`ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)`
			`ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)`
			`ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)`
			`ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)`

			`ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)`
			`ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)`
			`ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)`
			`ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)`

			`ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)`
			`ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)`
			`ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)`
			`ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)`
			`ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)`
			`ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)`
			`ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)`
			`ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)`
			`ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)`
			`ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)`
			`ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)`
			`ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)`
			`ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)`
			`ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)`
			`ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)`
			`ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)`
			`ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)`
			`ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)`
			`ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)`
			`ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)`
			`ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)`
			`ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)`
			`ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)`
			`ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)`
			`ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)`
			`ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)`
			`ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)`
			`ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)`
			`ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)`
			`ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)`
			`ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)`
			`ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)`
			`ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)`
			`ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)`
			`ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)`
			`ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)`
			`ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)`
			`ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)`
			`ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)`
			`ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)`
			`ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)`
			`ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)`
			`ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)`
			`ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)`
			`ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)`
			`ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)`
			`ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)`
			`ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)`

sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VEOR a.B16, aSave.B16, a.B16`
			`VEOR b.B16, bSave.B16, b.B16`
			`VEOR c.B16, cSave.B16, c.B16`
			`VEOR d.B16, dSave.B16, d.B16`
			`VEOR e.B16, eSave.B16, e.B16`
			`VEOR f.B16, fSave.B16, f.B16`
			`VEOR g.B16, gSave.B16, g.B16`
			`VEOR h.B16, hSave.B16, h.B16`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00
			`SUB $1, blockCount`
			`CBNZ blockCount, loop`

			`// transpose state`
			`TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)`
			`TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)`

			`MOVD.P 8(digSave), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VST1 [a.S4, e.S4], (R20)`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`MOVD.P 8(digSave), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VST1 [b.S4, f.S4], (R20)`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`MOVD.P 8(digSave), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VST1 [c.S4, g.S4], (R20)`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`MOVD (digSave), R20`
sm3: kdf arm64 optimization 2024-09-04 16:34:43 +08:00			`VST1 [d.S4, h.S4], (R20)`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00
			`RET`

			`#undef digPtr`
sm3: kdf arm64, fix marcro issue 2024-09-04 16:44:57 +08:00			`#undef a`
			`#undef b`
			`#undef c`
			`#undef d`
			`#undef e`
			`#undef f`
			`#undef g`
			`#undef h`

			`#define a V0`
			`#define b V1`
			`#define c V2`
			`#define d V3`
			`#define e V4`
			`#define f V5`
			`#define g V6`
			`#define h V7`
internal/cryptotest: add tests for the hash.Hash cipher.BlockMode cipher.Block interfaces 2024-08-02 13:02:25 +08:00			`// func copyResultsBy4(dig uint32, dst byte)`
			`TEXT ·copyResultsBy4(SB),NOSPLIT,$0`
			`#define digPtr R0`
			`#define dstPtr R1`
			`MOVD dig+0(FP), digPtr`
			`MOVD dst+8(FP), dstPtr`

			`// load state`
			`VLD1.P 64(digPtr), [a.S4, b.S4, c.S4, d.S4]`
			`VLD1 (digPtr), [e.S4, f.S4, g.S4, h.S4]`

			`VREV32 a.B16, a.B16`
			`VREV32 b.B16, b.B16`
			`VREV32 c.B16, c.B16`
			`VREV32 d.B16, d.B16`
			`VREV32 e.B16, e.B16`
			`VREV32 f.B16, f.B16`
			`VREV32 g.B16, g.B16`
			`VREV32 h.B16, h.B16`

			`VST1.P [a.B16, b.B16, c.B16, d.B16], 64(dstPtr)`
			`VST1 [e.B16, f.B16, g.B16, h.B16], (dstPtr)`

			`RET`