// Source file: gmsm/sm3/sm3blocks_arm64.s (ARM64 assembly, Go assembler syntax).
// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
//go:build !purego
#include "textflag.h"
#include "sm3_const_asm.s"
// Working state a..h. After the transpose in blockMultBy4 each of these
// vectors holds one state word for four digests, one digest per 32-bit lane.
// The pairs a/e, b/f, c/g and d/h sit in adjacent registers because
// blockMultBy4 moves each 32-byte digest with a single two-register
// VLD1/VST1 list: [a, e], [b, f], [c, g], [d, h].
#define a V0
#define e V1
#define b V2
#define f V3
#define c V4
#define g V5
#define d V6
#define h V7

// Scratch registers for TRANSPOSE_MATRIX and LOAD_T.
// Note that the round macros also address V10..V15 directly, so tmp3 and
// tmp4 name the same registers as their V10/V11 scratch values.
#define tmp1 V8
#define tmp2 V9
#define tmp3 V10
#define tmp4 V11

// Copy of the state taken at the top of every block iteration in
// blockMultBy4; it is XORed back into a..h after the last round.
#define aSave V24
#define bSave V25
#define cSave V26
#define dSave V27
#define eSave V28
#define fSave V29
#define gSave V30
#define hSave V31
// TRANSPOSE_MATRIX transposes, in place, the 4x4 matrix of 32-bit words whose
// rows are the registers t0..t3. RTMP0..RTMP3 are overwritten as scratch and
// must not overlap t0..t3.
//
// Lanes are listed from high to low.
//   before:
//     t0 = t0.S3, t0.S2, t0.S1, t0.S0
//     t1 = t1.S3, t1.S2, t1.S1, t1.S0
//     t2 = t2.S3, t2.S2, t2.S1, t2.S0
//     t3 = t3.S3, t3.S2, t3.S1, t3.S0
//   after:
//     t0 = t3.S0, t2.S0, t1.S0, t0.S0
//     t1 = t3.S1, t2.S1, t1.S1, t0.S1
//     t2 = t3.S2, t2.S2, t1.S2, t0.S2
//     t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
	VZIP1 t1.S4, t0.S4, RTMP0.S4 \ // RTMP0 = t1.S1, t0.S1, t1.S0, t0.S0
	VZIP2 t1.S4, t0.S4, RTMP2.S4 \ // RTMP2 = t1.S3, t0.S3, t1.S2, t0.S2
	VZIP1 t3.S4, t2.S4, RTMP1.S4 \ // RTMP1 = t3.S1, t2.S1, t3.S0, t2.S0
	VZIP2 t3.S4, t2.S4, RTMP3.S4 \ // RTMP3 = t3.S3, t2.S3, t3.S2, t2.S2
	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2 \ // t0 = low halves of RTMP1:RTMP0
	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2 \ // t1 = high halves of RTMP1:RTMP0
	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2 \ // t2 = low halves of RTMP3:RTMP2
	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2   // t3 = high halves of RTMP3:RTMP2
// PROLD rotates every 32-bit lane of s left by the immediate n: r = s <<< n.
// r must be a different register from s, because r is written by the first
// instruction while s is still read by the second.
// The last body line carries no trailing backslash, so the definition ends
// here regardless of what follows it in the file.
#define PROLD(s, r, n) \
	VSHL $(n), s.S4, r.S4 \ // r = s << n
	VSRI $(32-(n)), s.S4, r.S4 // insert s >> (32-n) into the low n bits of r
// loadWordByIndex loads the 16-byte vector stored at wordStart + 16*i into W.
// Clobbers R20 (used to form the address).
// The last body line carries no trailing backslash, so the definition ends
// here regardless of what follows it in the file.
#define loadWordByIndex(W, i) \
	ADD $(16*(i)), wordStart, R20 \
	VLD1 (R20), [W.S4]
// prepare4Words reads the next 16 bytes from each of the four input pointers
// (post-incrementing srcPtr1..srcPtr4), transposes the four vectors so that
// each one holds the same message word of all four inputs, byte-swaps every
// 32-bit word, and appends the resulting 64 bytes at wordPtr
// (post-incremented by 64).
// Clobbers V12..V15 and tmp1..tmp4.
#define prepare4Words \
	VLD1.P 16(srcPtr1), [V12.B16] \ // 16 bytes of input 1
	VLD1.P 16(srcPtr2), [V13.B16] \ // 16 bytes of input 2
	VLD1.P 16(srcPtr3), [V14.B16] \ // 16 bytes of input 3
	VLD1.P 16(srcPtr4), [V15.B16] \ // 16 bytes of input 4
	TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4) \
	VREV32 V12.B16, V12.B16 \ // reverse the bytes inside each 32-bit word
	VREV32 V13.B16, V13.B16 \
	VREV32 V14.B16, V14.B16 \
	VREV32 V15.B16, V15.B16 \
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr)
// LOAD_T broadcasts the 32-bit constant `const` into all four lanes of T.
// Clobbers R20.
// The last body line carries no trailing backslash, so the definition ends
// here regardless of what follows it in the file.
#define LOAD_T(const, T) \
	MOVW $const, R20 \
	VDUP R20, T.S4
// ROUND_00_11 performs one compression round on four lanes using the XOR
// forms of the boolean functions: (a XOR b XOR c) and (e XOR f XOR g).
//
//   index    round number; selects Wt (slot index) and Wt+4 (slot index+4)
//            from the schedule buffer at wordStart
//   const    round constant, broadcast to all lanes
//   a..h     state registers in their current rotation
//
// The state is updated in place rather than shifted through the registers:
//   b = b <<< 9, f = f <<< 19, h = TT1, d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
// so the caller rotates the a..h argument list by one position for the
// following round.
//
// Clobbers V10..V14, tmp1 and R20.
// The last body line carries no trailing backslash, so the definition ends
// here regardless of what follows it in the file.
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, V12, 12) \ // V12 = a <<< 12
	VMOV V12.B16, V13.B16 \ // V13 = a <<< 12 (kept for SS2)
	LOAD_T(const, tmp1) \
	VADD tmp1.S4, V12.S4, V12.S4 \
	VADD e.S4, V12.S4, V12.S4 \ // V12 = (a <<< 12) + const + e
	PROLD(V12, V14, 7) \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2 = SS1 XOR (a <<< 12)
	VEOR a.B16, b.B16, V13.B16 \
	VEOR c.B16, V13.B16, V13.B16 \
	VADD V13.S4, d.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d
	loadWordByIndex(V10, index) \ // V10 = Wt
	loadWordByIndex(V11, index+4) \ // V11 = Wt+4
	VEOR V10.B16, V11.B16, V11.B16 \ // V11 = Wt XOR Wt+4
	VADD V11.S4, V13.S4, V13.S4 \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4 \ // V13 = TT1
	VADD h.S4, V10.S4, V10.S4 \ // V10 = Wt + h
	VADD V14.S4, V10.S4, V10.S4 \ // V10 = Wt + h + SS1
	VEOR e.B16, f.B16, V11.B16 \
	VEOR g.B16, V11.B16, V11.B16 \ // V11 = e XOR f XOR g
	VADD V11.S4, V10.S4, V10.S4 \ // V10 = TT2 = (e XOR f XOR g) + Wt + h + SS1
	VMOV b.B16, V11.B16 \ // copy first: PROLD needs distinct source and destination
	PROLD(V11, b, 9) \ // b = b <<< 9
	VMOV V13.B16, h.B16 \ // h = TT1
	VMOV f.B16, V11.B16 \
	PROLD(V11, f, 19) \ // f = f <<< 19
	PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16 // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
// MESSAGE_SCHEDULE computes schedule word W[j], j = index+4, for all four
// lanes from earlier words in the buffer at wordStart:
//
//   X    = W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)
//   P1   = X XOR (X <<< 15) XOR (X <<< 23)
//   W[j] = P1 XOR (W[j-13] <<< 7) XOR W[j-6]
//
// The result is stored at wordPtr (post-incremented by 16) and is also left
// in V11. In blockMultBy4, wordPtr points at slot index+4 when this runs.
// Clobbers V10, V11, V12 and R20.
// The last body line carries no trailing backslash, so the definition ends
// here regardless of what follows it in the file.
#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(V10, index+1) \ // Wj-3
	PROLD(V10, V11, 15) \ // V11 = Wj-3 <<< 15
	loadWordByIndex(V10, index-12) \ // Wj-16
	VEOR V10.B16, V11.B16, V10.B16 \
	loadWordByIndex(V11, index-5) \ // Wj-9
	VEOR V10.B16, V11.B16, V10.B16 \ // V10 = X
	PROLD(V10, V11, 15) \ // V11 = X <<< 15
	PROLD(V11, V12, 8) \ // V12 = X <<< 23
	VEOR V11.B16, V10.B16, V10.B16 \
	VEOR V12.B16, V10.B16, V10.B16 \ // V10 = P1
	loadWordByIndex(V11, index-9) \ // Wj-13
	PROLD(V11, V12, 7) \ // V12 = Wj-13 <<< 7
	VEOR V12.B16, V10.B16, V10.B16 \
	loadWordByIndex(V11, index-2) \ // Wj-6
	VEOR V11.B16, V10.B16, V11.B16 \ // V11 = Wj
	VST1.P [V11.S4], 16(wordPtr)
// ROUND_12_15 first extends the message schedule by one word (slot index+4)
// and then runs the same round body as ROUND_00_11, which reads that word
// back from the schedule buffer.
// The last body line carries no trailing backslash, so the definition ends
// here regardless of what follows it in the file.
#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index) \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)
// ROUND_16_63 extends the message schedule by one word (slot index+4) and
// performs one compression round on four lanes using the AND/OR forms of the
// boolean functions:
//   FF = (a AND b) OR ((a OR b) AND c)
//   GG = ((f XOR g) AND e) XOR g
//
//   index    round number; Wt is read from slot index of the schedule buffer,
//            Wt+4 is taken from V11 as left there by MESSAGE_SCHEDULE
//   const    round constant, broadcast to all lanes
//   a..h     state registers in their current rotation
//
// The state is updated in place: b = b <<< 9, f = f <<< 19, h = TT1,
// d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17); the caller rotates the a..h
// argument list by one position for the following round.
// Clobbers V10..V14, tmp1 and R20.
#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index) \ // V11 = Wt+4 from here on; it must stay intact until it is consumed below
PROLD(a, V12, 12) \ // V12 = a <<< 12
VMOV V12.B16, V13.B16 \ // V13 = a <<< 12 (kept for SS2)
LOAD_T(const, tmp1) \
VADD tmp1.S4, V12.S4, V12.S4 \
VADD e.S4, V12.S4, V12.S4 \ // V12 = (a <<< 12) + const + e
PROLD(V12, V14, 7) \ // V14 = SS1
VEOR V14.B16, V13.B16, V12.B16 \ // V12 = SS2 = SS1 XOR (a <<< 12)
VORR a.B16, b.B16, V10.B16 \ // V10 = a OR b
VAND a.B16, b.B16, V13.B16 \ // V13 = a AND b
VAND c.B16, V10.B16, V10.B16 \ // V10 = (a OR b) AND c
VORR V13.B16, V10.B16, V13.B16 \ // V13 = FF = (a AND b) OR (a AND c) OR (b AND c)
VADD V13.S4, d.S4, V13.S4 \ // V13 = FF + d
loadWordByIndex(V10, index) \ // V10 = Wt
VEOR V10.B16, V11.B16, V11.B16 \ // V11 = Wt XOR Wt+4
VADD V13.S4, V11.S4, V13.S4 \ // V13 = FF + d + (Wt XOR Wt+4)
VADD V13.S4, V12.S4, V13.S4 \ // V13 = TT1
VADD h.S4, V10.S4, V10.S4 \ // V10 = Wt + h
VADD V14.S4, V10.S4, V10.S4 \ // V10 = Wt + h + SS1
VEOR f.B16, g.B16, V11.B16 \ // V11 = f XOR g
VAND V11.B16, e.B16, V11.B16 \ // V11 = (f XOR g) AND e
VEOR g.B16, V11.B16, V11.B16 \ // V11 = GG = ((f XOR g) AND e) XOR g
VADD V10.S4, V11.S4, V10.S4 \ // V10 = TT2 = GG + Wt + h + SS1
VMOV b.B16, V11.B16 \ // copy first: PROLD needs distinct source and destination
PROLD(V11, b, 9) \ // b = b <<< 9
VMOV V13.B16, h.B16 \ // h = TT1
VMOV f.B16, V11.B16 \
PROLD(V11, f, 19) \ // f = f <<< 19
PROLD(V10, V11, 9) \ // V11 = TT2 <<< 9
PROLD(V11, V12, 8) \ // V12 = TT2 <<< 17
VEOR V10.B16, V11.B16, V11.B16 \ // V11 = TT2 XOR (TT2 <<< 9)
VEOR V11.B16, V12.B16, d.B16 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Runs the compression function over `blocks` consecutive 64-byte blocks for
// four independent digests at once, one digest per 32-bit vector lane.
//
//   dig     points to four consecutive pointers, each to a 32-byte state
//           (8 x uint32); the states are read on entry and written on exit
//   p       points to four consecutive pointers to the input data; every
//           input is read 64 bytes per block. The pointers stored in p are
//           not updated, only the local copies advance.
//   buffer  scratch area for the message schedule; 68 vectors of 16 bytes
//           (1088 bytes) are written per block, starting at buffer each time
//   blocks  number of 64-byte blocks to process from every input
//
// If blocks is less than 1 the function returns without touching dig or
// buffer. (The loop below tests its counter only after the body, so without
// this check a count of 0 would process one block and then keep running with
// a wrapped-around counter.)
//
// Uses R0, R1, R3..R10, R20 and V0..V15, V24..V31.
TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R0
#define srcPtrPtr R1
#define blockCount R3
#define digSave R4
#define wordStart R5
#define srcPtr1 R6
#define srcPtr2 R7
#define srcPtr3 R8
#define srcPtr4 R9
#define wordPtr R10
	MOVD dig+0(FP), digPtr
	MOVD p+8(FP), srcPtrPtr
	MOVD buffer+16(FP), wordStart
	MOVD blocks+24(FP), blockCount

	// nothing to do for a non-positive block count
	CMP $1, blockCount
	BLT done

	// load state: each digest is 32 bytes, split across a register pair
	MOVD digPtr, digSave
	MOVD.P 8(digPtr), R20
	VLD1 (R20), [a.S4, e.S4]
	MOVD.P 8(digPtr), R20
	VLD1 (R20), [b.S4, f.S4]
	MOVD.P 8(digPtr), R20
	VLD1 (R20), [c.S4, g.S4]
	MOVD (digPtr), R20
	VLD1 (R20), [d.S4, h.S4]

	// transpose state: afterwards each register holds one state word of all
	// four digests
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// fetch the four input pointers
	MOVD.P 8(srcPtrPtr), srcPtr1
	MOVD.P 8(srcPtrPtr), srcPtr2
	MOVD.P 8(srcPtrPtr), srcPtr3
	MOVD (srcPtrPtr), srcPtr4

loop:
	// save state
	VMOV a.B16, aSave.B16
	VMOV b.B16, bSave.B16
	VMOV c.B16, cSave.B16
	VMOV d.B16, dSave.B16
	VMOV e.B16, eSave.B16
	VMOV f.B16, fSave.B16
	VMOV g.B16, gSave.B16
	VMOV h.B16, hSave.B16

	// reset wordPtr
	MOVD wordStart, wordPtr

	// load message block: 4 x 16 bytes per input -> schedule slots 0..15
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	// Rounds 0..11 use schedule words that are already in the buffer.
	// The register arguments rotate by one position per round.
	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	// From round 12 on, every round first appends schedule slot index+4.
	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	// Rounds 48..63 are invoked with T16..T31 again.
	// NOTE(review): presumably the round constant repeats with period 32 for
	// rounds >= 16 - confirm against the definitions in sm3_const_asm.s.
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)

	// fold the state saved at the top of the iteration back in
	VEOR a.B16, aSave.B16, a.B16
	VEOR b.B16, bSave.B16, b.B16
	VEOR c.B16, cSave.B16, c.B16
	VEOR d.B16, dSave.B16, d.B16
	VEOR e.B16, eSave.B16, e.B16
	VEOR f.B16, fSave.B16, f.B16
	VEOR g.B16, gSave.B16, g.B16
	VEOR h.B16, hSave.B16, h.B16

	SUB $1, blockCount
	CBNZ blockCount, loop

	// transpose state back to one digest per register pair
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// store the four digests through the pointers read from dig
	MOVD.P 8(digSave), R20
	VST1 [a.S4, e.S4], (R20)
	MOVD.P 8(digSave), R20
	VST1 [b.S4, f.S4], (R20)
	MOVD.P 8(digSave), R20
	VST1 [c.S4, g.S4], (R20)
	MOVD (digSave), R20
	VST1 [d.S4, h.S4], (R20)

done:
	RET
// digPtr is defined again by copyResultsBy4, so drop the earlier definition.
#undef digPtr

// copyResultsBy4 uses a different register assignment from blockMultBy4:
// a..h map to V0..V7 in order, so a..d and e..h each form a run of four
// consecutive registers for its four-register VLD1/VST1 lists.
#undef a
#undef b
#undef c
#undef d
#undef e
#undef f
#undef g
#undef h
#define a V0
#define b V1
#define c V2
#define d V3
#define e V4
#define f V5
#define g V6
#define h V7
// func copyResultsBy4(dig *uint32, dst *byte)
//
// Copies 128 bytes (32 words of 32 bits) from dig to dst, reversing the byte
// order inside every 32-bit word on the way. All 128 bytes are read before
// the first byte is written.
// Uses R0, R1 and V0..V7.
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
#define digPtr R0
#define dstPtr R1
	MOVD dig+0(FP), digPtr
	MOVD dst+8(FP), dstPtr

	// first 64 bytes: load and byte-swap
	VLD1.P 64(digPtr), [a.S4, b.S4, c.S4, d.S4]
	VREV32 a.B16, a.B16
	VREV32 b.B16, b.B16
	VREV32 c.B16, c.B16
	VREV32 d.B16, d.B16

	// second 64 bytes: load and byte-swap
	VLD1 (digPtr), [e.S4, f.S4, g.S4, h.S4]
	VREV32 e.B16, e.B16
	VREV32 f.B16, f.B16
	VREV32 g.B16, g.B16
	VREV32 h.B16, h.B16

	// write out both halves
	VST1.P [a.B16, b.B16, c.B16, d.B16], 64(dstPtr)
	VST1 [e.B16, f.B16, g.B16, h.B16], (dstPtr)
	RET