mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-25 03:36:18 +08:00
312 lines
11 KiB
ArmAsm
312 lines
11 KiB
ArmAsm
//go:build !purego
|
|
|
|
#include "textflag.h"
|
|
#include "sm3_const_asm.s"
|
|
|
|
// Working state registers. After the state is transposed in blockMultBy4,
// each register holds one state word for four message streams, one stream
// per 32-bit lane.
#define a V0
#define b V1
#define c V2
#define d V3
#define e V4
#define f V5
#define g V6
#define h V7

// Scratch vector registers. Note that the round macros also refer to
// V10/V11 (tmp3/tmp4) directly by register name.
#define tmp1 V8
#define tmp2 V9
#define tmp3 V10
#define tmp4 V11
|
|
|
|
// TRANSPOSE_MATRIX transposes, in place, the 4x4 matrix of 32-bit words
// held in t0..t3 (one row per register).
// RTMP0..RTMP3 are scratch registers and are clobbered; they must be
// distinct from t0..t3.
//
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
	VZIP1 t1.S4, t0.S4, RTMP0.S4               \
	VZIP1 t3.S4, t2.S4, RTMP1.S4               \
	VZIP2 t1.S4, t0.S4, RTMP2.S4               \
	VZIP2 t3.S4, t2.S4, RTMP3.S4               \
	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2            \
	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2            \
	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2            \
	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2
|
|
|
|
// PROLD rotates every 32-bit lane of s left by n bits: r = s <<< n.
// VSHL writes s << n into r, then VSRI inserts s >> (32-n) into the low
// n bits of r.
// s and r must be different registers: r is written by the first
// instruction while s is still needed by the second.
#define PROLD(s, r, n) \
	VSHL $(n), s.S4, r.S4 \
	VSRI $(32-n), s.S4, r.S4
|
|
|
|
// loadWordByIndex loads the 16-byte vector stored at wordStart + 16*i
// into W. wordStart must be defined by the enclosing function.
// Clobbers R20.
#define loadWordByIndex(W, i) \
	ADD $(16*(i)), wordStart, R20 \
	VLD1 (R20), [W.S4]
|
|
|
|
// prepare4Words reads the next 16 bytes from each of the four source
// pointers (srcPtr1..srcPtr4, each post-incremented by 16), transposes
// them so that every vector holds the same word position of all four
// sources, reverses the bytes inside each 32-bit word, and stores the
// resulting four vectors (64 bytes) at wordPtr, post-incrementing wordPtr
// by 64.
// srcPtr1..srcPtr4 and wordPtr must be defined by the enclosing function.
// Clobbers V12-V15 and tmp1-tmp4.
#define prepare4Words \
	VLD1.P 16(srcPtr1), [V12.B16] \
	VLD1.P 16(srcPtr2), [V13.B16] \
	VLD1.P 16(srcPtr3), [V14.B16] \
	VLD1.P 16(srcPtr4), [V15.B16] \
	TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4); \
	VREV32 V12.B16, V12.B16; \
	VREV32 V13.B16, V13.B16; \
	VREV32 V14.B16, V14.B16; \
	VREV32 V15.B16, V15.B16; \
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr)
|
|
|
|
// LOAD_T broadcasts the 32-bit immediate `const` into all four lanes of T.
// Clobbers R20.
#define LOAD_T(const, T) \
	MOVW $const, R20 \
	VDUP R20, T.S4
|
|
|
|
// ROUND_00_11 performs one compression round on four lanes at once, using
// the XOR forms of the boolean functions (a XOR b XOR c, e XOR f XOR g).
//
//   index - position of the message word: W[index] and W[index+4] are
//           loaded from memory relative to wordStart
//   const - round constant, broadcast to every lane
//   a..h  - the eight state registers in their current roles
//
// Results are written in place:
//   h = TT1, d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17),
//   b = b <<< 9, f = f <<< 19; a, c, e and g are left unchanged.
// The caller rotates the argument list by one position for the next round.
// Clobbers tmp1 (V8), V10-V14 and R20.
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, V12, 12)                         \
	VMOV V12.B16, V13.B16                     \
	LOAD_T(const, tmp1)                       \
	VADD tmp1.S4, V12.S4, V12.S4              \
	VADD e.S4, V12.S4, V12.S4                 \
	PROLD(V12, V14, 7)                        \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16            \ // V12 = SS2
	VEOR a.B16, b.B16, V13.B16                \
	VEOR c.B16, V13.B16, V13.B16              \
	VADD V13.S4, d.S4, V13.S4                 \ // V13 = (a XOR b XOR c) + d
	loadWordByIndex(V10, index)               \
	loadWordByIndex(V11, index+4)             \
	VEOR V10.B16, V11.B16, V11.B16            \
	VADD V11.S4, V13.S4, V13.S4               \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4               \ // TT1
	VADD h.S4, V10.S4, V10.S4                 \
	VADD V14.S4, V10.S4, V10.S4               \ // Wt + h + SS1
	VEOR e.B16, f.B16, V11.B16                \
	VEOR g.B16, V11.B16, V11.B16              \
	VADD V11.S4, V10.S4, V10.S4               \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VMOV b.B16, V11.B16                       \
	PROLD(V11, b, 9)                          \ // b = b <<< 9
	VMOV V13.B16, h.B16                       \ // h = TT1
	VMOV f.B16, V11.B16                       \
	PROLD(V11, f, 19)                         \ // f = f <<< 19
	PROLD(V10, V11, 9)                        \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8)                        \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16            \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16              // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
|
|
|
|
// MESSAGE_SCHEDULE computes the expanded message word Wj for j = index+4:
//
//   Wj = P1(Wj-16 XOR Wj-9 XOR (Wj-3 <<< 15)) XOR (Wj-13 <<< 7) XOR Wj-6
//   P1(x) = x XOR (x <<< 15) XOR (x <<< 23)
//
// The inputs are read relative to wordStart. The result is left in V11 and
// also stored at wordPtr, which is post-incremented by 16; the caller is
// expected to keep wordPtr pointing at the slot for W[index+4].
// Clobbers V10, V11, V12 and R20.
#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(V10, index+1)             \ // Wj-3
	PROLD(V10, V11, 15)                       \
	loadWordByIndex(V10, index-12)            \ // Wj-16
	VEOR V10.B16, V11.B16, V10.B16            \
	loadWordByIndex(V11, index-5)             \ // Wj-9
	VEOR V10.B16, V11.B16, V10.B16            \
	PROLD(V10, V11, 15)                       \
	PROLD(V11, V12, 8)                        \
	VEOR V11.B16, V10.B16, V10.B16            \
	VEOR V12.B16, V10.B16, V10.B16            \ // P1
	loadWordByIndex(V11, index-9)             \ // Wj-13
	PROLD(V11, V12, 7)                        \
	VEOR V12.B16, V10.B16, V10.B16            \
	loadWordByIndex(V11, index-2)             \ // Wj-6
	VEOR V11.B16, V10.B16, V11.B16            \
	VST1.P [V11.S4], 16(wordPtr)
|
|
|
|
// ROUND_12_15 first extends the message schedule by one word (W[index+4])
// and then runs the same round body as ROUND_00_11, which reloads
// W[index] and W[index+4] from memory.
// Clobbers everything MESSAGE_SCHEDULE and ROUND_00_11 clobber.
#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)                   \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)
|
|
|
|
// ROUND_16_63 extends the message schedule by one word and performs one
// compression round on four lanes at once, using the majority form
// (a AND b) OR (a AND c) OR (b AND c) and the choose form
// ((f XOR g) AND e) XOR g of the boolean functions.
//
//   index - position of the message word: W[index] is loaded relative to
//           wordStart, W[index+4] is taken from V11 as left by
//           MESSAGE_SCHEDULE
//   const - round constant, broadcast to every lane
//   a..h  - the eight state registers in their current roles
//
// Results are written in place:
//   h = TT1, d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17),
//   b = b <<< 9, f = f <<< 19; a, c, e and g are left unchanged.
// The caller rotates the argument list by one position for the next round.
// Clobbers tmp1 (V8), V10-V14 and R20.
#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)                   \ // V11 now holds Wt+4; it must stay intact until it is consumed below
	PROLD(a, V12, 12)                         \
	VMOV V12.B16, V13.B16                     \
	LOAD_T(const, tmp1)                       \
	VADD tmp1.S4, V12.S4, V12.S4              \
	VADD e.S4, V12.S4, V12.S4                 \
	PROLD(V12, V14, 7)                        \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16            \ // V12 = SS2
	VORR a.B16, b.B16, V10.B16                \
	VAND a.B16, b.B16, V13.B16                \
	VAND c.B16, V10.B16, V10.B16              \
	VORR V13.B16, V10.B16, V13.B16            \ // (a AND b) OR (a AND c) OR (b AND c)
	VADD V13.S4, d.S4, V13.S4                 \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(V10, index)               \ // Wj
	VEOR V10.B16, V11.B16, V11.B16            \ // Wj XOR Wj+4
	VADD V13.S4, V11.S4, V13.S4               \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4               \ // TT1
	VADD h.S4, V10.S4, V10.S4                 \ // Wt + h
	VADD V14.S4, V10.S4, V10.S4               \ // Wt + h + SS1
	VEOR f.B16, g.B16, V11.B16                \
	VAND V11.B16, e.B16, V11.B16              \
	VEOR g.B16, V11.B16, V11.B16              \ // (f XOR g) AND e XOR g
	VADD V10.S4, V11.S4, V10.S4               \ // TT2
	VMOV b.B16, V11.B16                       \
	PROLD(V11, b, 9)                          \ // b = b <<< 9
	VMOV V13.B16, h.B16                       \ // h = TT1
	VMOV f.B16, V11.B16                       \
	PROLD(V11, f, 19)                         \ // f = f <<< 19
	PROLD(V10, V11, 9)                        \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8)                        \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16            \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16              // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
|
|
|
|
// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Runs the compression function over `blocks` consecutive 64-byte blocks of
// four independent messages in parallel, one message per 32-bit vector lane.
//
//   dig    - array of four pointers, each to an 8-word state that is read
//            on entry and overwritten with the updated state on exit
//   p      - array of four pointers to the message data; the pointers are
//            read once and are not written back
//   buffer - scratch memory. The first 128 bytes hold the transposed state;
//            the message words follow. Per block the code stores 16 loaded
//            words plus 52 scheduled words (16 bytes each), so the code
//            touches 128 + 68*16 = 1216 bytes of buffer.
//   blocks - number of 64-byte blocks to process per message. If it is
//            <= 0 the function returns immediately without touching memory.
//
// Uses R0-R10 and R20 as general-purpose scratch and V0-V15 as vector
// scratch.
TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R0
#define srcPtrPtr R1
#define statePtr R2
#define blockCount R3
#define digSave R4
#define wordStart R5
#define srcPtr1 R6
#define srcPtr2 R7
#define srcPtr3 R8
#define srcPtr4 R9
#define wordPtr R10
	MOVD	dig+0(FP), digPtr
	MOVD	p+8(FP), srcPtrPtr
	MOVD	buffer+16(FP), statePtr
	MOVD	blocks+24(FP), blockCount

	// Nothing to do for a non-positive block count. Without this check the
	// counter below would wrap around and the loop would run far past the
	// end of the input and the scratch buffer.
	CMP	$0, blockCount
	BLE	done

	// load state: words 0-3 of digest i go to a/b/c/d, words 4-7 to e/f/g/h
	MOVD	digPtr, digSave
	MOVD.P	8(digPtr), R20
	VLD1.P	16(R20), [a.S4]
	VLD1	(R20), [e.S4]
	MOVD.P	8(digPtr), R20
	VLD1.P	16(R20), [b.S4]
	VLD1	(R20), [f.S4]
	MOVD.P	8(digPtr), R20
	VLD1.P	16(R20), [c.S4]
	VLD1	(R20), [g.S4]
	MOVD	(digPtr), R20
	VLD1.P	16(R20), [d.S4]
	VLD1	(R20), [h.S4]

	// transpose state so that each register holds one state word of all
	// four digests
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// store state to temporary buffer; the two post-incrementing stores
	// leave wordStart at buffer+128, the start of the message word area
	MOVD	statePtr, wordStart
	VST1.P	[a.S4, b.S4, c.S4, d.S4], 64(wordStart)
	VST1.P	[e.S4, f.S4, g.S4, h.S4], 64(wordStart)

	// fetch the four message pointers
	MOVD.P	8(srcPtrPtr), srcPtr1
	MOVD.P	8(srcPtrPtr), srcPtr2
	MOVD.P	8(srcPtrPtr), srcPtr3
	MOVD	(srcPtrPtr), srcPtr4

loop:
	// reset wordPtr
	MOVD	wordStart, wordPtr

	// load message block: 4 x 4 words per stream = W[0..15]
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	// rounds 48-63 are invoked with the constants T16..T31 again
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)

	// feed-forward: XOR the round output with the state saved in the
	// buffer, then save the new state back for the next block
	MOVD	statePtr, R20
	VLD1.P	64(R20), [V8.S4, V9.S4, V10.S4, V11.S4]
	VLD1	(R20), [V12.S4, V13.S4, V14.S4, V15.S4]
	VEOR	a.B16, V8.B16, a.B16
	VEOR	b.B16, V9.B16, b.B16
	VEOR	c.B16, V10.B16, c.B16
	VEOR	d.B16, V11.B16, d.B16
	VEOR	e.B16, V12.B16, e.B16
	VEOR	f.B16, V13.B16, f.B16
	VEOR	g.B16, V14.B16, g.B16
	VEOR	h.B16, V15.B16, h.B16
	MOVD	statePtr, R20
	VST1.P	[a.S4, b.S4, c.S4, d.S4], 64(R20)
	VST1	[e.S4, f.S4, g.S4, h.S4], (R20)

	SUB	$1, blockCount
	CBNZ	blockCount, loop

	// transpose state back to one digest per register pair
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// write the four digests back through the saved pointer array
	MOVD.P	8(digSave), R20
	VST1.P	[a.S4], 16(R20)
	VST1	[e.S4], (R20)
	MOVD.P	8(digSave), R20
	VST1.P	[b.S4], 16(R20)
	VST1	[f.S4], (R20)
	MOVD.P	8(digSave), R20
	VST1.P	[c.S4], 16(R20)
	VST1	[g.S4], (R20)
	MOVD	(digSave), R20
	VST1.P	[d.S4], 16(R20)
	VST1	[h.S4], (R20)

done:
	RET
|