// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
#include "go_asm.h"
#include "sm3_const_asm.s"

// VPERM selector table used by TRANSPOSE_MATRIX (4 x 16 bytes).
//   mask+0x00: interleave 32-bit words 0 and 1 of the two VPERM sources
//   mask+0x10: interleave 32-bit words 2 and 3 of the two VPERM sources
//   mask+0x20: concatenate the high doublewords of the two VPERM sources
//   mask+0x30: concatenate the low doublewords of the two VPERM sources
DATA mask<>+0x00(SB)/8, $0x0001020310111213
DATA mask<>+0x08(SB)/8, $0x0405060714151617
DATA mask<>+0x10(SB)/8, $0x08090a0b18191a1b
DATA mask<>+0x18(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA mask<>+0x20(SB)/8, $0x0001020304050607
DATA mask<>+0x28(SB)/8, $0x1011121314151617
DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f
DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f
GLOBL mask<>(SB), 8, $64

// Working state. Each vector register holds one SM3 state word for four
// independent message streams (one 32-bit lane per stream).
// The pairs (a,e), (b,f), (c,g), (d,h) are adjacent registers so that a
// single two-register VLM/VSTM moves the 32-byte state of one stream.
#define a V0
#define e V1
#define b V2
#define f V3
#define c V4
#define g V5
#define d V6
#define h V7

// Permutation selectors loaded from mask<>.
#define M0 V8
#define M1 V9
#define M2 V10
#define M3 V11

// Scratch registers. TMP4 aliases V16, which prepare4Words also uses.
#define TMP0 V12
#define TMP1 V13
#define TMP2 V14
#define TMP3 V15
#define TMP4 V16

// Copy of the state taken at the top of each block iteration.
#define aSave V24
#define bSave V25
#define cSave V26
#define dSave V27
#define eSave V28
#define fSave V29
#define gSave V30
#define hSave V31

// Transpose the 4x4 matrix of 32-bit words held in T0..T3 (in place).
// M0..M3 must hold the four selectors from mask<>; TMP0..TMP3 are clobbered.
#define TRANSPOSE_MATRIX(T0, T1, T2, T3, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) \
	VPERM T0, T1, M0, TMP0; \
	VPERM T2, T3, M0, TMP1; \
	VPERM T0, T1, M1, TMP2; \
	VPERM T2, T3, M1, TMP3; \
	VPERM TMP0, TMP1, M2, T0; \
	VPERM TMP0, TMP1, M3, T1; \
	VPERM TMP2, TMP3, M2, T2; \
	VPERM TMP2, TMP3, M3, T3

// r = s <<< n
#define PROLD(s, r, n) \
	VERLLF $n, s, r

// Load expanded message word i (16 bytes: one 32-bit word per stream)
// from the schedule buffer at statePtr.
// The argument is parenthesized because callers pass expressions such as
// index+4 or index-12; macro arguments are substituted as raw tokens, so
// without the parentheses 16*index+4 would be evaluated instead of
// 16*(index+4).
#define loadWordByIndex(W, i) \
	VL (16*(i))(statePtr), W

// Load 16 bytes from each of the four source streams at byte offset
// srcPtrPtr, transpose them into four schedule words, append those words
// to the schedule buffer at wordPtr, then advance the source offset by 16
// and wordPtr by 64.
// one word is 16 bytes
#define prepare4Words \
	VL (srcPtr1)(srcPtrPtr*1), V16; \
	VL (srcPtr2)(srcPtrPtr*1), V17; \
	VL (srcPtr3)(srcPtrPtr*1), V18; \
	VL (srcPtr4)(srcPtrPtr*1), V19; \
	TRANSPOSE_MATRIX(V16, V17, V18, V19, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3); \
	VSTM V16, V19, (wordPtr); \
	LAY 16(srcPtrPtr), srcPtrPtr; \
	ADD $64, wordPtr

// One SM3 round using the XOR forms of the boolean functions.
// index selects both the 32-bit constant at (index*4)(R3) and the message
// words W[index], W[index+4]; it must be a literal. The const argument is
// not referenced by the macro body.
// On exit: h = TT1, d = P0(TT2), b = b <<< 9, f = f <<< 19; callers rotate
// the register names between rounds. Clobbers TMP0..TMP4.
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, TMP0, 12)               \
	VLR TMP0, TMP1                   \
	VLREPF (index*4)(R3), TMP2       \
	VAF TMP2, TMP0, TMP0             \
	VAF e, TMP0, TMP0                \
	PROLD(TMP0, TMP2, 7)             \ // TMP2 = SS1
	VX TMP2, TMP1, TMP0              \ // TMP0 = SS2
	VX a, b, TMP1                    \
	VX c, TMP1, TMP1                 \
	VAF TMP1, d, TMP1                \ // TMP1 = (a XOR b XOR c) + d
	loadWordByIndex(TMP3, index)     \
	loadWordByIndex(TMP4, index+4)   \
	VX TMP3, TMP4, TMP4              \
	VAF TMP4, TMP1, TMP1             \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VAF TMP1, TMP0, TMP1             \ // TMP1 = TT1
	VAF h, TMP3, TMP3                \
	VAF TMP3, TMP2, TMP3             \ // Wt + h + SS1
	VX e, f, TMP4                    \
	VX g, TMP4, TMP4                 \
	VAF TMP4, TMP3, TMP3             \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VLR b, TMP4                      \
	PROLD(TMP4, b, 9)                \ // b = b <<< 9
	VLR TMP1, h                      \ // h = TT1
	VLR f, TMP4                      \
	PROLD(TMP4, f, 19)               \ // f = f <<< 19
	PROLD(TMP3, TMP4, 9)             \ // TMP4 = TT2 <<< 9
	PROLD(TMP4, TMP0, 8)             \ // TMP0 = TT2 <<< 17
	VX TMP3, TMP4, TMP4              \ // TMP4 = TT2 XOR (TT2 <<< 9)
	VX TMP4, TMP0, d                 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// Compute message word W[index+4] (written Wj below, j = index+4) from
// earlier schedule words, store it at wordPtr and advance wordPtr by 16.
// On exit TMP1 holds W[index+4]. Clobbers TMP0, TMP1, TMP2.
#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(TMP0, index+1)      \ // Wj-3
	PROLD(TMP0, TMP1, 15)               \
	loadWordByIndex(TMP0, index-12)     \ // Wj-16
	VX TMP0, TMP1, TMP0                 \
	loadWordByIndex(TMP1, index-5)      \ // Wj-9
	VX TMP0, TMP1, TMP0                 \
	PROLD(TMP0, TMP1, 15)               \
	PROLD(TMP1, TMP2, 8)                \
	VX TMP1, TMP0, TMP0                 \
	VX TMP2, TMP0, TMP0                 \ // P1
	loadWordByIndex(TMP1, index-9)      \ // Wj-13
	PROLD(TMP1, TMP2, 7)                \
	VX TMP2, TMP0, TMP0                 \
	loadWordByIndex(TMP1, index-2)      \ // Wj-6
	VX TMP1, TMP0, TMP1                 \
	VST TMP1, (wordPtr)                 \
	ADD $16, wordPtr                    \

// Rounds 12..15: extend the schedule by one word, then run the same round
// function as ROUND_00_11.
#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)                               \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)     \

// Rounds 16..63: extend the schedule by one word, then run one round using
// the majority / choose forms of the boolean functions.
// Same register contract as ROUND_00_11. Clobbers TMP0..TMP4.
#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)               \ // TMP1 now holds Wt+4; do not clobber it before it is consumed below
	PROLD(a, TMP0, 12)                    \
	VLR TMP0, TMP4                        \
	VLREPF (index*4)(R3), TMP2            \
	VAF TMP2, TMP0, TMP0                  \
	VAF e, TMP0, TMP0                     \
	PROLD(TMP0, TMP2, 7)                  \ // TMP2 = SS1
	VX TMP2, TMP4, TMP0                   \ // TMP0 = SS2
	VO a, b, TMP3                         \
	VN a, b, TMP4                         \
	VN c, TMP3, TMP3                      \
	VO TMP4, TMP3, TMP4                   \ // (a AND b) OR (a AND c) OR (b AND c)
	VAF TMP4, d, TMP4                     \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(TMP3, index)          \ // Wj
	VX TMP3, TMP1, TMP1                   \ // Wj XOR Wj+4
	VAF TMP4, TMP1, TMP4                  \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VAF TMP4, TMP0, TMP4                  \ // TT1
	VAF h, TMP3, TMP3                     \ // Wt + h
	VAF TMP2, TMP3, TMP3                  \ // Wt + h + SS1
	VX f, g, TMP1                         \
	VN TMP1, e, TMP1                      \
	VX g, TMP1, TMP1                      \ // (f XOR g) AND e XOR g
	VAF TMP3, TMP1, TMP3                  \ // TT2
	VLR b, TMP1                           \
	PROLD(TMP1, b, 9)                     \ // b = b <<< 9
	VLR TMP4, h                           \ // h = TT1
	VLR f, TMP1                           \
	PROLD(TMP1, f, 19)                    \ // f = f <<< 19
	PROLD(TMP3, TMP1, 9)                  \ // TMP1 = TT2 <<< 9
	PROLD(TMP1, TMP0, 8)                  \ // TMP0 = TT2 <<< 17
	VX TMP3, TMP1, TMP1                   \ // TMP1 = TT2 XOR (TT2 <<< 9)
	VX TMP1, TMP0, d                      \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// func copyResultsBy4(dig *uint32, dst *byte)
//
// Copies 128 bytes (eight 16-byte vectors) from dig to dst unchanged.
// Clobbers R3, R4 and V0..V7.
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
#define digPtr R3
#define dstPtr R4
	MOVD	dig+0(FP), digPtr
	MOVD	dst+8(FP), dstPtr

	// load state
	VLM	(digPtr), V0, V7
	VSTM	V0, V7, (dstPtr)

	RET
#undef digPtr
#undef dstPtr

// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Runs the SM3 compression function over four message streams in parallel.
//   dig    - four pointers, each to a 32-byte state; read on entry and
//            overwritten with the updated state on exit.
//   p      - four source pointers; 64 bytes are consumed from each per block.
//   buffer - scratch area for the expanded message schedule. Each block
//            writes 68 words of 16 bytes, so at least 1088 bytes are needed.
//   blocks - number of 64-byte blocks to process per stream. The loop test
//            is at the bottom, so one block is always processed; callers
//            must pass blocks >= 1.
// R3 holds the address of the ·_K constant table for the whole function.
TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R11
#define srcPtrPtr R1
#define statePtr R2
#define blockCount R5
#define srcPtr1 R6
#define srcPtr2 R7
#define srcPtr3 R8
#define srcPtr4 R9
#define wordPtr R10
	MOVD	dig+0(FP), digPtr
	MOVD	p+8(FP), srcPtrPtr
	MOVD	buffer+16(FP), statePtr
	MOVD	blocks+24(FP), blockCount

	// load state
	MOVD	0(digPtr), R4
	VLM	(R4), a, e
	MOVD	8(digPtr), R4
	VLM	(R4), b, f
	MOVD	16(digPtr), R4
	VLM	(R4), c, g
	MOVD	24(digPtr), R4
	VLM	(R4), d, h

	// load the permutation selectors
	MOVD	$mask<>+0x00(SB), R4
	VLM	(R4), M0, M3

	// per-stream layout -> per-word layout (one lane per stream)
	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)

	// Fetch the four source pointers; from here on srcPtrPtr is reused as
	// the running byte offset into every stream.
	MOVD	(srcPtrPtr), srcPtr1
	MOVD	8(srcPtrPtr), srcPtr2
	MOVD	16(srcPtrPtr), srcPtr3
	MOVD	24(srcPtrPtr), srcPtr4
	MOVD	$0, srcPtrPtr

	MOVD	$·_K+0(SB), R3

loop:
	// save state
	VLR	a, aSave
	VLR	b, bSave
	VLR	c, cSave
	VLR	d, dSave
	VLR	e, eSave
	VLR	f, fSave
	VLR	g, gSave
	VLR	h, hSave

	// reset wordPtr
	MOVD	statePtr, wordPtr

	// load message block
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)

	// feed-forward: XOR the round output into the state saved at loop entry
	VX	a, aSave, a
	VX	b, bSave, b
	VX	c, cSave, c
	VX	d, dSave, d
	VX	e, eSave, e
	VX	f, fSave, f
	VX	g, gSave, g
	VX	h, hSave, h

	SUB	$1, blockCount
	CMPBGT	blockCount, $0, loop

	// per-word layout -> per-stream layout
	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)

	// store the four updated states; nothing may write through R4 after the
	// last VSTM, otherwise the fourth digest would be overwritten
	MOVD	0(digPtr), R4
	VSTM	a, e, (R4)
	MOVD	8(digPtr), R4
	VSTM	b, f, (R4)
	MOVD	16(digPtr), R4
	VSTM	c, g, (R4)
	MOVD	24(digPtr), R4
	VSTM	d, h, (R4)

	RET