sm3: kdf arm64 optimization

This commit is contained in:
Sun Yimin 2024-09-04 16:34:43 +08:00 committed by GitHub
parent bda03f4f43
commit 8e050f1064
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 48 additions and 48 deletions

View File

@ -8,12 +8,12 @@
#include "sm3_const_asm.s" #include "sm3_const_asm.s"
#define a V0 #define a V0
#define b V1 #define e V1
#define c V2 #define b V2
#define d V3 #define f V3
#define e V4 #define c V4
#define f V5 #define g V5
#define g V6 #define d V6
#define h V7 #define h V7
#define tmp1 V8 #define tmp1 V8
@ -21,6 +21,15 @@
#define tmp3 V10 #define tmp3 V10
#define tmp4 V11 #define tmp4 V11
#define aSave V24
#define bSave V25
#define cSave V26
#define dSave V27
#define eSave V28
#define fSave V29
#define gSave V30
#define hSave V31
// input: from high to low // input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0 // t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0 // t1 = t1.S3, t1.S2, t1.S1, t1.S0
@ -157,7 +166,6 @@
TEXT ·blockMultBy4(SB), NOSPLIT, $0 TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R0 #define digPtr R0
#define srcPtrPtr R1 #define srcPtrPtr R1
#define statePtr R2
#define blockCount R3 #define blockCount R3
#define digSave R4 #define digSave R4
#define wordStart R5 #define wordStart R5
@ -168,39 +176,40 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define wordPtr R10 #define wordPtr R10
MOVD dig+0(FP), digPtr MOVD dig+0(FP), digPtr
MOVD p+8(FP), srcPtrPtr MOVD p+8(FP), srcPtrPtr
MOVD buffer+16(FP), statePtr MOVD buffer+16(FP), wordStart
MOVD blocks+24(FP), blockCount MOVD blocks+24(FP), blockCount
// load state // load state
MOVD digPtr, digSave MOVD digPtr, digSave
MOVD.P 8(digPtr), R20 MOVD.P 8(digPtr), R20
VLD1.P 16(R20), [a.S4] VLD1 (R20), [a.S4, e.S4]
VLD1 (R20), [e.S4]
MOVD.P 8(digPtr), R20 MOVD.P 8(digPtr), R20
VLD1.P 16(R20), [b.S4] VLD1 (R20), [b.S4, f.S4]
VLD1 (R20), [f.S4]
MOVD.P 8(digPtr), R20 MOVD.P 8(digPtr), R20
VLD1.P 16(R20), [c.S4] VLD1 (R20), [c.S4, g.S4]
VLD1 (R20), [g.S4]
MOVD (digPtr), R20 MOVD (digPtr), R20
VLD1.P 16(R20), [d.S4] VLD1 (R20), [d.S4, h.S4]
VLD1 (R20), [h.S4]
// transpose state // transpose state
TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4) TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4) TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)
// store state to temporary buffer
MOVD statePtr, wordStart
VST1.P [a.S4, b.S4, c.S4, d.S4], 64(wordStart)
VST1.P [e.S4, f.S4, g.S4, h.S4], 64(wordStart)
MOVD.P 8(srcPtrPtr), srcPtr1 MOVD.P 8(srcPtrPtr), srcPtr1
MOVD.P 8(srcPtrPtr), srcPtr2 MOVD.P 8(srcPtrPtr), srcPtr2
MOVD.P 8(srcPtrPtr), srcPtr3 MOVD.P 8(srcPtrPtr), srcPtr3
MOVD (srcPtrPtr), srcPtr4 MOVD (srcPtrPtr), srcPtr4
loop: loop:
// save state
VMOV a.B16, aSave.B16
VMOV b.B16, bSave.B16
VMOV c.B16, cSave.B16
VMOV d.B16, dSave.B16
VMOV e.B16, eSave.B16
VMOV f.B16, fSave.B16
VMOV g.B16, gSave.B16
VMOV h.B16, hSave.B16
// reset wordPtr // reset wordPtr
MOVD wordStart, wordPtr MOVD wordStart, wordPtr
@ -277,20 +286,14 @@ loop:
ROUND_16_63(62, T30, c, d, e, f, g, h, a, b) ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
ROUND_16_63(63, T31, b, c, d, e, f, g, h, a) ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
MOVD statePtr, R20 VEOR a.B16, aSave.B16, a.B16
VLD1.P 64(R20), [V8.S4, V9.S4, V10.S4, V11.S4] VEOR b.B16, bSave.B16, b.B16
VLD1 (R20), [V12.S4, V13.S4, V14.S4, V15.S4] VEOR c.B16, cSave.B16, c.B16
VEOR a.B16, V8.B16, a.B16 VEOR d.B16, dSave.B16, d.B16
VEOR b.B16, V9.B16, b.B16 VEOR e.B16, eSave.B16, e.B16
VEOR c.B16, V10.B16, c.B16 VEOR f.B16, fSave.B16, f.B16
VEOR d.B16, V11.B16, d.B16 VEOR g.B16, gSave.B16, g.B16
VEOR e.B16, V12.B16, e.B16 VEOR h.B16, hSave.B16, h.B16
VEOR f.B16, V13.B16, f.B16
VEOR g.B16, V14.B16, g.B16
VEOR h.B16, V15.B16, h.B16
MOVD statePtr, R20
VST1.P [a.S4, b.S4, c.S4, d.S4], 64(R20)
VST1 [e.S4, f.S4, g.S4, h.S4], (R20)
SUB $1, blockCount SUB $1, blockCount
CBNZ blockCount, loop CBNZ blockCount, loop
@ -300,17 +303,13 @@ loop:
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4) TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)
MOVD.P 8(digSave), R20 MOVD.P 8(digSave), R20
VST1.P [a.S4], 16(R20) VST1 [a.S4, e.S4], (R20)
VST1 [e.S4], (R20)
MOVD.P 8(digSave), R20 MOVD.P 8(digSave), R20
VST1.P [b.S4], 16(R20) VST1 [b.S4, f.S4], (R20)
VST1 [f.S4], (R20)
MOVD.P 8(digSave), R20 MOVD.P 8(digSave), R20
VST1.P [c.S4], 16(R20) VST1 [c.S4, g.S4], (R20)
VST1 [g.S4], (R20)
MOVD (digSave), R20 MOVD (digSave), R20
VST1.P [d.S4], 16(R20) VST1 [d.S4, h.S4], (R20)
VST1 [h.S4], (R20)
RET RET

View File

@ -74,7 +74,7 @@ GLOBL mask<>(SB), 8, $64
#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \ #define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
PROLD(a, TMP0, 12) \ PROLD(a, TMP0, 12) \
VLR TMP0, TMP1 \ VLR TMP0, TMP1 \
VLREPF (index*4)(R3), TMP2 \ VLREPF (index*4)(kPtr), TMP2 \ // It seems that the VREPIF instruction is not supported yet.
VAF TMP2, TMP0, TMP0 \ VAF TMP2, TMP0, TMP0 \
VAF e, TMP0, TMP0 \ VAF e, TMP0, TMP0 \
PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
@ -129,7 +129,7 @@ GLOBL mask<>(SB), 8, $64
MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, please do not use it MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, please do not use it
PROLD(a, TMP0, 12) \ PROLD(a, TMP0, 12) \
VLR TMP0, TMP4 \ VLR TMP0, TMP4 \
VLREPF (index*4)(R3), TMP2 \ VLREPF (index*4)(kPtr), TMP2 \ // It seems that the VREPIF instruction is not supported yet.
VAF TMP2, TMP0, TMP0 \ VAF TMP2, TMP0, TMP0 \
VAF e, TMP0, TMP0 \ VAF e, TMP0, TMP0 \
PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
@ -166,7 +166,6 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0
MOVD dig+0(FP), digPtr MOVD dig+0(FP), digPtr
MOVD dst+8(FP), dstPtr MOVD dst+8(FP), dstPtr
// load state
VLM (digPtr), V0, V7 VLM (digPtr), V0, V7
VSTM V0, V7, (dstPtr) VSTM V0, V7, (dstPtr)
@ -174,11 +173,13 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0
#undef digPtr #undef digPtr
#undef dstPtr #undef dstPtr
// Uses general-purpose registers R1-R11.
// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int) // blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB), NOSPLIT, $0 TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R11 #define digPtr R11
#define srcPtrPtr R1 #define srcPtrPtr R1
#define statePtr R2 #define statePtr R2
#define kPtr R3
#define blockCount R5 #define blockCount R5
#define srcPtr1 R6 #define srcPtr1 R6
#define srcPtr2 R7 #define srcPtr2 R7
@ -212,7 +213,7 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD 24(srcPtrPtr), srcPtr4 MOVD 24(srcPtrPtr), srcPtr4
MOVD $0, srcPtrPtr MOVD $0, srcPtrPtr
MOVD $·_K+0(SB), R3 MOVD $·_K+0(SB), kPtr
loop: loop:
// save state // save state