diff --git a/sm3/sm3blocks_arm64.s b/sm3/sm3blocks_arm64.s index 9647598..5014cf9 100644 --- a/sm3/sm3blocks_arm64.s +++ b/sm3/sm3blocks_arm64.s @@ -8,12 +8,12 @@ #include "sm3_const_asm.s" #define a V0 -#define b V1 -#define c V2 -#define d V3 -#define e V4 -#define f V5 -#define g V6 +#define e V1 +#define b V2 +#define f V3 +#define c V4 +#define g V5 +#define d V6 #define h V7 #define tmp1 V8 @@ -21,6 +21,15 @@ #define tmp3 V10 #define tmp4 V11 +#define aSave V24 +#define bSave V25 +#define cSave V26 +#define dSave V27 +#define eSave V28 +#define fSave V29 +#define gSave V30 +#define hSave V31 + // input: from high to low // t0 = t0.S3, t0.S2, t0.S1, t0.S0 // t1 = t1.S3, t1.S2, t1.S1, t1.S0 @@ -157,7 +166,6 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0 #define digPtr R0 #define srcPtrPtr R1 -#define statePtr R2 #define blockCount R3 #define digSave R4 #define wordStart R5 @@ -168,39 +176,40 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0 #define wordPtr R10 MOVD dig+0(FP), digPtr MOVD p+8(FP), srcPtrPtr - MOVD buffer+16(FP), statePtr + MOVD buffer+16(FP), wordStart MOVD blocks+24(FP), blockCount // load state MOVD digPtr, digSave MOVD.P 8(digPtr), R20 - VLD1.P 16(R20), [a.S4] - VLD1 (R20), [e.S4] + VLD1 (R20), [a.S4, e.S4] MOVD.P 8(digPtr), R20 - VLD1.P 16(R20), [b.S4] - VLD1 (R20), [f.S4] + VLD1 (R20), [b.S4, f.S4] MOVD.P 8(digPtr), R20 - VLD1.P 16(R20), [c.S4] - VLD1 (R20), [g.S4] + VLD1 (R20), [c.S4, g.S4] MOVD (digPtr), R20 - VLD1.P 16(R20), [d.S4] - VLD1 (R20), [h.S4] + VLD1 (R20), [d.S4, h.S4] // transpose state TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4) TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4) - // store state to temporary buffer - MOVD statePtr, wordStart - VST1.P [a.S4, b.S4, c.S4, d.S4], 64(wordStart) - VST1.P [e.S4, f.S4, g.S4, h.S4], 64(wordStart) - MOVD.P 8(srcPtrPtr), srcPtr1 MOVD.P 8(srcPtrPtr), srcPtr2 MOVD.P 8(srcPtrPtr), srcPtr3 MOVD (srcPtrPtr), srcPtr4 loop: + // save state + VMOV a.B16, aSave.B16 + VMOV b.B16, bSave.B16 + VMOV c.B16, cSave.B16 + VMOV d.B16, dSave.B16 + VMOV e.B16, eSave.B16 + VMOV f.B16, fSave.B16 + VMOV g.B16, gSave.B16 + VMOV h.B16, hSave.B16 + // reset wordPtr MOVD wordStart, wordPtr @@ -277,20 +286,14 @@ loop: ROUND_16_63(62, T30, c, d, e, f, g, h, a, b) ROUND_16_63(63, T31, b, c, d, e, f, g, h, a) - MOVD statePtr, R20 - VLD1.P 64(R20), [V8.S4, V9.S4, V10.S4, V11.S4] - VLD1 (R20), [V12.S4, V13.S4, V14.S4, V15.S4] - VEOR a.B16, V8.B16, a.B16 - VEOR b.B16, V9.B16, b.B16 - VEOR c.B16, V10.B16, c.B16 - VEOR d.B16, V11.B16, d.B16 - VEOR e.B16, V12.B16, e.B16 - VEOR f.B16, V13.B16, f.B16 - VEOR g.B16, V14.B16, g.B16 - VEOR h.B16, V15.B16, h.B16 - MOVD statePtr, R20 - VST1.P [a.S4, b.S4, c.S4, d.S4], 64(R20) - VST1 [e.S4, f.S4, g.S4, h.S4], (R20) + VEOR a.B16, aSave.B16, a.B16 + VEOR b.B16, bSave.B16, b.B16 + VEOR c.B16, cSave.B16, c.B16 + VEOR d.B16, dSave.B16, d.B16 + VEOR e.B16, eSave.B16, e.B16 + VEOR f.B16, fSave.B16, f.B16 + VEOR g.B16, gSave.B16, g.B16 + VEOR h.B16, hSave.B16, h.B16 SUB $1, blockCount CBNZ blockCount, loop @@ -300,17 +303,13 @@ loop: TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4) MOVD.P 8(digSave), R20 - VST1.P [a.S4], 16(R20) - VST1 [e.S4], (R20) + VST1 [a.S4, e.S4], (R20) MOVD.P 8(digSave), R20 - VST1.P [b.S4], 16(R20) - VST1 [f.S4], (R20) + VST1 [b.S4, f.S4], (R20) MOVD.P 8(digSave), R20 - VST1.P [c.S4], 16(R20) - VST1 [g.S4], (R20) + VST1 [c.S4, g.S4], (R20) MOVD (digSave), R20 - VST1.P [d.S4], 16(R20) - VST1 [h.S4], (R20) + VST1 [d.S4, h.S4], (R20) RET diff --git a/sm3/sm3blocks_s390x.s b/sm3/sm3blocks_s390x.s index 26bbbe1..b3384d5 100644 --- a/sm3/sm3blocks_s390x.s +++ b/sm3/sm3blocks_s390x.s @@ -74,7 +74,7 @@ GLOBL mask<>(SB), 8, $64 #define ROUND_00_11(index, a, b, c, d, e, f, g, h) \ PROLD(a, TMP0, 12) \ VLR TMP0, TMP1 \ - VLREPF (index*4)(R3), TMP2 \ + VLREPF (index*4)(kPtr), TMP2 \ // It seems that the VREPIF instruction is not supported yet. VAF TMP2, TMP0, TMP0 \ VAF e, TMP0, TMP0 \ PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 @@ -122,14 +122,14 @@ GLOBL mask<>(SB), 8, $64 ADD $16, wordPtr \ #define ROUND_12_15(index, a, b, c, d, e, f, g, h) \ - MESSAGE_SCHEDULE(index) \ + MESSAGE_SCHEDULE(index) \ ROUND_00_11(index, a, b, c, d, e, f, g, h) \ #define ROUND_16_63(index, a, b, c, d, e, f, g, h) \ MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, Pls do not use it PROLD(a, TMP0, 12) \ VLR TMP0, TMP4 \ - VLREPF (index*4)(R3), TMP2 \ + VLREPF (index*4)(kPtr), TMP2 \ // It seems that the VREPIF instruction is not supported yet. VAF TMP2, TMP0, TMP0 \ VAF e, TMP0, TMP0 \ PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1 @@ -166,7 +166,6 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0 MOVD dig+0(FP), digPtr MOVD dst+8(FP), dstPtr - // load state VLM (digPtr), V0, V7 VSTM V0, V7, (dstPtr) @@ -174,11 +173,13 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0 #undef digPtr #undef dstPtr +// Used general purpose registers R1-R11. // blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int) TEXT ·blockMultBy4(SB), NOSPLIT, $0 #define digPtr R11 #define srcPtrPtr R1 #define statePtr R2 +#define kPtr R3 #define blockCount R5 #define srcPtr1 R6 #define srcPtr2 R7 @@ -212,7 +213,7 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0 MOVD 24(srcPtrPtr), srcPtr4 MOVD $0, srcPtrPtr - MOVD $·_K+0(SB), R3 + MOVD $·_K+0(SB), kPtr loop: // save state