mirror of https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00

sm3: ppc64x, kdf mult by 4

parent 6e8a3cc832
commit 077b115c29
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.

-//go:build purego || !(amd64 || arm64 || s390x)
+//go:build purego || !(amd64 || arm64 || s390x || ppc64 || ppc64le)

 package sm3

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.

-//go:build (amd64 || arm64 || s390x) && !purego
+//go:build (amd64 || arm64 || s390x || ppc64 || ppc64le) && !purego

 package sm3

sm3/kdf_ppc64x.go (new file, 15 lines)
@@ -0,0 +1,15 @@
+// Copyright 2024 Sun Yimin. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+package sm3
+
+func kdf(baseMD *digest, keyLen int, limit int) []byte {
+	if limit < 4 {
+		return kdfGeneric(baseMD, keyLen, limit)
+	}
+
+	return kdfBy4(baseMD, keyLen, limit)
+}
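Note on the dispatch above: the SM3-based KDF is counter-based, so each output block is an independent hash of Z || ct for an incrementing 32-bit counter, and four counter values can therefore be hashed in parallel once at least four blocks are requested. The sketch below only illustrates that pattern under stated assumptions; kdfSketch is a made-up name, crypto/sha256 stands in for SM3 so the example runs with the standard library alone, and this is not the library's internal kdfGeneric/kdfBy4 code.

package main

import (
	"crypto/sha256" // stand-in for SM3, purely for illustration
	"encoding/binary"
	"fmt"
	"hash"
)

// kdfSketch derives keyLen bytes by hashing z || counter for counter = 1, 2, ...
// Every counter block is independent, which is what a "by 4" implementation
// exploits by compressing four counter blocks at once.
func kdfSketch(newHash func() hash.Hash, z []byte, keyLen int) []byte {
	var out []byte
	var ct [4]byte
	for counter := uint32(1); len(out) < keyLen; counter++ {
		h := newHash()
		h.Write(z)
		binary.BigEndian.PutUint32(ct[:], counter)
		h.Write(ct[:])
		out = h.Sum(out)
	}
	return out[:keyLen]
}

func main() {
	key := kdfSketch(sha256.New, []byte("shared secret Z"), 48)
	fmt.Printf("%x\n", key)
}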
@@ -5,13 +5,32 @@
 //go:build (ppc64 || ppc64le) && !purego

 #include "textflag.h"
+#include "sm3_const_asm.s"

-// For P9 instruction emulation
-#define ESPERMW V21 // Endian swapping permute into BE
-#define TMP2 V22 // Temporary for STOREWORDS
+#define a V0
+#define e V1
+#define b V2
+#define f V3
+#define c V4
+#define g V5
+#define d V6
+#define h V7
+#define M0 V8
+#define M1 V9
+#define M2 V10
+#define M3 V11
+#define TMP0 V12
+#define TMP1 V13
+#define TMP2 V14
+#define TMP3 V15
+#define TMP4 V16
+#define TMP5 V17

-DATA ·mask+0x00(SB)/8, $0x0c0d0e0f08090a0b // Permute for vector doubleword endian swap
-DATA ·mask+0x08(SB)/8, $0x0405060700010203
+// For instruction emulation
+#define ESPERMW V31 // Endian swapping permute into BE
+
+DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
+DATA ·mask+0x08(SB)/8, $0x0302010007060504
 DATA ·mask+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix
 DATA ·mask+0x18(SB)/8, $0x0405060714151617
 DATA ·mask+0x20(SB)/8, $0x08090a0b18191a1b
@@ -20,27 +39,52 @@ DATA ·mask+0x30(SB)/8, $0x0001020304050607
 DATA ·mask+0x38(SB)/8, $0x1011121314151617
 DATA ·mask+0x40(SB)/8, $0x08090a0b0c0d0e0f
 DATA ·mask+0x48(SB)/8, $0x18191a1b1c1d1e1f
-DATA ·mask+0x50(SB)/8, $0x0b0a09080f0e0d0c // Permute for vector doubleword endian swap
-DATA ·mask+0x58(SB)/8, $0x0302010007060504
-GLOBL ·mask(SB), RODATA, $96
+GLOBL ·mask(SB), RODATA, $80

 #ifdef GOARCH_ppc64le
-#define NEEDS_ESPERM
+#define NEEDS_PERMW

-#define LOADWORDS(RA,RB,VT) \
-	LXVD2X (RA+RB), VT \
-	VPERM VT, VT, ESPERMW, VT
-
-#define STOREWORDS(VS,RA,RB) \
-	VPERM VS, VS, ESPERMW, TMP2 \
-	STXVD2X TMP2, (RA+RB)
+#define PPC64X_STXVD2X(VS,RA,RB) \
+	VPERM VS, VS, ESPERMW, TMP5 \ // byte swap per word
+	STXVD2X TMP5, (RA+RB)
+
+#define PPC64X_LXVW4X(RA,RB,VT) \
+	LXVW4X (RA+RB), VT \
+	VPERM VT, VT, ESPERMW, VT

 #else
-#define LOADWORDS(RA,RB,VT) LXVD2X (RA+RB), VT
-#define STOREWORDS(VS,RA,RB) STXVD2X VS, (RA+RB)
+#define PPC64X_STXVD2X(VS,RA,RB) STXVD2X VS, (RA+RB)
+#define PPC64X_LXVW4X(RA,RB,VT) LXVW4X (RA+RB), VT
 #endif // defined(GOARCH_ppc64le)

-#define TRANSPOSE_MATRIX(T0, T1, T2, T3, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) \
+// r = s <<< n
+#define PROLD(s, r, n) \
+	VSPLTISW $n, TMP5 \
+	VRLW s, TMP5, r
+
+#define loadWordByIndex(W, r, i) \
+	MOVD $(16*(i)), r \
+	LXVW4X (r)(statePtr), W
+
+// one word is 16 bytes
+#define prepare4Words \
+	PPC64X_LXVW4X(srcPtr1, srcPtrPtr, V16); \
+	PPC64X_LXVW4X(srcPtr2, srcPtrPtr, V17); \
+	PPC64X_LXVW4X(srcPtr3, srcPtrPtr, V18); \
+	PPC64X_LXVW4X(srcPtr4, srcPtrPtr, V19); \
+	TRANSPOSE_MATRIX(V16, V17, V18, V19); \
+	ADD $16, srcPtrPtr; \
+	STXVW4X V16, (wordPtr); \
+	ADD $16, wordPtr; \
+	STXVW4X V17, (wordPtr); \
+	ADD $16, wordPtr; \
+	STXVW4X V18, (wordPtr); \
+	ADD $16, wordPtr; \
+	STXVW4X V19, (wordPtr); \
+	ADD $16, wordPtr
+
+#define TRANSPOSE_MATRIX(T0, T1, T2, T3) \
 	VPERM T0, T1, M0, TMP0; \
 	VPERM T2, T3, M0, TMP1; \
 	VPERM T0, T1, M1, TMP2; \
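About the helpers introduced above: prepare4Words loads the same 16-byte word group from each of the four input blocks and runs TRANSPOSE_MATRIX over the four vectors, so that afterwards each vector holds word i of all four blocks, one block per 32-bit lane. The Go sketch below shows the equivalent scalar 4x4 word transpose; it is illustrative only (transpose4x4 is not a name from the repository).

package main

import "fmt"

// transpose4x4 mirrors what TRANSPOSE_MATRIX does with VPERM and the ·mask
// permute table: rows come in as four words of one block each, and columns
// come out as one word index across all four blocks (lane-parallel layout).
func transpose4x4(m [4][4]uint32) [4][4]uint32 {
	var t [4][4]uint32
	for i := 0; i < 4; i++ {
		for j := 0; j < 4; j++ {
			t[j][i] = m[i][j]
		}
	}
	return t
}

func main() {
	in := [4][4]uint32{
		{0x00, 0x01, 0x02, 0x03}, // words 0..3 of block 0
		{0x10, 0x11, 0x12, 0x13}, // words 0..3 of block 1
		{0x20, 0x21, 0x22, 0x23}, // words 0..3 of block 2
		{0x30, 0x31, 0x32, 0x33}, // words 0..3 of block 3
	}
	fmt.Println(transpose4x4(in)) // row i now holds word i of every block
}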
@@ -50,112 +94,309 @@ GLOBL ·mask(SB), RODATA, $96
 	VPERM TMP2, TMP3, M2, T2; \
 	VPERM TMP2, TMP3, M3, T3

-// transposeMatrix(dig **[8]uint32)
-TEXT ·transposeMatrix(SB),NOSPLIT,$0
-	MOVD dig+0(FP), R3
-	MOVD $8, R5
-	MOVD $16, R6
-	MOVD $24, R7
-	MOVD $32, R8
-	MOVD $48, R9
+#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
+	PROLD(a, TMP0, 12) \
+	VOR TMP0, TMP0, TMP1 \
+	VSPLTISW $const, TMP2 \
+	VADDUWM TMP2, TMP0, TMP0 \
+	VADDUWM e, TMP0, TMP0 \
+	PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
+	VXOR TMP2, TMP1, TMP0 \ // TMP0 = SS2
+	VXOR a, b, TMP1 \
+	VXOR c, TMP1, TMP1 \
+	VADDUWM TMP1, d, TMP1 \ // TMP1 = (a XOR b XOR c) + d
+	loadWordByIndex(TMP3, index) \
+	loadWordByIndex(TMP4, index+4) \
+	VXOR TMP3, TMP4, TMP4 \
+	VADDUWM TMP4, TMP1, TMP1 \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
+	VADDUWM TMP1, TMP0, TMP1 \ // TMP1 = TT1
+	VADDUWM h, TMP3, TMP3 \
+	VADDUWM TMP3, TMP2, TMP3 \ // Wt + h + SS1
+	VXOR e, f, TMP4 \
+	VXOR g, TMP4, TMP4 \
+	VADDUWM TMP4, TMP3, TMP3 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
+	VOR b, b, TMP4 \
+	PROLD(TMP4, b, 9) \ // b = b <<< 9
+	VOR TMP1, TMP1, h \ // h = TT1
+	VOR f, f, TMP4 \
+	PROLD(TMP4, f, 19) \ // f = f <<< 19
+	PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9
+	PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17
+	VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
+	VXOR TMP4, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

-#ifdef NEEDS_ESPERM
+#define MESSAGE_SCHEDULE(index) \
+	loadWordByIndex(TMP0, index+1) \ // Wj-3
+	PROLD(TMP0, TMP1, 15) \
+	loadWordByIndex(TMP0, index-12) \ // Wj-16
+	VXOR TMP0, TMP1, TMP0 \
+	loadWordByIndex(TMP1, index-5) \ // Wj-9
+	VXOR TMP0, TMP1, TMP0 \
+	PROLD(TMP0, TMP1, 15) \
+	PROLD(TMP1, TMP2, 8) \
+	VXOR TMP1, TMP0, TMP0 \
+	VXOR TMP2, TMP0, TMP0 \ // P1
+	loadWordByIndex(TMP1, index-9) \ // Wj-13
+	PROLD(TMP1, TMP2, 7) \
+	VXOR TMP2, TMP0, TMP0 \
+	loadWordByIndex(TMP1, index-2) \ // Wj-6
+	VXOR TMP1, TMP0, TMP1 \
+	STXVW4X TMP1, (wordPtr) \
+	ADD $16, wordPtr \
+
+#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index) \
+	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)
+
+#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
+	MESSAGE_SCHEDULE(index) \ // TMP1 is Wt+4 now, Pls do not use it
+	PROLD(a, TMP0, 12) \
+	VOR TMP0, TMP0, TMP4 \
+	VSPLTISW $const, TMP2 \
+	VADDUWM TMP2, TMP0, TMP0 \
+	VADDUWM e, TMP0, TMP0 \
+	PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
+	VXOR TMP2, TMP4, TMP0 \ // TMP0 = SS2
+	VOR a, b, TMP3 \
+	VAND a, b, TMP4 \
+	VAND c, TMP3, TMP3 \
+	VOR TMP4, TMP3, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c)
+	VADDUWM TMP4, d, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d
+	loadWordByIndex(TMP3, index) \ // Wj
+	VXOR TMP3, TMP1, TMP1 \ // Wj XOR Wj+4
+	VADDUWM TMP4, TMP1, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
+	VADDUWM TMP4, TMP0, TMP4 \ // TT1
+	VADDUWM h, TMP3, TMP3 \ // Wt + h
+	VADDUWM TMP2, TMP3, TMP3 \ // Wt + h + SS1
+	VXOR f, g, TMP1 \
+	VAND TMP1, e, TMP1 \
+	VXOR g, TMP1, TMP1 \ // (f XOR g) AND e XOR g
+	VADDUWM TMP3, TMP1, TMP3 \ // TT2
+	VOR b, b, TMP1 \
+	PROLD(TMP1, b, 9) \ // b = b <<< 9
+	VOR TMP4, TMP4, h \ // h = TT1
+	VOR f, f, TMP1 \
+	PROLD(TMP1, f, 19) \ // f = f <<< 19
+	PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9
+	PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17
+	VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9)
+	VXOR TMP1, TMP0, d \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
+
+// Used general purpose registers R1-R11.
+// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
+TEXT ·blockMultBy4(SB), NOSPLIT, $0
+	MOVD $8, R15
+	MOVD $16, R16
+	MOVD $24, R17
+	MOVD $32, R18
+	MOVD $48, R19
+#ifdef NEEDS_PERMW
 	MOVD $·mask(SB), R4
 	LVX (R4), ESPERMW
 	ADD $0x10, R4
 #else
 	MOVD $·mask+0x10(SB), R4
 #endif
-	LXVD2X (R0)(R4), V8
-	LXVD2X (R6)(R4), V9
-	LXVD2X (R8)(R4), V10
-	LXVD2X (R9)(R4), V11
+	LXVD2X (R0)(R4), M0
+	LXVD2X (R16)(R4), M1
+	LXVD2X (R18)(R4), M2
+	LXVD2X (R19)(R4), M3
+
+#define digPtr R11
+#define srcPtrPtr R5
+#define statePtr R4
+#define blockCount R6
+#define srcPtr1 R7
+#define srcPtr2 R8
+#define srcPtr3 R9
+#define srcPtr4 R10
+#define wordPtr R12
+
+	MOVD dig+0(FP), digPtr
+	MOVD p+8(FP), srcPtrPtr
+	MOVD buffer+16(FP), statePtr
+	MOVD blocks+24(FP), blockCount

-	MOVD (R0)(R3), R4
-	LXVW4X (R0)(R4), V0
-	LXVW4X (R6)(R4), V4
-	MOVD (R5)(R3), R4
-	LXVW4X (R0)(R4), V1
-	LXVW4X (R6)(R4), V5
-	MOVD (R6)(R3), R4
-	LXVW4X (R0)(R4), V2
-	LXVW4X (R6)(R4), V6
-	MOVD (R7)(R3), R4
-	LXVW4X (R0)(R4), V3
-	LXVW4X (R6)(R4), V7
+	// load state
+	MOVD (R0)(digPtr), R4
+	LXVW4X (R0)(R4), a
+	LXVW4X (R16)(R4), e
+	MOVD (R15)(digPtr), R4
+	LXVW4X (R0)(R4), b
+	LXVW4X (R16)(R4), f
+	MOVD (R16)(digPtr), R4
+	LXVW4X (R0)(R4), c
+	LXVW4X (R16)(R4), g
+	MOVD (R17)(digPtr), R4
+	LXVW4X (R0)(R4), d
+	LXVW4X (R16)(R4), h

-	TRANSPOSE_MATRIX(V0, V1, V2, V3, V8, V9, V10, V11, V12, V13, V14, V15)
-	TRANSPOSE_MATRIX(V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15)
+	TRANSPOSE_MATRIX(a, b, c, d)
+	TRANSPOSE_MATRIX(e, f, g, h)
+
+	MOVD (R0)(srcPtrPtr), srcPtr1
+	MOVD (R15)(srcPtrPtr), srcPtr2
+	MOVD (R16)(srcPtrPtr), srcPtr3
+	MOVD (R17)(srcPtrPtr), srcPtr4
+	MOVD $0, srcPtrPtr

-	MOVD (R0)(R3), R4
-	VSPLTISW $4, TMP2
-	VRLW V0, TMP2, V0
-	VRLW V4, TMP2, V4
-	STXVW4X V0, (R0)(R4)
-	STXVW4X V4, (R6)(R4)
-	MOVD (R5)(R3), R4
-	STXVW4X V1, (R0)(R4)
-	STXVW4X V5, (R6)(R4)
-	MOVD (R6)(R3), R4
-	STXVW4X V2, (R0)(R4)
-	STXVW4X V6, (R6)(R4)
-	MOVD (R7)(R3), R4
-	STXVW4X V3, (R0)(R4)
-	STXVW4X V7, (R6)(R4)
+	MOVD blockCount, CTR
+
+loop:
+	// Offload to VSR24-31 (aka FPR24-31)
+	XXLOR V0, V0, VS24
+	XXLOR V1, V1, VS25
+	XXLOR V2, V2, VS26
+	XXLOR V3, V3, VS27
+	XXLOR V4, V4, VS28
+	XXLOR V5, V5, VS29
+	XXLOR V6, V6, VS30
+	XXLOR V7, V7, VS31
+
+	// reset wordPtr
+	MOVD statePtr, wordPtr
+
+	// load message block
+	prepare4Words
+	prepare4Words
+	prepare4Words
+	prepare4Words
+
+	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
+	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
+	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
+	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
+	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
+	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
+	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
+	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
+	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
+	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
+	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
+	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)
+
+	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
+	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
+	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
+	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)
+
+	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
+	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
+	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
+	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
+	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
+	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
+	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
+	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
+	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
+	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
+	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
+	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
+	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
+	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
+	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
+	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
+	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
+	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
+	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
+	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
+	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
+	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
+	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
+	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
+	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
+	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
+	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
+	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
+	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
+	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
+	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
+	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
+	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
+
+	XXLXOR V0, VS24, V0
+	XXLXOR V1, VS25, V1
+	XXLXOR V2, VS26, V2
+	XXLXOR V3, VS27, V3
+	XXLXOR V4, VS28, V4
+	XXLXOR V5, VS29, V5
+	XXLXOR V6, VS30, V6
+	XXLXOR V7, VS31, V7
+
+	BDNZ loop
+
+end:
+	TRANSPOSE_MATRIX(a, b, c, d)
+	TRANSPOSE_MATRIX(e, f, g, h)
+
+	// save state
+	MOVD (R0)(digPtr), R4
+	STXVW4X a, (R0)(R4)
+	STXVW4X e, (R16)(R4)
+	MOVD (R15)(digPtr), R4
+	STXVW4X b, (R0)(R4)
+	STXVW4X f, (R16)(R4)
+	MOVD (R16)(digPtr), R4
+	STXVW4X c, (R0)(R4)
+	STXVW4X g, (R16)(R4)
+	MOVD (R17)(digPtr), R4
+	STXVW4X d, (R0)(R4)
+	STXVW4X h, (R16)(R4)

 	RET

-#ifdef GOARCH_ppc64le
-#define NEEDS_PERMW
-
-#define PPC64X_STXVD2X(VS,RA,RB) \
-	VPERM VS, VS, ESPERMW, TMP2 \
-	STXVD2X TMP2, (RA+RB)
-
-#else
-#define PPC64X_STXVD2X(VS,RA,RB) STXVD2X VS, (RA+RB)
-#endif // defined(GOARCH_ppc64le)
-
 // func copyResultsBy4(dig *uint32, dst *byte)
 TEXT ·copyResultsBy4(SB),NOSPLIT,$0
-	MOVD dig+0(FP), R3
+	MOVD dig+0(FP), R6
 	MOVD dst+8(FP), R4

 #ifdef NEEDS_PERMW
-	MOVD $·mask+0x50(SB), R5
+	MOVD $·mask+0x00(SB), R5
 	LVX (R5), ESPERMW
 #endif
+	MOVD $16, R5
+	MOVD $32, R16
+	MOVD $48, R17
+	MOVD $64, R18
+	MOVD $80, R19
+	MOVD $96, R8
+	MOVD $112, R9

-	LXVD2X (R0)(R3), V0
+	LXVD2X (R0)(R6), V0
 	PPC64X_STXVD2X(V0, R0, R4)

-	MOVD $16, R5
-	LXVD2X (R5)(R3), V0
+	LXVD2X (R5)(R6), V0
 	PPC64X_STXVD2X(V0, R5, R4)

-	ADD $16, R5
-	LXVD2X (R5)(R3), V0
-	PPC64X_STXVD2X(V0, R5, R4)
+	LXVD2X (R16)(R6), V0
+	PPC64X_STXVD2X(V0, R16, R4)

-	ADD $16, R5
-	LXVD2X (R5)(R3), V0
-	PPC64X_STXVD2X(V0, R5, R4)
+	LXVD2X (R17)(R6), V0
+	PPC64X_STXVD2X(V0, R17, R4)

-	ADD $16, R5
-	LXVD2X (R5)(R3), V0
-	PPC64X_STXVD2X(V0, R5, R4)
+	LXVD2X (R18)(R6), V0
+	PPC64X_STXVD2X(V0, R18, R4)

-	ADD $16, R5
-	LXVD2X (R5)(R3), V0
-	PPC64X_STXVD2X(V0, R5, R4)
+	LXVD2X (R19)(R6), V0
+	PPC64X_STXVD2X(V0, R19, R4)

-	ADD $16, R5
-	LXVD2X (R5)(R3), V0
-	PPC64X_STXVD2X(V0, R5, R4)
+	LXVD2X (R8)(R6), V0
+	PPC64X_STXVD2X(V0, R8, R4)

-	ADD $16, R5
-	LXVD2X (R5)(R3), V0
-	PPC64X_STXVD2X(V0, R5, R4)
+	LXVD2X (R9)(R6), V0
+	PPC64X_STXVD2X(V0, R9, R4)

 	RET
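For reference, the SS1/SS2/TT1/TT2 names in the round macros above follow the standard SM3 compression round; the vector code evaluates the same update on four 32-bit lanes (four message blocks) at a time. The scalar sketch below is a hedged illustration of one round for j >= 16, matching the boolean functions spelled out in the assembly comments; it assumes the round constant tj is supplied already rotated left by j, and round16to63 is an illustrative name, not library code.

package main

import (
	"fmt"
	"math/bits"
)

// round16to63 applies one SM3 round (j >= 16) to the state a..h held in v.
// w is Wj, w4 is Wj+4, and tj is the round constant already rotated by j.
func round16to63(v *[8]uint32, w, w4, tj uint32) {
	a, b, c, d, e, f, g, h := v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]
	ss1 := bits.RotateLeft32(bits.RotateLeft32(a, 12)+e+tj, 7)
	ss2 := ss1 ^ bits.RotateLeft32(a, 12)
	ff := (a & b) | (a & c) | (b & c) // (a AND b) OR (a AND c) OR (b AND c)
	gg := ((f ^ g) & e) ^ g           // (f XOR g) AND e XOR g
	tt1 := ff + d + ss2 + (w ^ w4)
	tt2 := gg + h + ss1 + w
	p0 := tt2 ^ bits.RotateLeft32(tt2, 9) ^ bits.RotateLeft32(tt2, 17)
	v[0], v[1], v[2], v[3] = tt1, a, bits.RotateLeft32(b, 9), c
	v[4], v[5], v[6], v[7] = p0, e, bits.RotateLeft32(f, 19), g
}

func main() {
	iv := [8]uint32{0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
		0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e} // SM3 IV
	round16to63(&iv, 0x11223344, 0x55667788, bits.RotateLeft32(0x7a879d8a, 20))
	fmt.Printf("%08x\n", iv)
}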
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.

-//go:build (amd64 || arm64 || s390x) && !purego
+//go:build (amd64 || arm64 || s390x || ppc64 || ppc64le) && !purego

 package sm3
