// Mirror of https://github.com/emmansun/gmsm.git (synced 2025-04-24).
// SM4-XTS for ARM64 using the SM4 NI (sm4e) instruction extension.
//go:build arm64 && !purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// Vector-register role assignments shared by every function in this file.

// B0-B7: the eight 16-byte data blocks processed per wide-loop iteration.
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

// POLY: GF(2^128) reduction constant used for tweak multiplication.
#define POLY V8
// ZERO: all-zero vector (scratch for the included tweak macros).
#define ZERO V9
// TW: the current XTS tweak value.
#define TW V10

// T0-T7: the eight per-block tweaks derived for the wide path.
#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14
#define T4 V15
#define T5 V16
#define T6 V17
#define T7 V18

// RK0-RK7: the 32 SM4 round keys, four 32-bit keys per register
// (loaded by two 64-byte VLD1.P below).
#define RK0 V19
#define RK1 V20
#define RK2 V21
#define RK3 V22
#define RK4 V23
#define RK5 V24
#define RK6 V25
#define RK7 V26

// K0/K1: NOTE(review): not referenced directly in this file — presumably
// scratch for the included macros; confirm against the includes.
#define K0 V27
#define K1 V28

// Provides sm4eEnc1block()/sm4eEnc8blocks() (SM4 NI round sequences).
#include "sm4ni_macros_arm64.s"
// Provides prepare8Tweaks/prepareGB8Tweaks/mul2Inline/mul2GBInline.
#include "xts_macros_arm64.s"
// load8blocks: load eight consecutive 16-byte blocks from srcPtr
// (advancing it by 128), XOR each block with its per-block tweak T0-T7
// (first half of the XTS xor-encrypt-xor construction), then VREV32
// each block into the 32-bit-word byte order expected by the SM4 NI
// instructions.
#define load8blocks \
	VLD1.P 64(srcPtr), [B0.S4, B1.S4, B2.S4, B3.S4]; \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	\
	VLD1.P 64(srcPtr), [B4.S4, B5.S4, B6.S4, B7.S4]; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VREV32 B0.B16, B0.B16; \
	VREV32 B1.B16, B1.B16; \
	VREV32 B2.B16, B2.B16; \
	VREV32 B3.B16, B3.B16; \
	VREV32 B4.B16, B4.B16; \
	VREV32 B5.B16, B5.B16; \
	VREV32 B6.B16, B6.B16; \
	VREV32 B7.B16, B7.B16
// store8blocks: XOR the eight processed blocks with their tweaks T0-T7
// again (second half of the XTS construction) and store them to dstPtr,
// advancing it by 128. There is no VREV32 here — presumably
// sm4eEnc8blocks() already restores byte order; confirm against
// sm4ni_macros_arm64.s.
#define store8blocks \
	VEOR T0.B16, B0.B16, B0.B16; \
	VEOR T1.B16, B1.B16, B1.B16; \
	VEOR T2.B16, B2.B16, B2.B16; \
	VEOR T3.B16, B3.B16, B3.B16; \
	VEOR T4.B16, B4.B16, B4.B16; \
	VEOR T5.B16, B5.B16, B5.B16; \
	VEOR T6.B16, B6.B16, B6.B16; \
	VEOR T7.B16, B7.B16, B7.B16; \
	\
	VST1.P [B0.S4, B1.S4, B2.S4, B3.S4], 64(dstPtr); \
	VST1.P [B4.S4, B5.S4, B6.S4, B7.S4], 64(dstPtr)
// General-purpose register roles; the arguments are loaded into these
// at the top of each TEXT below.

// Destination pointer, post-incremented as blocks are written.
#define dstPtr R2
// Source pointer, post-incremented as blocks are read.
#define srcPtr R3
// Pointer to the expanded SM4 round keys (the xk argument).
#define rk R0
// Pointer to the 16-byte tweak (read on entry, written back on exit).
#define twPtr R1
// Remaining source length in bytes.
#define srcPtrLen R4
// Temporary used to build the POLY constant.
#define I R5
// func encryptSm4NiXts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 encryption (standard polynomial, feedback constant 0x87)
// using the ARMv8 SM4 NI instructions. Encrypts 8 blocks per iteration
// while >=128 bytes of src remain, then one block at a time, and
// finally performs ciphertext stealing for a 1..15-byte tail. The
// advanced tweak is stored back through twPtr before returning.
TEXT ·encryptSm4NiXts(SB),0,$128-64
	MOVD	xk+0(FP), rk
	MOVD	tweak+8(FP), twPtr
	MOVD	dst+16(FP), dstPtr
	MOVD	src+40(FP), srcPtr
	MOVD	src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Standard XTS GF(2^128) reduction constant 0x87 in the low
	// doubleword of POLY.
	MOVD	$0x87, I
	VMOV	I, POLY.D[0]

	// For SM4 round keys are stored in: RK0 .. RK7
	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	// Load the incoming tweak.
	VLD1	(twPtr), [TW.B16]

xtsSm4EncOctets:
	// Wide path: 8 blocks (128 bytes) per iteration.
	CMP	$128, srcPtrLen
	BLT	xtsSm4EncSingles
	SUB	$128, srcPtrLen
	prepare8Tweaks              // derive T0..T7 from TW (see xts_macros_arm64.s)
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B	xtsSm4EncOctets

xtsSm4EncSingles:
	// Single-block path: 16 bytes per iteration.
	CMP	$16, srcPtrLen
	BLT	xtsSm4EncTail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16  // pre-whiten with tweak
	VREV32	B0.B16, B0.B16          // word byte order for SM4 NI
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16  // post-whiten with tweak
	VST1.P	[B0.S4], 16(dstPtr)

	mul2Inline                      // advance tweak: TW = TW * x in GF(2^128)
	B	xtsSm4EncSingles

xtsSm4EncTail:
	// Ciphertext stealing for the final 1..15 bytes.
	CBZ	srcPtrLen, xtsSm4EncDone
	SUB	$16, dstPtr, R7         // R7 = last full ciphertext block written
	MOVD	R7, R9                  // R9 = where the stolen block is re-stored
	MOVD	RSP, R8                 // R8 = 16-byte scratch buffer on the stack
	VLD1	(R7), [B0.B16]
	VST1	[B0.B16], (R8)          // scratch = last ciphertext block

	// Copy the tail: srcPtrLen plaintext bytes over the front of the
	// scratch block, and the same count of bytes from the previous
	// ciphertext block out to dstPtr. Driven by bits 3..0 of srcPtrLen.
	TBZ	$3, srcPtrLen, less_than8
	MOVD.P	8(srcPtr), R11
	MOVD.P	R11, 8(R8)
	MOVD.P	8(R7), R12
	MOVD.P	R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P	4(srcPtr), R11
	MOVWU.P	R11, 4(R8)
	MOVWU.P	4(R7), R12
	MOVWU.P	R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P	2(srcPtr), R11
	MOVHU.P	R11, 2(R8)
	MOVHU.P	2(R7), R12
	MOVHU.P	R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU	(srcPtr), R11
	MOVBU	R11, (R8)
	MOVBU	(R7), R12
	MOVBU	R12, (dstPtr)

xtsSm4EncTailEnc:
	// Encrypt the reassembled scratch block and overwrite the previous
	// ciphertext block with it.
	VLD1	(RSP), [B0.B16]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1	[B0.B16], (R9)

xtsSm4EncDone:
	// Persist the final tweak for the caller.
	VST1	[TW.B16], (twPtr)
	RET
// func encryptSm4NiXtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// GB-variant XTS-SM4 encryption. Identical flow to encryptSm4NiXts,
// but the tweak multiplication uses the bit-reversed representation
// (POLY = 0xE1<<56 in the high doubleword, mul2GBInline /
// prepareGB8Tweaks) — presumably per the GB/T XTS specification;
// confirm against xts_macros_arm64.s.
TEXT ·encryptSm4NiXtsGB(SB),0,$128-64
	MOVD	xk+0(FP), rk
	MOVD	tweak+8(FP), twPtr
	MOVD	dst+16(FP), dstPtr
	MOVD	src+40(FP), srcPtr
	MOVD	src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// GB-variant reduction constant: 0xE1 << 56 in the high doubleword
	// (bit-reversed form of the standard 0x87 constant).
	MOVD	$0xE1, I
	LSL	$56, I
	VMOV	I, POLY.D[1]

	// For SM4 round keys are stored in: RK0 .. RK7
	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	// Load the incoming tweak.
	VLD1	(twPtr), [TW.B16]

xtsSm4EncOctets:
	// Wide path: 8 blocks (128 bytes) per iteration.
	CMP	$128, srcPtrLen
	BLT	xtsSm4EncSingles
	SUB	$128, srcPtrLen
	prepareGB8Tweaks            // derive T0..T7 from TW (GB variant)
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B	xtsSm4EncOctets

xtsSm4EncSingles:
	// Single-block path: 16 bytes per iteration.
	CMP	$16, srcPtrLen
	BLT	xtsSm4EncTail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16  // pre-whiten with tweak
	VREV32	B0.B16, B0.B16          // word byte order for SM4 NI
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16  // post-whiten with tweak
	VST1.P	[B0.S4], 16(dstPtr)

	mul2GBInline                    // advance tweak (GB-variant multiply)
	B	xtsSm4EncSingles

xtsSm4EncTail:
	// Ciphertext stealing for the final 1..15 bytes.
	CBZ	srcPtrLen, xtsSm4EncDone
	SUB	$16, dstPtr, R7         // R7 = last full ciphertext block written
	MOVD	R7, R9                  // R9 = where the stolen block is re-stored
	MOVD	RSP, R8                 // R8 = 16-byte scratch buffer on the stack
	VLD1	(R7), [B0.B16]
	VST1	[B0.B16], (R8)          // scratch = last ciphertext block

	// Copy the tail: srcPtrLen plaintext bytes over the front of the
	// scratch block, and the same count of bytes from the previous
	// ciphertext block out to dstPtr. Driven by bits 3..0 of srcPtrLen.
	TBZ	$3, srcPtrLen, less_than8
	MOVD.P	8(srcPtr), R11
	MOVD.P	R11, 8(R8)
	MOVD.P	8(R7), R12
	MOVD.P	R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P	4(srcPtr), R11
	MOVWU.P	R11, 4(R8)
	MOVWU.P	4(R7), R12
	MOVWU.P	R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P	2(srcPtr), R11
	MOVHU.P	R11, 2(R8)
	MOVHU.P	2(R7), R12
	MOVHU.P	R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4EncTailEnc
	MOVBU	(srcPtr), R11
	MOVBU	R11, (R8)
	MOVBU	(R7), R12
	MOVBU	R12, (dstPtr)

xtsSm4EncTailEnc:
	// Encrypt the reassembled scratch block and overwrite the previous
	// ciphertext block with it.
	VLD1	(RSP), [B0.B16]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1	[B0.B16], (R9)

xtsSm4EncDone:
	// Persist the final tweak for the caller.
	VST1	[TW.B16], (twPtr)
	RET
// func decryptSm4NiXts(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// XTS-SM4 decryption (standard 0x87 polynomial). The caller supplies
// the decryption key schedule in xk; the rounds themselves reuse the
// sm4eEnc* macros. Whole blocks are processed while more than 31 bytes
// remain; the final <=31 bytes go through the ciphertext-stealing
// tail, which consumes the last two tweaks in swapped order.
TEXT ·decryptSm4NiXts(SB),0,$128-64
	MOVD	xk+0(FP), rk
	MOVD	tweak+8(FP), twPtr
	MOVD	dst+16(FP), dstPtr
	MOVD	src+40(FP), srcPtr
	MOVD	src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Standard XTS GF(2^128) reduction constant 0x87 in the low
	// doubleword of POLY.
	MOVD	$0x87, I
	VMOV	I, POLY.D[0]

	// For SM4 round keys are stored in: RK0 .. RK7
	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	// Load the incoming tweak.
	VLD1	(twPtr), [TW.B16]

xtsSm4DecOctets:
	// Wide path: 8 blocks (128 bytes) per iteration.
	CMP	$128, srcPtrLen
	BLT	xtsSm4DecSingles
	SUB	$128, srcPtrLen

	prepare8Tweaks              // derive T0..T7 from TW (see xts_macros_arm64.s)
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B	xtsSm4DecOctets

xtsSm4DecSingles:
	// Single-block path. Stops at 32 (not 16) remaining bytes so the
	// last one or two blocks are left for the tail, which may need the
	// final two tweaks in reverse order for ciphertext stealing.
	CMP	$32, srcPtrLen
	BLT	xtsSm4DecTail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16  // pre-whiten with tweak
	VREV32	B0.B16, B0.B16          // word byte order for SM4 NI
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16  // post-whiten with tweak
	VST1.P	[B0.S4], 16(dstPtr)

	mul2Inline                      // advance tweak
	B	xtsSm4DecSingles

xtsSm4DecTail:
	CBZ	srcPtrLen, xtsSm4DecDone

	// Exactly one full block left: decrypt it normally.
	CMP	$16, srcPtrLen
	BEQ	xtsSm4DecLastBlock

	// 17..31 bytes left: ciphertext stealing. The last full ciphertext
	// block is decrypted with the NEXT tweak; the reassembled stolen
	// block below is decrypted with the current (saved) tweak.
	VMOV	TW.B16, B4.B16          // save current tweak in B4
	mul2Inline                      // TW = next tweak
	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1.P	[B0.S4], 16(dstPtr)
	VMOV	B4.B16, TW.B16          // restore tweak for the stolen block
	VST1	[B0.B16], (RSP)         // scratch = just-decrypted block

	SUB	$16, dstPtr, R7         // R7 = block just written above
	MOVD	R7, R9                  // R9 = where the stolen block is re-stored
	MOVD	RSP, R8                 // R8 = 16-byte scratch buffer on the stack

	// Copy the tail: srcPtrLen ciphertext bytes over the front of the
	// scratch block, and the same count of decrypted bytes out to
	// dstPtr. Driven by bits 3..0 of srcPtrLen.
	TBZ	$3, srcPtrLen, less_than8
	MOVD.P	8(srcPtr), R11
	MOVD.P	R11, 8(R8)
	MOVD.P	8(R7), R12
	MOVD.P	R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P	4(srcPtr), R11
	MOVWU.P	R11, 4(R8)
	MOVWU.P	4(R7), R12
	MOVWU.P	R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P	2(srcPtr), R11
	MOVHU.P	R11, 2(R8)
	MOVHU.P	2(R7), R12
	MOVHU.P	R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4DecTailDec
	MOVBU	(srcPtr), R11
	MOVBU	R11, (R8)
	MOVBU	(R7), R12
	MOVBU	R12, (dstPtr)

xtsSm4DecTailDec:
	// Decrypt the reassembled scratch block with the saved tweak and
	// overwrite the previously written block with it.
	VLD1	(RSP), [B0.B16]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1	[B0.B16], (R9)

	B	xtsSm4DecDone

xtsSm4DecLastBlock:
	// No stealing needed: decrypt the final full block normally.
	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1.P	[B0.S4], 16(dstPtr)
	mul2Inline                      // advance tweak past the final block

xtsSm4DecDone:
	// Persist the final tweak for the caller.
	VST1	[TW.B16], (twPtr)
	RET
// func decryptSm4NiXtsGB(xk *uint32, tweak *[BlockSize]byte, dst, src []byte)
//
// GB-variant XTS-SM4 decryption. Identical flow to decryptSm4NiXts,
// but the tweak multiplication uses the bit-reversed representation
// (POLY = 0xE1<<56 in the high doubleword, mul2GBInline /
// prepareGB8Tweaks) — presumably per the GB/T XTS specification;
// confirm against xts_macros_arm64.s.
TEXT ·decryptSm4NiXtsGB(SB),0,$128-64
	MOVD	xk+0(FP), rk
	MOVD	tweak+8(FP), twPtr
	MOVD	dst+16(FP), dstPtr
	MOVD	src+40(FP), srcPtr
	MOVD	src_len+48(FP), srcPtrLen

	VEOR	POLY.B16, POLY.B16, POLY.B16
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// GB-variant reduction constant: 0xE1 << 56 in the high doubleword
	// (bit-reversed form of the standard 0x87 constant).
	MOVD	$0xE1, I
	LSL	$56, I
	VMOV	I, POLY.D[1]

	// For SM4 round keys are stored in: RK0 .. RK7
	VLD1.P	64(rk), [RK0.S4, RK1.S4, RK2.S4, RK3.S4]
	VLD1.P	64(rk), [RK4.S4, RK5.S4, RK6.S4, RK7.S4]

	// Load the incoming tweak.
	VLD1	(twPtr), [TW.B16]

xtsSm4DecOctets:
	// Wide path: 8 blocks (128 bytes) per iteration.
	CMP	$128, srcPtrLen
	BLT	xtsSm4DecSingles
	SUB	$128, srcPtrLen

	prepareGB8Tweaks            // derive T0..T7 from TW (GB variant)
	load8blocks
	sm4eEnc8blocks()
	store8blocks

	B	xtsSm4DecOctets

xtsSm4DecSingles:
	// Single-block path. Stops at 32 (not 16) remaining bytes so the
	// last one or two blocks are left for the tail, which may need the
	// final two tweaks in reverse order for ciphertext stealing.
	CMP	$32, srcPtrLen
	BLT	xtsSm4DecTail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16  // pre-whiten with tweak
	VREV32	B0.B16, B0.B16          // word byte order for SM4 NI
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16  // post-whiten with tweak
	VST1.P	[B0.S4], 16(dstPtr)

	mul2GBInline                    // advance tweak (GB-variant multiply)
	B	xtsSm4DecSingles

xtsSm4DecTail:
	CBZ	srcPtrLen, xtsSm4DecDone

	// Exactly one full block left: decrypt it normally.
	CMP	$16, srcPtrLen
	BEQ	xtsSm4DecLastBlock

	// 17..31 bytes left: ciphertext stealing. The last full ciphertext
	// block is decrypted with the NEXT tweak; the reassembled stolen
	// block below is decrypted with the current (saved) tweak.
	VMOV	TW.B16, B4.B16          // save current tweak in B4
	mul2GBInline                    // TW = next tweak
	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1.P	[B0.S4], 16(dstPtr)
	VMOV	B4.B16, TW.B16          // restore tweak for the stolen block
	VST1	[B0.B16], (RSP)         // scratch = just-decrypted block

	SUB	$16, dstPtr, R7         // R7 = block just written above
	MOVD	R7, R9                  // R9 = where the stolen block is re-stored
	MOVD	RSP, R8                 // R8 = 16-byte scratch buffer on the stack

	// Copy the tail: srcPtrLen ciphertext bytes over the front of the
	// scratch block, and the same count of decrypted bytes out to
	// dstPtr. Driven by bits 3..0 of srcPtrLen.
	TBZ	$3, srcPtrLen, less_than8
	MOVD.P	8(srcPtr), R11
	MOVD.P	R11, 8(R8)
	MOVD.P	8(R7), R12
	MOVD.P	R12, 8(dstPtr)

less_than8:
	TBZ	$2, srcPtrLen, less_than4
	MOVWU.P	4(srcPtr), R11
	MOVWU.P	R11, 4(R8)
	MOVWU.P	4(R7), R12
	MOVWU.P	R12, 4(dstPtr)

less_than4:
	TBZ	$1, srcPtrLen, less_than2
	MOVHU.P	2(srcPtr), R11
	MOVHU.P	R11, 2(R8)
	MOVHU.P	2(R7), R12
	MOVHU.P	R12, 2(dstPtr)

less_than2:
	TBZ	$0, srcPtrLen, xtsSm4DecTailDec
	MOVBU	(srcPtr), R11
	MOVBU	R11, (R8)
	MOVBU	(R7), R12
	MOVBU	R12, (dstPtr)

xtsSm4DecTailDec:
	// Decrypt the reassembled scratch block with the saved tweak and
	// overwrite the previously written block with it.
	VLD1	(RSP), [B0.B16]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1	[B0.B16], (R9)

	B	xtsSm4DecDone

xtsSm4DecLastBlock:
	// No stealing needed: decrypt the final full block normally.
	VLD1.P	16(srcPtr), [B0.S4]
	VEOR	TW.B16, B0.B16, B0.B16
	VREV32	B0.B16, B0.B16
	sm4eEnc1block()
	VEOR	TW.B16, B0.B16, B0.B16
	VST1.P	[B0.S4], 16(dstPtr)
	mul2GBInline                    // advance tweak past the final block

xtsSm4DecDone:
	// Persist the final tweak for the caller.
	VST1	[TW.B16], (twPtr)
	RET