mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-26 04:06:18 +08:00
88 lines
2.1 KiB
ArmAsm
88 lines
2.1 KiB
ArmAsm
// Copyright 2013 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
//go:build !purego && (ppc64 || ppc64le)
|
|
|
|
#include "textflag.h"
|
|
|
|
// func addMulVVW256(z, x *uint, y uint) (c uint)
//
// Entry point for 256-bit operands. It only selects the iteration count
// and then tail-branches into addMulVVWy<>, which reads z, x and y and
// writes c through this function's argument frame.
TEXT ·addMulVVW256(SB), $0-32
	MOVD	$1, R6			// 1 pass x 4 words x 64 bits = 256 bits
	BR	addMulVVWy<>(SB)	// tail branch; the helper executes the RET
|
|
|
|
// func addMulVVW1024(z, x *uint, y uint) (c uint)
//
// Entry point for 1024-bit operands. It only selects the iteration count
// and then tail-branches into addMulVVWy<>, which reads z, x and y and
// writes c through this function's argument frame.
TEXT ·addMulVVW1024(SB), $0-32
	MOVD	$4, R6			// 4 passes x 4 words x 64 bits = 1024 bits
	BR	addMulVVWy<>(SB)	// tail branch; the helper executes the RET
|
|
|
|
// func addMulVVW1536(z, x *uint, y uint) (c uint)
//
// Entry point for 1536-bit operands. It only selects the iteration count
// and then tail-branches into addMulVVWy<>, which reads z, x and y and
// writes c through this function's argument frame.
TEXT ·addMulVVW1536(SB), $0-32
	MOVD	$6, R6			// 6 passes x 4 words x 64 bits = 1536 bits
	BR	addMulVVWy<>(SB)	// tail branch; the helper executes the RET
|
|
|
|
// func addMulVVW2048(z, x *uint, y uint) (c uint)
//
// Entry point for 2048-bit operands. It only selects the iteration count
// and then tail-branches into addMulVVWy<>, which reads z, x and y and
// writes c through this function's argument frame.
TEXT ·addMulVVW2048(SB), $0-32
	MOVD	$8, R6			// 8 passes x 4 words x 64 bits = 2048 bits
	BR	addMulVVWy<>(SB)	// tail branch; the helper executes the RET
|
|
|
|
// addMulVVWy<> is the shared body behind the fixed-size entry points
// above. For every 64-bit word it computes
//
//	hi:lo = x[i]*y + z[i] + c;  z[i] = lo;  c = hi
//
// and finally stores the last carry word c as the result.
//
// Contract: this routine is meant to be reached only by a tail branch
// from those entry points, so z, x, y and c are addressed at the
// callers' FP offsets. R6 must hold the word count divided by 4 (four
// words are handled per pass) and must be > 0, because the loop is
// bottom-tested with BDNZ. If other callers are added this routine
// might need to change.
//
// Register roles:
//	R3  = cursor into z	R4 = cursor into x
//	R5  = y			R9 = running carry word c
//	CTR = passes remaining
TEXT addMulVVWy<>(SB), NOSPLIT, $0
	MOVD	y+16(FP), R5
	MOVD	x+8(FP), R4
	MOVD	z+0(FP), R3

	MOVD	R6, CTR		// one CTR tick per four-word pass
	MOVD	$0, R9		// c = 0
	PCALIGN	$16

quad:
	// Fetch all eight operands first; no store is issued until every
	// load of this pass has been done.
	MOVD	0(R4), R14	// x[i]
	MOVD	0(R3), R15	// z[i]
	MOVD	8(R4), R16	// x[i+1]
	MOVD	8(R3), R17	// z[i+1]
	MOVD	16(R4), R18	// x[i+2]
	MOVD	16(R3), R19	// z[i+2]
	MOVD	24(R4), R20	// x[i+3]
	MOVD	24(R3), R21	// z[i+3]

	// word i: R11:R10 = x[i]*y + z[i] + c
	MULHDU	R5, R14, R11	// high half of x[i]*y
	MULLD	R5, R14, R10	// low half of x[i]*y
	ADDC	R15, R10	// low += z[i], carry-out in CA
	ADDZE	R11		// high += CA
	ADDC	R9, R10		// low += c, carry-out in CA
	ADDZE	R11, R9		// c = high + CA
	MOVD	R10, 0(R3)	// z[i] = low

	// word i+1: R15:R14 = x[i+1]*y + z[i+1] + c
	MULHDU	R5, R16, R15	// high half of x[i+1]*y
	MULLD	R5, R16, R14	// low half of x[i+1]*y
	ADDC	R17, R14	// low += z[i+1]
	ADDZE	R15		// high += CA
	ADDC	R9, R14		// low += c
	ADDZE	R15, R9		// c = high + CA
	MOVD	R14, 8(R3)	// z[i+1] = low

	// word i+2: R17:R16 = x[i+2]*y + z[i+2] + c
	MULHDU	R5, R18, R17	// high half of x[i+2]*y
	MULLD	R5, R18, R16	// low half of x[i+2]*y
	ADDC	R19, R16	// low += z[i+2]
	ADDZE	R17		// high += CA
	ADDC	R9, R16		// low += c
	ADDZE	R17, R9		// c = high + CA
	MOVD	R16, 16(R3)	// z[i+2] = low

	// word i+3: R19:R18 = x[i+3]*y + z[i+3] + c
	MULHDU	R5, R20, R19	// high half of x[i+3]*y
	MULLD	R5, R20, R18	// low half of x[i+3]*y
	ADDC	R21, R18	// low += z[i+3]
	ADDZE	R19		// high += CA
	ADDC	R9, R18		// low += c
	ADDZE	R19, R9		// c = high + CA
	MOVD	R18, 24(R3)	// z[i+3] = low

	// Step both cursors past the four words just handled (4 * 8 bytes).
	ADD	$32, R4
	ADD	$32, R3
	BDNZ	quad		// decrement CTR, repeat while it is non-zero

	MOVD	R9, c+24(FP)	// return the final carry word
	RET
|