mirror of
https://github.com/emmansun/gmsm.git
synced 2025-04-22 10:16:18 +08:00
sm4: ppc64x, gcm step 1 (without AESNI) #249
This commit is contained in:
parent
37493fe3df
commit
78d55a69bc
258
sm4/gcm_ppc64x.go
Normal file
258
sm4/gcm_ppc64x.go
Normal file
@ -0,0 +1,258 @@
|
||||
// Copyright 2024 Sun Yimin. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build (ppc64 || ppc64le) && !purego
|
||||
|
||||
package sm4
|
||||
|
||||
import (
|
||||
"crypto/cipher"
|
||||
"crypto/subtle"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"runtime"
|
||||
|
||||
"github.com/emmansun/gmsm/internal/alias"
|
||||
)
|
||||
|
||||
// Assert that sm4CipherAsm implements the gcmAble interface.
|
||||
var _ gcmAble = (*sm4CipherAsm)(nil)
|
||||
|
||||
var errOpen = errors.New("cipher: message authentication failed")
|
||||
|
||||
//go:noescape
|
||||
func gcmInit(productTable *[256]byte, h []byte)
|
||||
|
||||
//go:noescape
|
||||
func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
|
||||
|
||||
//go:noescape
|
||||
func gcmMul(output []byte, productTable *[256]byte)
|
||||
|
||||
const (
|
||||
gcmBlockSize = 16
|
||||
gcmTagSize = 16
|
||||
gcmMinimumTagSize = 12 // NIST SP 800-38D recommends tags with 12 or more bytes.
|
||||
gcmStandardNonceSize = 12
|
||||
)
|
||||
|
||||
type gcmAsm struct {
|
||||
cipher *sm4CipherAsm
|
||||
nonceSize int
|
||||
tagSize int
|
||||
// productTable contains pre-computed multiples of the binary-field
|
||||
// element used in GHASH.
|
||||
productTable [256]byte
|
||||
}
|
||||
|
||||
// NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only
|
||||
// called by [crypto/cipher.NewGCM] via the gcmAble interface.
|
||||
func (c *sm4CipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
|
||||
var h1, h2 uint64
|
||||
g := &gcmAsm{cipher: c, nonceSize: nonceSize, tagSize: tagSize}
|
||||
|
||||
hle := make([]byte, gcmBlockSize)
|
||||
|
||||
c.Encrypt(hle, hle)
|
||||
|
||||
// Reverse the bytes in each 8 byte chunk
|
||||
// Load little endian, store big endian
|
||||
if runtime.GOARCH == "ppc64le" {
|
||||
h1 = binary.LittleEndian.Uint64(hle[:8])
|
||||
h2 = binary.LittleEndian.Uint64(hle[8:])
|
||||
} else {
|
||||
h1 = binary.BigEndian.Uint64(hle[:8])
|
||||
h2 = binary.BigEndian.Uint64(hle[8:])
|
||||
}
|
||||
binary.BigEndian.PutUint64(hle[:8], h1)
|
||||
binary.BigEndian.PutUint64(hle[8:], h2)
|
||||
gcmInit(&g.productTable, hle)
|
||||
|
||||
return g, nil
|
||||
}
|
||||
|
||||
func (g *gcmAsm) NonceSize() int {
|
||||
return g.nonceSize
|
||||
}
|
||||
|
||||
func (g *gcmAsm) Overhead() int {
|
||||
return g.tagSize
|
||||
}
|
||||
|
||||
// deriveCounter computes the initial GCM counter state from the given nonce.
|
||||
func (g *gcmAsm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
|
||||
if len(nonce) == gcmStandardNonceSize {
|
||||
copy(counter[:], nonce)
|
||||
counter[gcmBlockSize-1] = 1
|
||||
} else {
|
||||
var hash [16]byte
|
||||
g.paddedGHASH(&hash, nonce)
|
||||
lens := gcmLengths(0, uint64(len(nonce))*8)
|
||||
g.paddedGHASH(&hash, lens[:])
|
||||
copy(counter[:], hash[:])
|
||||
}
|
||||
}
|
||||
|
||||
// counterCrypt encrypts in using AES in counter mode and places the result
|
||||
// into out. counter is the initial count value and will be updated with the next
|
||||
// count value. The length of out must be greater than or equal to the length
|
||||
// of in.
|
||||
// counterCryptASM implements counterCrypt which then allows the loop to
|
||||
// be unrolled and optimized.
|
||||
func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
|
||||
var mask [gcmBlockSize]byte
|
||||
|
||||
for len(in) >= gcmBlockSize {
|
||||
// Hint to avoid bounds check
|
||||
_, _ = in[15], out[15]
|
||||
g.cipher.Encrypt(mask[:], counter[:])
|
||||
gcmInc32(counter)
|
||||
|
||||
// XOR 16 bytes each loop iteration in 8 byte chunks
|
||||
in0 := binary.LittleEndian.Uint64(in[0:])
|
||||
in1 := binary.LittleEndian.Uint64(in[8:])
|
||||
m0 := binary.LittleEndian.Uint64(mask[:8])
|
||||
m1 := binary.LittleEndian.Uint64(mask[8:])
|
||||
binary.LittleEndian.PutUint64(out[:8], in0^m0)
|
||||
binary.LittleEndian.PutUint64(out[8:], in1^m1)
|
||||
out = out[16:]
|
||||
in = in[16:]
|
||||
}
|
||||
|
||||
if len(in) > 0 {
|
||||
g.cipher.Encrypt(mask[:], counter[:])
|
||||
gcmInc32(counter)
|
||||
// XOR leftover bytes
|
||||
for i, inb := range in {
|
||||
out[i] = inb ^ mask[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// increments the rightmost 32-bits of the count value by 1.
|
||||
func gcmInc32(counterBlock *[16]byte) {
|
||||
c := counterBlock[len(counterBlock)-4:]
|
||||
x := binary.BigEndian.Uint32(c) + 1
|
||||
binary.BigEndian.PutUint32(c, x)
|
||||
}
|
||||
|
||||
// paddedGHASH pads data with zeroes until its length is a multiple of
|
||||
// 16-bytes. It then calculates a new value for hash using the ghash
|
||||
// algorithm.
|
||||
func (g *gcmAsm) paddedGHASH(hash *[16]byte, data []byte) {
|
||||
if siz := len(data) - (len(data) % gcmBlockSize); siz > 0 {
|
||||
gcmHash(hash[:], &g.productTable, data[:], siz)
|
||||
data = data[siz:]
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var s [16]byte
|
||||
copy(s[:], data)
|
||||
gcmHash(hash[:], &g.productTable, s[:], len(s))
|
||||
}
|
||||
}
|
||||
|
||||
// auth calculates GHASH(ciphertext, additionalData), masks the result with
|
||||
// tagMask and writes the result to out.
|
||||
func (g *gcmAsm) auth(out, ciphertext, aad []byte, tagMask *[gcmTagSize]byte) {
|
||||
var hash [16]byte
|
||||
g.paddedGHASH(&hash, aad)
|
||||
g.paddedGHASH(&hash, ciphertext)
|
||||
lens := gcmLengths(uint64(len(aad))*8, uint64(len(ciphertext))*8)
|
||||
g.paddedGHASH(&hash, lens[:])
|
||||
|
||||
copy(out, hash[:])
|
||||
for i := range out {
|
||||
out[i] ^= tagMask[i]
|
||||
}
|
||||
}
|
||||
|
||||
// Seal encrypts and authenticates plaintext. See the [cipher.AEAD] interface for
|
||||
// details.
|
||||
func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
|
||||
if len(nonce) != g.nonceSize {
|
||||
panic("cipher: incorrect nonce length given to GCM")
|
||||
}
|
||||
if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize {
|
||||
panic("cipher: message too large for GCM")
|
||||
}
|
||||
|
||||
ret, out := alias.SliceForAppend(dst, len(plaintext)+g.tagSize)
|
||||
if alias.InexactOverlap(out[:len(plaintext)], plaintext) {
|
||||
panic("cipher: invalid buffer overlap")
|
||||
}
|
||||
|
||||
var counter, tagMask [gcmBlockSize]byte
|
||||
g.deriveCounter(&counter, nonce)
|
||||
|
||||
g.cipher.Encrypt(tagMask[:], counter[:])
|
||||
gcmInc32(&counter)
|
||||
|
||||
g.counterCrypt(out, plaintext, &counter)
|
||||
g.auth(out[len(plaintext):], out[:len(plaintext)], data, &tagMask)
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// Open authenticates and decrypts ciphertext. See the [cipher.AEAD] interface
|
||||
// for details.
|
||||
func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
|
||||
if len(nonce) != g.nonceSize {
|
||||
panic("cipher: incorrect nonce length given to GCM")
|
||||
}
|
||||
if len(ciphertext) < g.tagSize {
|
||||
return nil, errOpen
|
||||
}
|
||||
if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) {
|
||||
return nil, errOpen
|
||||
}
|
||||
|
||||
tag := ciphertext[len(ciphertext)-g.tagSize:]
|
||||
ciphertext = ciphertext[:len(ciphertext)-g.tagSize]
|
||||
|
||||
var counter, tagMask [gcmBlockSize]byte
|
||||
g.deriveCounter(&counter, nonce)
|
||||
|
||||
g.cipher.Encrypt(tagMask[:], counter[:])
|
||||
gcmInc32(&counter)
|
||||
|
||||
var expectedTag [gcmTagSize]byte
|
||||
g.auth(expectedTag[:], ciphertext, data, &tagMask)
|
||||
|
||||
ret, out := alias.SliceForAppend(dst, len(ciphertext))
|
||||
if alias.InexactOverlap(out, ciphertext) {
|
||||
panic("cipher: invalid buffer overlap")
|
||||
}
|
||||
|
||||
if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 {
|
||||
// clear(out)
|
||||
for i := range out {
|
||||
out[i] = 0
|
||||
}
|
||||
return nil, errOpen
|
||||
}
|
||||
|
||||
g.counterCrypt(out, ciphertext, &counter)
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func gcmLengths(len0, len1 uint64) [16]byte {
|
||||
return [16]byte{
|
||||
byte(len0 >> 56),
|
||||
byte(len0 >> 48),
|
||||
byte(len0 >> 40),
|
||||
byte(len0 >> 32),
|
||||
byte(len0 >> 24),
|
||||
byte(len0 >> 16),
|
||||
byte(len0 >> 8),
|
||||
byte(len0),
|
||||
byte(len1 >> 56),
|
||||
byte(len1 >> 48),
|
||||
byte(len1 >> 40),
|
||||
byte(len1 >> 32),
|
||||
byte(len1 >> 24),
|
||||
byte(len1 >> 16),
|
||||
byte(len1 >> 8),
|
||||
byte(len1),
|
||||
}
|
||||
}
|
590
sm4/gcm_ppc64x.s
Normal file
590
sm4/gcm_ppc64x.s
Normal file
@ -0,0 +1,590 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build (ppc64 || ppc64le) && !purego
|
||||
|
||||
// Portions based on CRYPTOGAMS code with the following comment:
|
||||
// # ====================================================================
|
||||
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
// # project. The module is, however, dual licensed under OpenSSL and
|
||||
// # CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
// # details see http://www.openssl.org/~appro/cryptogams/.
|
||||
// # ====================================================================
|
||||
|
||||
// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
|
||||
// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
|
||||
// from commit d47afb3c.
|
||||
|
||||
// Changes were made due to differences in the ABI and some register usage.
|
||||
// Some arguments were changed due to the way the Go code passes them.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#define XIP R3
|
||||
#define HTBL R4
|
||||
#define INP R5
|
||||
#define LEN R6
|
||||
|
||||
#define XL V0
|
||||
#define XM V1
|
||||
#define XH V2
|
||||
#define IN V3
|
||||
#define ZERO V4
|
||||
#define T0 V5
|
||||
#define T1 V6
|
||||
#define T2 V7
|
||||
#define XC2 V8
|
||||
#define H V9
|
||||
#define HH V10
|
||||
#define HL V11
|
||||
#define LEMASK V12
|
||||
#define XL1 V13
|
||||
#define XM1 V14
|
||||
#define XH1 V15
|
||||
#define IN1 V16
|
||||
#define H2 V17
|
||||
#define H2H V18
|
||||
#define H2L V19
|
||||
#define XL3 V20
|
||||
#define XM2 V21
|
||||
#define IN2 V22
|
||||
#define H3L V23
|
||||
#define H3 V24
|
||||
#define H3H V25
|
||||
#define XH3 V26
|
||||
#define XM3 V27
|
||||
#define IN3 V28
|
||||
#define H4L V29
|
||||
#define H4 V30
|
||||
#define H4H V31
|
||||
|
||||
#define IN0 IN
|
||||
#define H21L HL
|
||||
#define H21H HH
|
||||
#define LOPERM H2L
|
||||
#define HIPERM H2H
|
||||
|
||||
#define VXL VS32
|
||||
#define VIN VS35
|
||||
#define VXC2 VS40
|
||||
#define VH VS41
|
||||
#define VHH VS42
|
||||
#define VHL VS43
|
||||
#define VIN1 VS48
|
||||
#define VH2 VS49
|
||||
#define VH2H VS50
|
||||
#define VH2L VS51
|
||||
|
||||
#define VIN2 VS54
|
||||
#define VH3L VS55
|
||||
#define VH3 VS56
|
||||
#define VH3H VS57
|
||||
#define VIN3 VS60
|
||||
#define VH4L VS61
|
||||
#define VH4 VS62
|
||||
#define VH4H VS63
|
||||
|
||||
#define VIN0 VIN
|
||||
|
||||
// func gcmInit(productTable *[256]byte, h []byte)
|
||||
TEXT ·gcmInit(SB), NOSPLIT, $0-32
|
||||
MOVD productTable+0(FP), XIP
|
||||
MOVD h+8(FP), HTBL
|
||||
|
||||
MOVD $0x10, R8
|
||||
MOVD $0x20, R9
|
||||
MOVD $0x30, R10
|
||||
LXVD2X (HTBL)(R0), VH // Load H
|
||||
|
||||
VSPLTISB $-16, XC2 // 0xf0
|
||||
VSPLTISB $1, T0 // one
|
||||
VADDUBM XC2, XC2, XC2 // 0xe0
|
||||
VXOR ZERO, ZERO, ZERO
|
||||
VOR XC2, T0, XC2 // 0xe1
|
||||
VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
|
||||
VSLDOI $1, ZERO, T0, T1 // ...1
|
||||
VADDUBM XC2, XC2, XC2 // 0xc2...
|
||||
VSPLTISB $7, T2
|
||||
VOR XC2, T1, XC2 // 0xc2....01
|
||||
VSPLTB $0, H, T1 // most significant byte
|
||||
VSL H, T0, H // H<<=1
|
||||
VSRAB T1, T2, T1 // broadcast carry bit
|
||||
VAND T1, XC2, T1
|
||||
VXOR H, T1, IN // twisted H
|
||||
|
||||
VSLDOI $8, IN, IN, H // twist even more ...
|
||||
VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
|
||||
VSLDOI $8, ZERO, H, HL // ... and split
|
||||
VSLDOI $8, H, ZERO, HH
|
||||
|
||||
STXVD2X VXC2, (XIP+R0) // save pre-computed table
|
||||
STXVD2X VHL, (XIP+R8)
|
||||
MOVD $0x40, R8
|
||||
STXVD2X VH, (XIP+R9)
|
||||
MOVD $0x50, R9
|
||||
STXVD2X VHH, (XIP+R10)
|
||||
MOVD $0x60, R10
|
||||
|
||||
VPMSUMD IN, HL, XL // H.lo·H.lo
|
||||
VPMSUMD IN, H, XM // H.hi·H.lo+H.lo·H.hi
|
||||
VPMSUMD IN, HH, XH // H.hi·H.hi
|
||||
|
||||
VPMSUMD XL, XC2, T2 // 1st reduction phase
|
||||
|
||||
VSLDOI $8, XM, ZERO, T0
|
||||
VSLDOI $8, ZERO, XM, T1
|
||||
VXOR XL, T0, XL
|
||||
VXOR XH, T1, XH
|
||||
|
||||
VSLDOI $8, XL, XL, XL
|
||||
VXOR XL, T2, XL
|
||||
|
||||
VSLDOI $8, XL, XL, T1 // 2nd reduction phase
|
||||
VPMSUMD XL, XC2, XL
|
||||
VXOR T1, XH, T1
|
||||
VXOR XL, T1, IN1
|
||||
|
||||
VSLDOI $8, IN1, IN1, H2
|
||||
VSLDOI $8, ZERO, H2, H2L
|
||||
VSLDOI $8, H2, ZERO, H2H
|
||||
|
||||
STXVD2X VH2L, (XIP+R8) // save H^2
|
||||
MOVD $0x70, R8
|
||||
STXVD2X VH2, (XIP+R9)
|
||||
MOVD $0x80, R9
|
||||
STXVD2X VH2H, (XIP+R10)
|
||||
MOVD $0x90, R10
|
||||
|
||||
VPMSUMD IN, H2L, XL // H.lo·H^2.lo
|
||||
VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
|
||||
VPMSUMD IN, H2, XM // H.hi·H^2.lo+H.lo·H^2.hi
|
||||
VPMSUMD IN1, H2, XM1 // H^2.hi·H^2.lo+H^2.lo·H^2.hi
|
||||
VPMSUMD IN, H2H, XH // H.hi·H^2.hi
|
||||
VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi
|
||||
|
||||
VPMSUMD XL, XC2, T2 // 1st reduction phase
|
||||
VPMSUMD XL1, XC2, HH // 1st reduction phase
|
||||
|
||||
VSLDOI $8, XM, ZERO, T0
|
||||
VSLDOI $8, ZERO, XM, T1
|
||||
VSLDOI $8, XM1, ZERO, HL
|
||||
VSLDOI $8, ZERO, XM1, H
|
||||
VXOR XL, T0, XL
|
||||
VXOR XH, T1, XH
|
||||
VXOR XL1, HL, XL1
|
||||
VXOR XH1, H, XH1
|
||||
|
||||
VSLDOI $8, XL, XL, XL
|
||||
VSLDOI $8, XL1, XL1, XL1
|
||||
VXOR XL, T2, XL
|
||||
VXOR XL1, HH, XL1
|
||||
|
||||
VSLDOI $8, XL, XL, T1 // 2nd reduction phase
|
||||
VSLDOI $8, XL1, XL1, H // 2nd reduction phase
|
||||
VPMSUMD XL, XC2, XL
|
||||
VPMSUMD XL1, XC2, XL1
|
||||
VXOR T1, XH, T1
|
||||
VXOR H, XH1, H
|
||||
VXOR XL, T1, XL
|
||||
VXOR XL1, H, XL1
|
||||
|
||||
VSLDOI $8, XL, XL, H
|
||||
VSLDOI $8, XL1, XL1, H2
|
||||
VSLDOI $8, ZERO, H, HL
|
||||
VSLDOI $8, H, ZERO, HH
|
||||
VSLDOI $8, ZERO, H2, H2L
|
||||
VSLDOI $8, H2, ZERO, H2H
|
||||
|
||||
STXVD2X VHL, (XIP+R8) // save H^3
|
||||
MOVD $0xa0, R8
|
||||
STXVD2X VH, (XIP+R9)
|
||||
MOVD $0xb0, R9
|
||||
STXVD2X VHH, (XIP+R10)
|
||||
MOVD $0xc0, R10
|
||||
STXVD2X VH2L, (XIP+R8) // save H^4
|
||||
STXVD2X VH2, (XIP+R9)
|
||||
STXVD2X VH2H, (XIP+R10)
|
||||
|
||||
RET
|
||||
|
||||
// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
|
||||
TEXT ·gcmHash(SB), NOSPLIT, $0-64
|
||||
MOVD output+0(FP), XIP
|
||||
MOVD productTable+24(FP), HTBL
|
||||
MOVD inp+32(FP), INP
|
||||
MOVD len+56(FP), LEN
|
||||
|
||||
MOVD $0x10, R8
|
||||
MOVD $0x20, R9
|
||||
MOVD $0x30, R10
|
||||
LXVD2X (XIP)(R0), VXL // load Xi
|
||||
|
||||
LXVD2X (HTBL)(R8), VHL // load pre-computed table
|
||||
MOVD $0x40, R8
|
||||
LXVD2X (HTBL)(R9), VH
|
||||
MOVD $0x50, R9
|
||||
LXVD2X (HTBL)(R10), VHH
|
||||
MOVD $0x60, R10
|
||||
LXVD2X (HTBL)(R0), VXC2
|
||||
#ifdef GOARCH_ppc64le
|
||||
LVSL (R0)(R0), LEMASK
|
||||
VSPLTISB $0x07, T0
|
||||
VXOR LEMASK, T0, LEMASK
|
||||
VPERM XL, XL, LEMASK, XL
|
||||
#endif
|
||||
VXOR ZERO, ZERO, ZERO
|
||||
|
||||
CMPU LEN, $64
|
||||
BGE gcm_ghash_p8_4x
|
||||
|
||||
LXVD2X (INP)(R0), VIN
|
||||
ADD $16, INP, INP
|
||||
SUBCCC $16, LEN, LEN
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN, IN, LEMASK, IN
|
||||
#endif
|
||||
VXOR IN, XL, IN
|
||||
BEQ short
|
||||
|
||||
LXVD2X (HTBL)(R8), VH2L // load H^2
|
||||
MOVD $16, R8
|
||||
LXVD2X (HTBL)(R9), VH2
|
||||
ADD LEN, INP, R9 // end of input
|
||||
LXVD2X (HTBL)(R10), VH2H
|
||||
|
||||
loop_2x:
|
||||
LXVD2X (INP)(R0), VIN1
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN1, IN1, LEMASK, IN1
|
||||
#endif
|
||||
|
||||
SUBC $32, LEN, LEN
|
||||
VPMSUMD IN, H2L, XL // H^2.lo·Xi.lo
|
||||
VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
|
||||
SUBE R11, R11, R11 // borrow?-1:0
|
||||
VPMSUMD IN, H2, XM // H^2.hi·Xi.lo+H^2.lo·Xi.hi
|
||||
VPMSUMD IN1, H, XM1 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
|
||||
AND LEN, R11, R11
|
||||
VPMSUMD IN, H2H, XH // H^2.hi·Xi.hi
|
||||
VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
|
||||
ADD R11, INP, INP
|
||||
|
||||
VXOR XL, XL1, XL
|
||||
VXOR XM, XM1, XM
|
||||
|
||||
VPMSUMD XL, XC2, T2 // 1st reduction phase
|
||||
|
||||
VSLDOI $8, XM, ZERO, T0
|
||||
VSLDOI $8, ZERO, XM, T1
|
||||
VXOR XH, XH1, XH
|
||||
VXOR XL, T0, XL
|
||||
VXOR XH, T1, XH
|
||||
|
||||
VSLDOI $8, XL, XL, XL
|
||||
VXOR XL, T2, XL
|
||||
LXVD2X (INP)(R8), VIN
|
||||
ADD $32, INP, INP
|
||||
|
||||
VSLDOI $8, XL, XL, T1 // 2nd reduction phase
|
||||
VPMSUMD XL, XC2, XL
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN, IN, LEMASK, IN
|
||||
#endif
|
||||
VXOR T1, XH, T1
|
||||
VXOR IN, T1, IN
|
||||
VXOR IN, XL, IN
|
||||
CMP R9, INP
|
||||
BGT loop_2x // done yet?
|
||||
|
||||
CMPWU LEN, $0
|
||||
BNE even
|
||||
|
||||
short:
|
||||
VPMSUMD IN, HL, XL // H.lo·Xi.lo
|
||||
VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
|
||||
VPMSUMD IN, HH, XH // H.hi·Xi.hi
|
||||
|
||||
VPMSUMD XL, XC2, T2 // 1st reduction phase
|
||||
|
||||
VSLDOI $8, XM, ZERO, T0
|
||||
VSLDOI $8, ZERO, XM, T1
|
||||
VXOR XL, T0, XL
|
||||
VXOR XH, T1, XH
|
||||
|
||||
VSLDOI $8, XL, XL, XL
|
||||
VXOR XL, T2, XL
|
||||
|
||||
VSLDOI $8, XL, XL, T1 // 2nd reduction phase
|
||||
VPMSUMD XL, XC2, XL
|
||||
VXOR T1, XH, T1
|
||||
|
||||
even:
|
||||
VXOR XL, T1, XL
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM XL, XL, LEMASK, XL
|
||||
#endif
|
||||
STXVD2X VXL, (XIP+R0)
|
||||
|
||||
OR R12, R12, R12 // write out Xi
|
||||
RET
|
||||
|
||||
gcm_ghash_p8_4x:
|
||||
LVSL (R8)(R0), T0 // 0x0001..0e0f
|
||||
MOVD $0x70, R8
|
||||
LXVD2X (HTBL)(R9), VH2
|
||||
MOVD $0x80, R9
|
||||
VSPLTISB $8, T1 // 0x0808..0808
|
||||
MOVD $0x90, R10
|
||||
LXVD2X (HTBL)(R8), VH3L // load H^3
|
||||
MOVD $0xa0, R8
|
||||
LXVD2X (HTBL)(R9), VH3
|
||||
MOVD $0xb0, R9
|
||||
LXVD2X (HTBL)(R10), VH3H
|
||||
MOVD $0xc0, R10
|
||||
LXVD2X (HTBL)(R8), VH4L // load H^4
|
||||
MOVD $0x10, R8
|
||||
LXVD2X (HTBL)(R9), VH4
|
||||
MOVD $0x20, R9
|
||||
LXVD2X (HTBL)(R10), VH4H
|
||||
MOVD $0x30, R10
|
||||
|
||||
VSLDOI $8, ZERO, T1, T2 // 0x0000..0808
|
||||
VADDUBM T0, T2, HIPERM // 0x0001..1617
|
||||
VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f
|
||||
|
||||
SRD $4, LEN, LEN // this allows to use sign bit as carry
|
||||
|
||||
LXVD2X (INP)(R0), VIN0 // load input
|
||||
LXVD2X (INP)(R8), VIN1
|
||||
SUBCCC $8, LEN, LEN
|
||||
LXVD2X (INP)(R9), VIN2
|
||||
LXVD2X (INP)(R10), VIN3
|
||||
ADD $0x40, INP, INP
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN0, IN0, LEMASK, IN0
|
||||
VPERM IN1, IN1, LEMASK, IN1
|
||||
VPERM IN2, IN2, LEMASK, IN2
|
||||
VPERM IN3, IN3, LEMASK, IN3
|
||||
#endif
|
||||
|
||||
VXOR IN0, XL, XH
|
||||
|
||||
VPMSUMD IN1, H3L, XL1
|
||||
VPMSUMD IN1, H3, XM1
|
||||
VPMSUMD IN1, H3H, XH1
|
||||
|
||||
VPERM H2, H, HIPERM, H21L
|
||||
VPERM IN2, IN3, LOPERM, T0
|
||||
VPERM H2, H, LOPERM, H21H
|
||||
VPERM IN2, IN3, HIPERM, T1
|
||||
VPMSUMD IN2, H2, XM2 // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
|
||||
VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
|
||||
VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
|
||||
VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
|
||||
|
||||
VXOR XM2, XM1, XM2
|
||||
VXOR XL3, XL1, XL3
|
||||
VXOR XM3, XM2, XM3
|
||||
VXOR XH3, XH1, XH3
|
||||
|
||||
BLT tail_4x
|
||||
|
||||
loop_4x:
|
||||
LXVD2X (INP)(R0), VIN0
|
||||
LXVD2X (INP)(R8), VIN1
|
||||
SUBCCC $4, LEN, LEN
|
||||
LXVD2X (INP)(R9), VIN2
|
||||
LXVD2X (INP)(R10), VIN3
|
||||
ADD $0x40, INP, INP
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN1, IN1, LEMASK, IN1
|
||||
VPERM IN2, IN2, LEMASK, IN2
|
||||
VPERM IN3, IN3, LEMASK, IN3
|
||||
VPERM IN0, IN0, LEMASK, IN0
|
||||
#endif
|
||||
|
||||
VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
|
||||
VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
|
||||
VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
|
||||
VPMSUMD IN1, H3L, XL1
|
||||
VPMSUMD IN1, H3, XM1
|
||||
VPMSUMD IN1, H3H, XH1
|
||||
|
||||
VXOR XL, XL3, XL
|
||||
VXOR XM, XM3, XM
|
||||
VXOR XH, XH3, XH
|
||||
VPERM IN2, IN3, LOPERM, T0
|
||||
VPERM IN2, IN3, HIPERM, T1
|
||||
|
||||
VPMSUMD XL, XC2, T2 // 1st reduction phase
|
||||
VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
|
||||
VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
|
||||
|
||||
VSLDOI $8, XM, ZERO, T0
|
||||
VSLDOI $8, ZERO, XM, T1
|
||||
VXOR XL, T0, XL
|
||||
VXOR XH, T1, XH
|
||||
|
||||
VSLDOI $8, XL, XL, XL
|
||||
VXOR XL, T2, XL
|
||||
|
||||
VSLDOI $8, XL, XL, T1 // 2nd reduction phase
|
||||
VPMSUMD IN2, H2, XM2 // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
|
||||
VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
|
||||
VPMSUMD XL, XC2, XL
|
||||
|
||||
VXOR XL3, XL1, XL3
|
||||
VXOR XH3, XH1, XH3
|
||||
VXOR XH, IN0, XH
|
||||
VXOR XM2, XM1, XM2
|
||||
VXOR XH, T1, XH
|
||||
VXOR XM3, XM2, XM3
|
||||
VXOR XH, XL, XH
|
||||
BGE loop_4x
|
||||
|
||||
tail_4x:
|
||||
VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
|
||||
VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
|
||||
VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
|
||||
|
||||
VXOR XL, XL3, XL
|
||||
VXOR XM, XM3, XM
|
||||
|
||||
VPMSUMD XL, XC2, T2 // 1st reduction phase
|
||||
|
||||
VSLDOI $8, XM, ZERO, T0
|
||||
VSLDOI $8, ZERO, XM, T1
|
||||
VXOR XH, XH3, XH
|
||||
VXOR XL, T0, XL
|
||||
VXOR XH, T1, XH
|
||||
|
||||
VSLDOI $8, XL, XL, XL
|
||||
VXOR XL, T2, XL
|
||||
|
||||
VSLDOI $8, XL, XL, T1 // 2nd reduction phase
|
||||
VPMSUMD XL, XC2, XL
|
||||
VXOR T1, XH, T1
|
||||
VXOR XL, T1, XL
|
||||
|
||||
ADDCCC $4, LEN, LEN
|
||||
BEQ done_4x
|
||||
|
||||
LXVD2X (INP)(R0), VIN0
|
||||
CMPU LEN, $2
|
||||
MOVD $-4, LEN
|
||||
BLT one
|
||||
LXVD2X (INP)(R8), VIN1
|
||||
BEQ two
|
||||
|
||||
three:
|
||||
LXVD2X (INP)(R9), VIN2
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN0, IN0, LEMASK, IN0
|
||||
VPERM IN1, IN1, LEMASK, IN1
|
||||
VPERM IN2, IN2, LEMASK, IN2
|
||||
#endif
|
||||
|
||||
VXOR IN0, XL, XH
|
||||
VOR H3L, H3L, H4L
|
||||
VOR H3, H3, H4
|
||||
VOR H3H, H3H, H4H
|
||||
|
||||
VPERM IN1, IN2, LOPERM, T0
|
||||
VPERM IN1, IN2, HIPERM, T1
|
||||
VPMSUMD IN1, H2, XM2 // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
|
||||
VPMSUMD IN2, H, XM3 // H.hi·Xi+2.lo +H.lo·Xi+2.hi
|
||||
VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
|
||||
VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
|
||||
|
||||
VXOR XM3, XM2, XM3
|
||||
JMP tail_4x
|
||||
|
||||
two:
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN0, IN0, LEMASK, IN0
|
||||
VPERM IN1, IN1, LEMASK, IN1
|
||||
#endif
|
||||
|
||||
VXOR IN, XL, XH
|
||||
VPERM ZERO, IN1, LOPERM, T0
|
||||
VPERM ZERO, IN1, HIPERM, T1
|
||||
|
||||
VSLDOI $8, ZERO, H2, H4L
|
||||
VOR H2, H2, H4
|
||||
VSLDOI $8, H2, ZERO, H4H
|
||||
|
||||
VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
|
||||
VPMSUMD IN1, H, XM3 // H.hi·Xi+1.lo+H.lo·Xi+2.hi
|
||||
VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi
|
||||
|
||||
JMP tail_4x
|
||||
|
||||
one:
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM IN0, IN0, LEMASK, IN0
|
||||
#endif
|
||||
|
||||
VSLDOI $8, ZERO, H, H4L
|
||||
VOR H, H, H4
|
||||
VSLDOI $8, H, ZERO, H4H
|
||||
|
||||
VXOR IN0, XL, XH
|
||||
VXOR XL3, XL3, XL3
|
||||
VXOR XM3, XM3, XM3
|
||||
VXOR XH3, XH3, XH3
|
||||
|
||||
JMP tail_4x
|
||||
|
||||
done_4x:
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM XL, XL, LEMASK, XL
|
||||
#endif
|
||||
STXVD2X VXL, (XIP+R0) // write out Xi
|
||||
RET
|
||||
|
||||
// func gcmMul(output []byte, productTable *[256]byte)
|
||||
TEXT ·gcmMul(SB), NOSPLIT, $0-32
|
||||
MOVD output+0(FP), XIP
|
||||
MOVD productTable+24(FP), HTBL
|
||||
|
||||
MOVD $0x10, R8
|
||||
MOVD $0x20, R9
|
||||
MOVD $0x30, R10
|
||||
LXVD2X (XIP)(R0), VIN // load Xi
|
||||
|
||||
LXVD2X (HTBL)(R8), VHL // Load pre-computed table
|
||||
LXVD2X (HTBL)(R9), VH
|
||||
LXVD2X (HTBL)(R10), VHH
|
||||
LXVD2X (HTBL)(R0), VXC2
|
||||
#ifdef GOARCH_ppc64le
|
||||
VSPLTISB $0x07, T0
|
||||
VXOR LEMASK, T0, LEMASK
|
||||
VPERM IN, IN, LEMASK, IN
|
||||
#endif
|
||||
VXOR ZERO, ZERO, ZERO
|
||||
|
||||
VPMSUMD IN, HL, XL // H.lo·Xi.lo
|
||||
VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
|
||||
VPMSUMD IN, HH, XH // H.hi·Xi.hi
|
||||
|
||||
VPMSUMD XL, XC2, T2 // 1st reduction phase
|
||||
|
||||
VSLDOI $8, XM, ZERO, T0
|
||||
VSLDOI $8, ZERO, XM, T1
|
||||
VXOR XL, T0, XL
|
||||
VXOR XH, T1, XH
|
||||
|
||||
VSLDOI $8, XL, XL, XL
|
||||
VXOR XL, T2, XL
|
||||
|
||||
VSLDOI $8, XL, XL, T1 // 2nd reduction phase
|
||||
VPMSUMD XL, XC2, XL
|
||||
VXOR T1, XH, T1
|
||||
VXOR XL, T1, XL
|
||||
|
||||
#ifdef GOARCH_ppc64le
|
||||
VPERM XL, XL, LEMASK, XL
|
||||
#endif
|
||||
STXVD2X VXL, (XIP+R0) // write out Xi
|
||||
RET
|
Loading…
x
Reference in New Issue
Block a user