internal/chacha8rand: provide vector implementation for riscv64

Provide a vector implementation of chacha8rand for riscv64,
which improves performance.

goos: linux
goarch: riscv64
pkg: internal/chacha8rand
cpu: Spacemit(R) X60
      │ /root/chacha8.rand.old.log │     /root/chacha8.rand.new.log      │
      │           sec/op           │   sec/op     vs base                │
Block                  1.640µ ± 0%   1.294µ ± 0%  -21.10% (p=0.000 n=10)

      │ /root/chacha8.rand.old.log │      /root/chacha8.rand.new.log      │
      │            B/s             │     B/s       vs base                │
Block                 148.9Mi ± 0%   188.6Mi ± 0%  +26.72% (p=0.000 n=10)

Change-Id: I1e04c5c44e5ce0c78814a6a48c5ab65e4d758937
Reviewed-on: https://go-review.googlesource.com/c/go/+/710035
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Meng Zhuo 2025-10-03 18:36:01 +08:00
parent 54e3adc533
commit 6f04a92be3
3 changed files with 115 additions and 1 deletions

View file

@ -16,6 +16,7 @@ import (
// Offsets into internal/cpu records for use in assembly.
//
// Each constant is the byte offset, computed with unsafe.Offsetof, of one
// feature-flag field inside the corresponding internal/cpu record. The
// riscv64 assembly reads the flag at RISCV64+const_offsetRISCV64HasV to
// decide at run time whether the vector code path may be used.
const (
// Offset of the HasLSX field within cpu.Loong64.
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
// Offset of the HasV field within cpu.RISCV64.
offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV)
)
const (

View file

@ -0,0 +1,113 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"
// TODO(mzh): use Zvkb if possible
// (presumably so that each shift-left/shift-right/xor triple below can
// become a single vector rotate instruction; confirm against the Zvkb spec).
//
// QR(A, B, C, D) performs one ChaCha quarter round on the vector registers
// A, B, C and D, element-wise across all active lanes:
//
//	A += B; D ^= A; D = rotl(D, 16)
//	C += D; B ^= C; B = rotl(B, 12)
//	A += B; D ^= A; D = rotl(D, 8)
//	C += D; B ^= C; B = rotl(B, 7)
//
// Each rotate-left by n is built from a left shift by n into a scratch
// register, a right shift by 32-n in place, and an XOR of the two, so the
// macro is only correct for 32-bit elements (the caller selects E32).
// V28, V29, V30 and V31 are clobbered as scratch registers, so none of the
// macro arguments may be one of them.
//
// Comments must stay above the #define: a // comment on a continued line
// would swallow the trailing backslash.
#define QR(A, B, C, D) \
VADDVV A, B, A \
VXORVV D, A, D \
VSLLVI $16, D, V28 \
VSRLVI $16, D, D \
VXORVV V28, D, D \
VADDVV D, C, C \
VXORVV C, B, B \
VSLLVI $12, B, V29 \
VSRLVI $20, B, B \
VXORVV V29, B, B \
VADDVV B, A, A \
VXORVV A, D, D \
VSLLVI $8, D, V30 \
VSRLVI $24, D, D \
VXORVV V30, D, D \
VADDVV D, C, C \
VXORVV C, B, B \
VSLLVI $7, B, V31 \
VSRLVI $25, B, B \
VXORVV V31, B, B
// block runs four ChaCha8 block transformations using four elements in each V register.
//
// func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
//
// The sixteen 32-bit state words live in V0-V15, one register per word, and
// lane i of every register belongs to block i. All four blocks share the
// constants and the eight seed words; they differ only in the counter word,
// which is counter+0, counter+1, counter+2 and counter+3. Words 13-15 are zero.
//
// The result is stored word-major: blocks[w][i] receives word w of block i.
//
// Register use:
//
//	X10      seed pointer
//	X11      blocks pointer (advanced while storing)
//	X12      counter
//	X13      scratch for the CPU feature flag
//	X14      address of chachaConst
//	X15      loop counter
//	V0-V15   working state
//	V20-V27  saved copy of the seed rows V4-V11
//	V28-V31  scratch, clobbered by QR
TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
	// seed in X10
	// blocks in X11
	// counter in X12

#ifndef hasV
	// The vector extension is not known to be present at build time
	// (hasV is presumably defined by asm_riscv64.h when it is), so test
	// the run-time feature flag and use the generic code without it.
	// The package path internal/cpu must be spelled with the division
	// slash U+2215, which the assembler rewrites to a plain slash.
	MOVB	internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X13
	BNEZ	X13, vector_chacha8
	JMP	·block_generic<ABIInternal>(SB)
#endif

vector_chacha8:
	// At least VLEN >= 128, so four 32-bit elements fit in one register.
	VSETIVLI	$4, E32, M1, TA, MA, X0

	// Load initial constants into top row.
	// The stride register is X0 (zero), so every lane reads the same
	// words and each value ends up replicated across its register.
	MOV	$·chachaConst(SB), X14
	VLSSEG4E32V	(X14), X0, V0	// V0, V1, V2, V3 = const row
	VLSSEG8E32V	(X10), X0, V4	// V4 ... V11, seed

	// Lane index 0..3 plus the scalar counter gives one counter per block.
	VIDV	V12
	VADDVX	X12, V12, V12	// counter

	// Clear all nonces.
	VXORVV	V13, V13, V13
	VXORVV	V14, V14, V14
	VXORVV	V15, V15, V15

	// Copy initial state.
	// Each whole-register move copies four registers:
	// V4-V7 to V20-V23 and V8-V11 to V24-V27.
	VMV4RV	V4, V20
	VMV4RV	V8, V24

	// Four iterations of a column round followed by a diagonal round
	// give the eight rounds of ChaCha8.
	MOV	$4, X15
	PCALIGN	$16
loop:
	// Column round.
	QR(V0, V4, V8, V12)
	QR(V1, V5, V9, V13)
	QR(V2, V6, V10, V14)
	QR(V3, V7, V11, V15)

	// Diagonal round.
	QR(V0, V5, V10, V15)
	QR(V1, V6, V11, V12)
	QR(V2, V7, V8, V13)
	QR(V3, V4, V9, V14)

	SUB	$1, X15
	BNEZ	X15, loop

	// Add the saved seed words back into rows 4-11. Rows 0-3 and 12-15
	// are stored exactly as the rounds left them.
	VADDVV	V20, V4, V4
	VADDVV	V21, V5, V5
	VADDVV	V22, V6, V6
	VADDVV	V23, V7, V7
	VADDVV	V24, V8, V8
	VADDVV	V25, V9, V9
	VADDVV	V26, V10, V10
	VADDVV	V27, V11, V11

	// Store the sixteen rows back to back, 16 bytes (four lanes) each.
	VSE32V	V0, (X11); ADD $16, X11
	VSE32V	V1, (X11); ADD $16, X11
	VSE32V	V2, (X11); ADD $16, X11
	VSE32V	V3, (X11); ADD $16, X11
	VSE32V	V4, (X11); ADD $16, X11
	VSE32V	V5, (X11); ADD $16, X11
	VSE32V	V6, (X11); ADD $16, X11
	VSE32V	V7, (X11); ADD $16, X11
	VSE32V	V8, (X11); ADD $16, X11
	VSE32V	V9, (X11); ADD $16, X11
	VSE32V	V10, (X11); ADD $16, X11
	VSE32V	V11, (X11); ADD $16, X11
	VSE32V	V12, (X11); ADD $16, X11
	VSE32V	V13, (X11); ADD $16, X11
	VSE32V	V14, (X11); ADD $16, X11
	// Last row: X11 is not used again, so it is not advanced.
	VSE32V	V15, (X11)
	RET
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 && !arm64 && !loong64
//go:build !amd64 && !arm64 && !loong64 && !riscv64
#include "textflag.h"