cmd/internal/obj/loong64: fix the usage of offset in the instructions [X]VLDREPL.{B/H/W/D}

The previously defined usage of offset was ambiguous and not easy to understand.
For example, to fetch 4 bytes of data from the address base+8 and
broadcast it to each word element of vector register V5, the assembly
implementation is as follows:
	previous: VMOVQ 2(base), V5.W4
	current:  VMOVQ 8(base), V5.W4

Change-Id: I8bc84e35033ab63bd10f4c61618789f94314f78c
Reviewed-on: https://go-review.googlesource.com/c/go/+/699875
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Xiaolin Zhao 2025-08-29 16:20:16 +08:00 committed by Gopher Robot
parent 8c27a80890
commit b8cc907425
4 changed files with 75 additions and 17 deletions

View file

@ -538,13 +538,29 @@ lable2:
// Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
VMOVQ (R4), V0.B16 // 80008030
VMOVQ 1(R4), V1.H8 // 81044030
VMOVQ 2(R4), V2.W4 // 82082030
VMOVQ 3(R4), V3.V2 // 830c1030
VMOVQ 1(R4), V0.B16 // 80048030
VMOVQ -3(R4), V0.B16 // 80f4bf30
VMOVQ (R4), V1.H8 // 81004030
VMOVQ 2(R4), V1.H8 // 81044030
VMOVQ -6(R4), V1.H8 // 81f45f30
VMOVQ (R4), V2.W4 // 82002030
VMOVQ 8(R4), V2.W4 // 82082030
VMOVQ -12(R4), V2.W4 // 82f42f30
VMOVQ (R4), V3.V2 // 83001030
VMOVQ 24(R4), V3.V2 // 830c1030
VMOVQ -16(R4), V3.V2 // 83f81730
XVMOVQ (R4), X0.B32 // 80008032
XVMOVQ 1(R4), X1.H16 // 81044032
XVMOVQ 2(R4), X2.W8 // 82082032
XVMOVQ 3(R4), X3.V4 // 830c1032
XVMOVQ 1(R4), X0.B32 // 80048032
XVMOVQ -5(R4), X0.B32 // 80ecbf32
XVMOVQ (R4), X1.H16 // 81004032
XVMOVQ 2(R4), X1.H16 // 81044032
XVMOVQ -10(R4), X1.H16 // 81ec5f32
XVMOVQ (R4), X2.W8 // 82002032
XVMOVQ 8(R4), X2.W8 // 82082032
XVMOVQ -20(R4), X2.W8 // 82ec2f32
XVMOVQ (R4), X3.V4 // 83001032
XVMOVQ 24(R4), X3.V4 // 830c1032
XVMOVQ -24(R4), X3.V4 // 83f41732
// VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
VSEQB V1, V2, V3 // 43040070

View file

@ -1983,6 +1983,18 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
}
// OP_11IRR builds an instruction word from the base opcode op, an 11-bit
// immediate and two 5-bit register numbers.
//
// The low 11 bits of i are placed at bits 10-20, the low 5 bits of r2 at
// bits 5-9, and the low 5 bits of r3 at bits 0-4. Any higher bits of i, r2
// and r3 are discarded before the fields are merged into op.
func OP_11IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
	immField := (i & 0x7FF) << 10
	r2Field := (r2 & 0x1F) << 5
	r3Field := r3 & 0x1F
	return op | immField | r2Field | r3Field
}
// OP_10IRR builds an instruction word from the base opcode op, a 10-bit
// immediate and two 5-bit register numbers.
//
// The low 10 bits of i are placed at bits 10-19, the low 5 bits of r2 at
// bits 5-9, and the low 5 bits of r3 at bits 0-4. Any higher bits of i, r2
// and r3 are discarded before the fields are merged into op.
func OP_10IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
	immField := (i & 0x3FF) << 10
	r2Field := (r2 & 0x1F) << 5
	r3Field := r3 & 0x1F
	return op | immField | r2Field | r3Field
}
// OP_9IRR builds an instruction word from the base opcode op, a 9-bit
// immediate and two 5-bit register numbers.
//
// The low 9 bits of i are placed at bits 10-18, the low 5 bits of r2 at
// bits 5-9, and the low 5 bits of r3 at bits 0-4. Any higher bits of i, r2
// and r3 are discarded before the fields are merged into op.
func OP_9IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
	immField := (i & 0x1FF) << 10
	r2Field := (r2 & 0x1F) << 5
	r3Field := r3 & 0x1F
	return op | immField | r2Field | r3Field
}
// OP_8IRR builds an instruction word from the base opcode op, an 8-bit
// immediate and two 5-bit register numbers.
//
// The low 8 bits of i are placed at bits 10-17, the low 5 bits of r2 at
// bits 5-9, and the low 5 bits of r3 at bits 0-4. Any higher bits of i, r2
// and r3 are discarded before the fields are merged into op.
func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
	immField := (i & 0xFF) << 10
	r2Field := (r2 & 0x1F) << 5
	r3Field := r3 & 0x1F
	return op | immField | r2Field | r3Field
}
@ -2535,7 +2547,28 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
si := c.regoff(&p.From)
Rj := uint32(p.From.Reg & EXT_REG_MASK)
Vd := uint32(p.To.Reg & EXT_REG_MASK)
o1 = v | uint32(si<<10) | (Rj << 5) | Vd
switch v & 0xc00000 {
case 0x800000: // [x]vldrepl.b
o1 = OP_12IRR(v, uint32(si), Rj, Vd)
case 0x400000: // [x]vldrepl.h
if si&1 != 0 {
c.ctxt.Diag("%v: offset must be a multiple of 2.\n", p)
}
o1 = OP_11IRR(v, uint32(si>>1), Rj, Vd)
case 0x0:
switch v & 0x300000 {
case 0x200000: // [x]vldrepl.w
if si&3 != 0 {
c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p)
}
o1 = OP_10IRR(v, uint32(si>>2), Rj, Vd)
case 0x100000: // [x]vldrepl.d
if si&7 != 0 {
c.ctxt.Diag("%v: offset must be a multiple of 8.\n", p)
}
o1 = OP_9IRR(v, uint32(si>>3), Rj, Vd)
}
}
case 47: // preld offset(Rbase), $hint
offs := c.regoff(&p.From)

View file

@ -220,6 +220,15 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
XVMOVQ offset(Rj), Xd.W8 | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
XVMOVQ offset(Rj), Xd.V4 | xvldrepl.d Xd, Rj, si9 | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
note: In Go assembly, for ease of understanding, offset represents the actual address offset in bytes.
However, when the instruction is encoded, the offset is shifted right according to the element size
(by 0, 1, 2 or 3 bits for B, H, W or V elements) to increase the encodable offset range, as follows:
Go assembly | platform assembly
VMOVQ 1(R4), V5.B16 | vldrepl.b v5, r4, $1
VMOVQ 2(R4), V5.H8 | vldrepl.h v5, r4, $1
VMOVQ 8(R4), V5.W4 | vldrepl.w v5, r4, $2
VMOVQ 8(R4), V5.V2 | vldrepl.d v5, r4, $1
# Special instruction encoding definition and description on LoongArch
1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased

View file

@ -50,22 +50,22 @@ lsx_chacha8:
// load constants
VMOVQ (R10), V0.W4
VMOVQ 1(R10), V1.W4
VMOVQ 2(R10), V2.W4
VMOVQ 3(R10), V3.W4
VMOVQ 4(R10), V1.W4
VMOVQ 8(R10), V2.W4
VMOVQ 12(R10), V3.W4
// load 4-32bit data from incRotMatrix added to counter
VMOVQ (R11), V30
// load seed
VMOVQ (R4), V4.W4
VMOVQ 1(R4), V5.W4
VMOVQ 2(R4), V6.W4
VMOVQ 3(R4), V7.W4
VMOVQ 4(R4), V8.W4
VMOVQ 5(R4), V9.W4
VMOVQ 6(R4), V10.W4
VMOVQ 7(R4), V11.W4
VMOVQ 4(R4), V5.W4
VMOVQ 8(R4), V6.W4
VMOVQ 12(R4), V7.W4
VMOVQ 16(R4), V8.W4
VMOVQ 20(R4), V9.W4
VMOVQ 24(R4), V10.W4
VMOVQ 28(R4), V11.W4
// load counter and update counter
VMOVQ R6, V12.W4