mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
cmd/internal/obj/loong64: fix the usage of offset in the instructions [X]VLDREPL.{B/H/W/D}
The previously defined usage of offset was ambiguous and not easy to understand. For example, to fetch 4 bytes of data from the address base+8 and broadcast it to each word element of vector register V5, the assembly implementation is as follows: previous: VMOVQ 2(base), V5.W4 current: VMOVQ 8(base), V5.W4 Change-Id: I8bc84e35033ab63bd10f4c61618789f94314f78c Reviewed-on: https://go-review.googlesource.com/c/go/+/699875 Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
8c27a80890
commit
b8cc907425
4 changed files with 75 additions and 17 deletions
28
src/cmd/asm/internal/asm/testdata/loong64enc1.s
vendored
28
src/cmd/asm/internal/asm/testdata/loong64enc1.s
vendored
|
|
@ -538,13 +538,29 @@ lable2:
|
||||||
|
|
||||||
// Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
|
// Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
|
||||||
VMOVQ (R4), V0.B16 // 80008030
|
VMOVQ (R4), V0.B16 // 80008030
|
||||||
VMOVQ 1(R4), V1.H8 // 81044030
|
VMOVQ 1(R4), V0.B16 // 80048030
|
||||||
VMOVQ 2(R4), V2.W4 // 82082030
|
VMOVQ -3(R4), V0.B16 // 80f4bf30
|
||||||
VMOVQ 3(R4), V3.V2 // 830c1030
|
VMOVQ (R4), V1.H8 // 81004030
|
||||||
|
VMOVQ 2(R4), V1.H8 // 81044030
|
||||||
|
VMOVQ -6(R4), V1.H8 // 81f45f30
|
||||||
|
VMOVQ (R4), V2.W4 // 82002030
|
||||||
|
VMOVQ 8(R4), V2.W4 // 82082030
|
||||||
|
VMOVQ -12(R4), V2.W4 // 82f42f30
|
||||||
|
VMOVQ (R4), V3.V2 // 83001030
|
||||||
|
VMOVQ 24(R4), V3.V2 // 830c1030
|
||||||
|
VMOVQ -16(R4), V3.V2 // 83f81730
|
||||||
XVMOVQ (R4), X0.B32 // 80008032
|
XVMOVQ (R4), X0.B32 // 80008032
|
||||||
XVMOVQ 1(R4), X1.H16 // 81044032
|
XVMOVQ 1(R4), X0.B32 // 80048032
|
||||||
XVMOVQ 2(R4), X2.W8 // 82082032
|
XVMOVQ -5(R4), X0.B32 // 80ecbf32
|
||||||
XVMOVQ 3(R4), X3.V4 // 830c1032
|
XVMOVQ (R4), X1.H16 // 81004032
|
||||||
|
XVMOVQ 2(R4), X1.H16 // 81044032
|
||||||
|
XVMOVQ -10(R4), X1.H16 // 81ec5f32
|
||||||
|
XVMOVQ (R4), X2.W8 // 82002032
|
||||||
|
XVMOVQ 8(R4), X2.W8 // 82082032
|
||||||
|
XVMOVQ -20(R4), X2.W8 // 82ec2f32
|
||||||
|
XVMOVQ (R4), X3.V4 // 83001032
|
||||||
|
XVMOVQ 24(R4), X3.V4 // 830c1032
|
||||||
|
XVMOVQ -24(R4), X3.V4 // 83f41732
|
||||||
|
|
||||||
// VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
|
// VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
|
||||||
VSEQB V1, V2, V3 // 43040070
|
VSEQB V1, V2, V3 // 43040070
|
||||||
|
|
|
||||||
|
|
@ -1983,6 +1983,18 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
|
||||||
return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
|
return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func OP_11IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
|
||||||
|
return op | (i&0x7FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
|
||||||
|
}
|
||||||
|
|
||||||
|
func OP_10IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
|
||||||
|
return op | (i&0x3FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
|
||||||
|
}
|
||||||
|
|
||||||
|
func OP_9IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
|
||||||
|
return op | (i&0x1FF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
|
||||||
|
}
|
||||||
|
|
||||||
func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
|
func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
|
||||||
return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
|
return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
|
||||||
}
|
}
|
||||||
|
|
@ -2535,7 +2547,28 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
|
||||||
si := c.regoff(&p.From)
|
si := c.regoff(&p.From)
|
||||||
Rj := uint32(p.From.Reg & EXT_REG_MASK)
|
Rj := uint32(p.From.Reg & EXT_REG_MASK)
|
||||||
Vd := uint32(p.To.Reg & EXT_REG_MASK)
|
Vd := uint32(p.To.Reg & EXT_REG_MASK)
|
||||||
o1 = v | uint32(si<<10) | (Rj << 5) | Vd
|
switch v & 0xc00000 {
|
||||||
|
case 0x800000: // [x]vldrepl.b
|
||||||
|
o1 = OP_12IRR(v, uint32(si), Rj, Vd)
|
||||||
|
case 0x400000: // [x]vldrepl.h
|
||||||
|
if si&1 != 0 {
|
||||||
|
c.ctxt.Diag("%v: offset must be a multiple of 2.\n", p)
|
||||||
|
}
|
||||||
|
o1 = OP_11IRR(v, uint32(si>>1), Rj, Vd)
|
||||||
|
case 0x0:
|
||||||
|
switch v & 0x300000 {
|
||||||
|
case 0x200000: // [x]vldrepl.w
|
||||||
|
if si&3 != 0 {
|
||||||
|
c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p)
|
||||||
|
}
|
||||||
|
o1 = OP_10IRR(v, uint32(si>>2), Rj, Vd)
|
||||||
|
case 0x100000: // [x]vldrepl.d
|
||||||
|
if si&7 != 0 {
|
||||||
|
c.ctxt.Diag("%v: offset must be a multiple of 8.\n", p)
|
||||||
|
}
|
||||||
|
o1 = OP_9IRR(v, uint32(si>>3), Rj, Vd)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
case 47: // preld offset(Rbase), $hint
|
case 47: // preld offset(Rbase), $hint
|
||||||
offs := c.regoff(&p.From)
|
offs := c.regoff(&p.From)
|
||||||
|
|
|
||||||
|
|
@ -220,6 +220,15 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
|
||||||
XVMOVQ offset(Rj), Xd.W8 | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
|
XVMOVQ offset(Rj), Xd.W8 | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
|
||||||
XVMOVQ offset(Rj), Xd.V4 | xvldrepl.d Xd, Rj, si9 | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
|
XVMOVQ offset(Rj), Xd.V4 | xvldrepl.d Xd, Rj, si9 | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
|
||||||
|
|
||||||
|
Note: In Go assembly, for ease of understanding, the offset represents the actual address offset in bytes.
|
||||||
|
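However, during platform encoding, the offset is shifted right by 0, 1, 2 or 3 bits (for B, H, W and V elements respectively) to increase the encodable offset range, so it must be a multiple of the element size, as follows: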
|
||||||
|
|
||||||
|
Go assembly | platform assembly
|
||||||
|
VMOVQ 1(R4), V5.B16 | vldrepl.b v5, r4, $1
|
||||||
|
VMOVQ 2(R4), V5.H8 | vldrepl.h v5, r4, $1
|
||||||
|
VMOVQ 8(R4), V5.W4 | vldrepl.w v5, r4, $2
|
||||||
|
VMOVQ 8(R4), V5.V2 | vldrepl.d v5, r4, $1
|
||||||
|
|
||||||
# Special instruction encoding definition and description on LoongArch
|
# Special instruction encoding definition and description on LoongArch
|
||||||
|
|
||||||
1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
|
1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
|
||||||
|
|
|
||||||
|
|
@ -50,22 +50,22 @@ lsx_chacha8:
|
||||||
|
|
||||||
// load constants
|
// load constants
|
||||||
VMOVQ (R10), V0.W4
|
VMOVQ (R10), V0.W4
|
||||||
VMOVQ 1(R10), V1.W4
|
VMOVQ 4(R10), V1.W4
|
||||||
VMOVQ 2(R10), V2.W4
|
VMOVQ 8(R10), V2.W4
|
||||||
VMOVQ 3(R10), V3.W4
|
VMOVQ 12(R10), V3.W4
|
||||||
|
|
||||||
// load 4-32bit data from incRotMatrix added to counter
|
// load 4-32bit data from incRotMatrix added to counter
|
||||||
VMOVQ (R11), V30
|
VMOVQ (R11), V30
|
||||||
|
|
||||||
// load seed
|
// load seed
|
||||||
VMOVQ (R4), V4.W4
|
VMOVQ (R4), V4.W4
|
||||||
VMOVQ 1(R4), V5.W4
|
VMOVQ 4(R4), V5.W4
|
||||||
VMOVQ 2(R4), V6.W4
|
VMOVQ 8(R4), V6.W4
|
||||||
VMOVQ 3(R4), V7.W4
|
VMOVQ 12(R4), V7.W4
|
||||||
VMOVQ 4(R4), V8.W4
|
VMOVQ 16(R4), V8.W4
|
||||||
VMOVQ 5(R4), V9.W4
|
VMOVQ 20(R4), V9.W4
|
||||||
VMOVQ 6(R4), V10.W4
|
VMOVQ 24(R4), V10.W4
|
||||||
VMOVQ 7(R4), V11.W4
|
VMOVQ 28(R4), V11.W4
|
||||||
|
|
||||||
// load counter and update counter
|
// load counter and update counter
|
||||||
VMOVQ R6, V12.W4
|
VMOVQ R6, V12.W4
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue