// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package amd64

import (
	"fmt"
	"math"

	"cmd/compile/internal/base"
	"cmd/compile/internal/ir"
	"cmd/compile/internal/logopt"
	"cmd/compile/internal/objw"
	"cmd/compile/internal/ssa"
	"cmd/compile/internal/ssagen"
	"cmd/compile/internal/types"
	"cmd/internal/obj"
	"cmd/internal/obj/x86"
	"internal/abi"
	"internal/buildcfg"
)

// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
	flive := b.FlagsLiveAtEnd
	for _, c := range b.ControlValues() {
		flive = c.Type.IsFlags() || flive
	}
	for i := len(b.Values) - 1; i >= 0; i-- {
		v := b.Values[i]
		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
			// The "mark" is any non-nil Aux value.
			v.Aux = ssa.AuxMark
		}
		if v.Type.IsFlags() {
			flive = false
		}
		for _, a := range v.Args {
			if a.Type.IsFlags() {
				flive = true
			}
		}
	}
}

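// isFPReg reports whether r is a vector (X/Y/Z) register.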
func isFPReg(r int16) bool {
	return x86.REG_X0 <= r && r <= x86.REG_Z31
}

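// isKReg reports whether r is an AVX-512 mask (K) register.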
func isKReg(r int16) bool {
	return x86.REG_K0 <= r && r <= x86.REG_K7
}

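// isLowFPReg reports whether r is one of the low 16 vector registers (X0 through X15).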
func isLowFPReg(r int16) bool {
	return x86.REG_X0 <= r && r <= x86.REG_X15
}

// loadByRegWidth returns the load instruction for the given register and width.
func loadByRegWidth(r int16, width int64) obj.As {
	// Avoid partial register write for GPR
	if !isFPReg(r) && !isKReg(r) {
		switch width {
		case 1:
			return x86.AMOVBLZX
		case 2:
			return x86.AMOVWLZX
		}
	}
	// Otherwise, there's no difference between load and store opcodes.
	return storeByRegWidth(r, width)
}

// storeByRegWidth returns the store instruction for the given register and width.
// It's also used for loading a constant into a register.
func storeByRegWidth(r int16, width int64) obj.As {
	if isFPReg(r) {
		switch width {
		case 4:
			return x86.AMOVSS
		case 8:
			return x86.AMOVSD
		case 16:
			// int128s are in SSE registers
			if isLowFPReg(r) {
				return x86.AMOVUPS
			} else {
				return x86.AVMOVDQU
			}
		case 32:
			return x86.AVMOVDQU
		case 64:
			return x86.AVMOVDQU64
		}
	}
	if isKReg(r) {
		return x86.AKMOVQ
	}
	// gp
	switch width {
	case 1:
		return x86.AMOVB
	case 2:
		return x86.AMOVW
	case 4:
		return x86.AMOVL
	case 8:
		return x86.AMOVQ
	}
	panic(fmt.Sprintf("bad store reg=%v, width=%d", r, width))
}

// moveByRegsWidth returns the reg->reg move instruction for the given dest/src registers and width.
func moveByRegsWidth(dest, src int16, width int64) obj.As {
	// fp -> fp
	if isFPReg(dest) && isFPReg(src) {
		// Moving the whole sse2 register is faster
		// than moving just the correct low portion of it.
		// There is no xmm->xmm move with 1 byte opcode,
		// so use movups, which has 2 byte opcode.
		if isLowFPReg(dest) && isLowFPReg(src) && width <= 16 {
			return x86.AMOVUPS
		}
		if width <= 32 {
			return x86.AVMOVDQU
		}
		return x86.AVMOVDQU64
	}
	// k -> gp, gp -> k, k -> k
	if isKReg(dest) || isKReg(src) {
		if isFPReg(dest) || isFPReg(src) {
			panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
		}
		return x86.AKMOVQ
	}
	// gp -> fp, fp -> gp, gp -> gp
	switch width {
	case 1:
		// Avoids partial register write
		return x86.AMOVL
	case 2:
		return x86.AMOVL
	case 4:
		return x86.AMOVL
	case 8:
		return x86.AMOVQ
	case 16:
		if isLowFPReg(dest) && isLowFPReg(src) {
			// int128s are in SSE registers
			return x86.AMOVUPS
		} else {
			return x86.AVMOVDQU
		}
	case 32:
		return x86.AVMOVDQU
	case 64:
		return x86.AVMOVDQU64
	}
	panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
}

// opregreg emits instructions for
//
//	dest := dest(To) op src(From)
//
// and also returns the created obj.Prog so it
// may be further adjusted (offset, scale, etc).
func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog {
	p := s.Prog(op)
	p.From.Type = obj.TYPE_REG
	p.To.Type = obj.TYPE_REG
	p.To.Reg = dest
	p.From.Reg = src
	return p
}

// memIdx fills out a as an indexed memory reference for v.
// It assumes that the base register and the index register
// are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
// The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
func memIdx(a *obj.Addr, v *ssa.Value) {
	r, i := v.Args[0].Reg(), v.Args[1].Reg()
	a.Type = obj.TYPE_MEM
	a.Scale = v.Op.Scale()
	if a.Scale == 1 && i == x86.REG_SP {
		r, i = i, r
	}
	a.Reg = r
	a.Index = i
}

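// getgFromTLS loads the g pointer from thread-local storage into register r.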
func getgFromTLS(s *ssagen.State, r int16) {
	// See the comments in cmd/internal/obj/x86/obj6.go
	// near CanUse1InsnTLS for a detailed explanation of these instructions.
	if x86.CanUse1InsnTLS(base.Ctxt) {
		// MOVQ (TLS), r
		p := s.Prog(x86.AMOVQ)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = x86.REG_TLS
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
	} else {
		// MOVQ TLS, r
		// MOVQ (r)(TLS*1), r
		p := s.Prog(x86.AMOVQ)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = x86.REG_TLS
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
		q := s.Prog(x86.AMOVQ)
		q.From.Type = obj.TYPE_MEM
		q.From.Reg = r
		q.From.Index = x86.REG_TLS
		q.From.Scale = 1
		q.To.Type = obj.TYPE_REG
		q.To.Reg = r
	}
}

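// ssaGenValue generates machine instructions for SSA value v.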
func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|
switch v.Op {
|
|
case ssa.OpAMD64VFMADD231SD, ssa.OpAMD64VFMADD231SS:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
|
|
p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
|
|
p.AddRestSourceReg(v.Args[1].Reg())
|
|
case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
|
|
r := v.Reg()
|
|
r1 := v.Args[0].Reg()
|
|
r2 := v.Args[1].Reg()
|
|
switch {
|
|
case r == r1:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r2
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
case r == r2:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r1
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
default:
|
|
var asm obj.As
|
|
if v.Op == ssa.OpAMD64ADDQ {
|
|
asm = x86.ALEAQ
|
|
} else {
|
|
asm = x86.ALEAL
|
|
}
|
|
p := s.Prog(asm)
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = r1
|
|
p.From.Scale = 1
|
|
p.From.Index = r2
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
}
|
|
// 2-address opcode arithmetic
|
|
case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
|
|
ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
|
|
ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
|
|
ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
|
|
ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
|
|
ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
|
|
ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
|
|
ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
|
|
ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
|
|
ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
|
|
ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
|
|
ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
|
|
ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
|
|
ssa.OpAMD64POR, ssa.OpAMD64PXOR,
|
|
ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
|
|
ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
|
|
ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
|
|
ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
|
|
ssa.OpAMD64PUNPCKLBW:
|
|
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
|
|
|
|
case ssa.OpAMD64PSHUFLW:
|
|
p := s.Prog(v.Op.Asm())
|
|
imm := v.AuxInt
|
|
if imm < 0 || imm > 255 {
|
|
v.Fatalf("Invalid source selection immediate")
|
|
}
|
|
p.From.Offset = imm
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.AddRestSourceReg(v.Args[0].Reg())
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64PSHUFBbroadcast:
|
|
// PSHUFB with a control mask of zero copies byte 0 to all
|
|
// bytes in the register.
|
|
//
|
|
// X15 is always zero with ABIInternal.
|
|
if s.ABI != obj.ABIInternal {
|
|
// zero X15 manually
|
|
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
|
|
}
|
|
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
p.From.Reg = x86.REG_X15
|
|
|
|
case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
|
|
p := s.Prog(v.Op.Asm())
|
|
lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = bits
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = lo
|
|
p.AddRestSourceReg(hi)
|
|
|
|
case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
|
|
ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
|
|
ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[0].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
switch v.Op {
|
|
case ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
|
|
p.To.Reg = v.Reg0()
|
|
default:
|
|
p.To.Reg = v.Reg()
|
|
}
|
|
|
|
case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[0].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
p.AddRestSourceReg(v.Args[1].Reg())
|
|
|
|
case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ,
|
|
ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ,
|
|
ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ:
|
|
p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
|
|
p.AddRestSourceReg(v.Args[0].Reg())
|
|
|
|
case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
|
|
ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload,
|
|
ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload:
|
|
p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
|
|
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
|
|
ssagen.AddAux(&m, v)
|
|
p.AddRestSource(m)
|
|
|
|
case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8,
|
|
ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8,
|
|
ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8,
|
|
ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8,
|
|
ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8,
|
|
ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8:
|
|
p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg())
|
|
m := obj.Addr{Type: obj.TYPE_MEM}
|
|
memIdx(&m, v)
|
|
ssagen.AddAux(&m, v)
|
|
p.AddRestSource(m)
|
|
|
|
case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
|
|
// Arg[0] (the dividend) is in AX.
|
|
// Arg[1] (the divisor) can be in any other register.
|
|
// Result[0] (the quotient) is in AX.
|
|
// Result[1] (the remainder) is in DX.
|
|
r := v.Args[1].Reg()
|
|
|
|
// Zero extend dividend.
|
|
opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
|
|
|
|
// Issue divide.
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r
|
|
|
|
case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
|
|
// Arg[0] (the dividend) is in AX.
|
|
// Arg[1] (the divisor) can be in any other register.
|
|
// Result[0] (the quotient) is in AX.
|
|
// Result[1] (the remainder) is in DX.
|
|
r := v.Args[1].Reg()
|
|
|
|
var opCMP, opNEG, opSXD obj.As
|
|
switch v.Op {
|
|
case ssa.OpAMD64DIVQ:
|
|
opCMP, opNEG, opSXD = x86.ACMPQ, x86.ANEGQ, x86.ACQO
|
|
case ssa.OpAMD64DIVL:
|
|
opCMP, opNEG, opSXD = x86.ACMPL, x86.ANEGL, x86.ACDQ
|
|
case ssa.OpAMD64DIVW:
|
|
opCMP, opNEG, opSXD = x86.ACMPW, x86.ANEGW, x86.ACWD
|
|
}
|
|
|
|
// CPU faults upon signed overflow, which occurs when the most
|
|
// negative int is divided by -1. Handle divide by -1 as a special case.
|
|
var j1, j2 *obj.Prog
|
|
if ssa.DivisionNeedsFixUp(v) {
|
|
c := s.Prog(opCMP)
|
|
c.From.Type = obj.TYPE_REG
|
|
c.From.Reg = r
|
|
c.To.Type = obj.TYPE_CONST
|
|
c.To.Offset = -1
|
|
|
|
// Divisor is not -1, proceed with normal division.
|
|
j1 = s.Prog(x86.AJNE)
|
|
j1.To.Type = obj.TYPE_BRANCH
|
|
|
|
// Divisor is -1, manually compute quotient and remainder via fixup code.
|
|
// n / -1 = -n
|
|
n1 := s.Prog(opNEG)
|
|
n1.To.Type = obj.TYPE_REG
|
|
n1.To.Reg = x86.REG_AX
|
|
|
|
// n % -1 == 0
|
|
opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
|
|
|
|
// TODO(khr): issue only the -1 fixup code we need.
|
|
// For instance, if only the quotient is used, no point in zeroing the remainder.
|
|
|
|
// Skip over normal division.
|
|
j2 = s.Prog(obj.AJMP)
|
|
j2.To.Type = obj.TYPE_BRANCH
|
|
}
|
|
|
|
// Sign extend dividend and perform division.
|
|
p := s.Prog(opSXD)
|
|
if j1 != nil {
|
|
j1.To.SetTarget(p)
|
|
}
|
|
p = s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r
|
|
|
|
if j2 != nil {
|
|
j2.To.SetTarget(s.Pc())
|
|
}
|
|
|
|
case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
|
|
// the frontend rewrites constant division by 8/16/32 bit integers into
|
|
// HMUL by a constant
|
|
// SSA rewrites generate the 64 bit versions
|
|
|
|
// Arg[0] is already in AX as it's the only register we allow
|
|
// and DX is the only output we care about (the high bits)
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[1].Reg()
|
|
|
|
// IMULB puts the high portion in AH instead of DL,
|
|
// so move it to DL for consistency
|
|
if v.Type.Size() == 1 {
|
|
m := s.Prog(x86.AMOVB)
|
|
m.From.Type = obj.TYPE_REG
|
|
m.From.Reg = x86.REG_AH
|
|
m.To.Type = obj.TYPE_REG
|
|
m.To.Reg = x86.REG_DX
|
|
}
|
|
|
|
case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
|
|
// Arg[0] is already in AX as it's the only register we allow
|
|
// results lo in AX
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[1].Reg()
|
|
|
|
case ssa.OpAMD64MULQU2:
|
|
// Arg[0] is already in AX as it's the only register we allow
|
|
// results hi in DX, lo in AX
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[1].Reg()
|
|
|
|
case ssa.OpAMD64DIVQU2:
|
|
// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
|
|
// results q in AX, r in DX
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[2].Reg()
|
|
|
|
case ssa.OpAMD64AVGQU:
|
|
// compute (x+y)/2 unsigned.
|
|
// Do a 64-bit add, the overflow goes into the carry.
|
|
// Shift right once and pull the carry back into the 63rd bit.
|
|
p := s.Prog(x86.AADDQ)
|
|
p.From.Type = obj.TYPE_REG
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
p.From.Reg = v.Args[1].Reg()
|
|
p = s.Prog(x86.ARCRQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = 1
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
|
|
r := v.Reg0()
|
|
r0 := v.Args[0].Reg()
|
|
r1 := v.Args[1].Reg()
|
|
switch r {
|
|
case r0:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r1
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
case r1:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r0
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
default:
|
|
v.Fatalf("output not in same register as an input %s", v.LongString())
|
|
}
|
|
|
|
case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[1].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg0()
|
|
|
|
case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg0()
|
|
|
|
case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
|
|
r := v.Reg()
|
|
a := v.Args[0].Reg()
|
|
if r == a {
|
|
switch v.AuxInt {
|
|
case 1:
|
|
var asm obj.As
|
|
// Software optimization manual recommends add $1,reg.
|
|
// But inc/dec is 1 byte smaller. ICC always uses inc.
// Clang/GCC choose depending on flags, but prefer add.
// Experiments show that inc/dec is both a little faster
// and makes the binary a little smaller.
|
|
if v.Op == ssa.OpAMD64ADDQconst {
|
|
asm = x86.AINCQ
|
|
} else {
|
|
asm = x86.AINCL
|
|
}
|
|
p := s.Prog(asm)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
return
|
|
case -1:
|
|
var asm obj.As
|
|
if v.Op == ssa.OpAMD64ADDQconst {
|
|
asm = x86.ADECQ
|
|
} else {
|
|
asm = x86.ADECL
|
|
}
|
|
p := s.Prog(asm)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
return
|
|
case 0x80:
|
|
// 'SUBQ $-0x80, r' is shorter to encode than
|
|
// and functionally equivalent to 'ADDQ $0x80, r'.
|
|
asm := x86.ASUBL
|
|
if v.Op == ssa.OpAMD64ADDQconst {
|
|
asm = x86.ASUBQ
|
|
}
|
|
p := s.Prog(asm)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = -0x80
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
return
|
|
|
|
}
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
return
|
|
}
|
|
var asm obj.As
|
|
if v.Op == ssa.OpAMD64ADDQconst {
|
|
asm = x86.ALEAQ
|
|
} else {
|
|
asm = x86.ALEAL
|
|
}
|
|
p := s.Prog(asm)
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = a
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
|
|
case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
|
|
ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
|
|
ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
|
|
ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
|
|
ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
|
|
ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
|
|
ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
|
|
ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
|
|
ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
|
|
ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
|
|
ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
|
|
ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[1].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
|
|
// Flag condition: ^ZERO || PARITY
|
|
// Generate:
|
|
// CMOV*NE SRC,DST
|
|
// CMOV*PS SRC,DST
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[1].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
var q *obj.Prog
|
|
if v.Op == ssa.OpAMD64CMOVQNEF {
|
|
q = s.Prog(x86.ACMOVQPS)
|
|
} else if v.Op == ssa.OpAMD64CMOVLNEF {
|
|
q = s.Prog(x86.ACMOVLPS)
|
|
} else {
|
|
q = s.Prog(x86.ACMOVWPS)
|
|
}
|
|
q.From.Type = obj.TYPE_REG
|
|
q.From.Reg = v.Args[1].Reg()
|
|
q.To.Type = obj.TYPE_REG
|
|
q.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
|
|
// Flag condition: ZERO && !PARITY
|
|
// Generate:
|
|
// MOV SRC,TMP
|
|
// CMOV*NE DST,TMP
|
|
// CMOV*PC TMP,DST
|
|
//
|
|
// TODO(rasky): we could generate:
|
|
// CMOV*NE DST,SRC
|
|
// CMOV*PC SRC,DST
|
|
// But this requires a way for regalloc to know that SRC might be
|
|
// clobbered by this instruction.
|
|
t := v.RegTmp()
|
|
opregreg(s, moveByRegsWidth(t, v.Args[1].Reg(), v.Type.Size()), t, v.Args[1].Reg())
|
|
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = t
|
|
var q *obj.Prog
|
|
if v.Op == ssa.OpAMD64CMOVQEQF {
|
|
q = s.Prog(x86.ACMOVQPC)
|
|
} else if v.Op == ssa.OpAMD64CMOVLEQF {
|
|
q = s.Prog(x86.ACMOVLPC)
|
|
} else {
|
|
q = s.Prog(x86.ACMOVWPC)
|
|
}
|
|
q.From.Type = obj.TYPE_REG
|
|
q.From.Reg = t
|
|
q.To.Type = obj.TYPE_REG
|
|
q.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
|
|
r := v.Reg()
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
p.AddRestSourceReg(v.Args[0].Reg())
|
|
|
|
case ssa.OpAMD64ANDQconst:
|
|
asm := v.Op.Asm()
|
|
// If the constant is positive and fits into 32 bits, use ANDL.
|
|
// This saves a few bytes of encoding.
|
|
if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
|
|
asm = x86.AANDL
|
|
}
|
|
p := s.Prog(asm)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
|
|
ssa.OpAMD64ANDLconst,
|
|
ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
|
|
ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
|
|
ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
|
|
ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
|
|
ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
|
|
ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
|
|
r := v.Reg()
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = r
|
|
case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
|
|
ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
|
|
ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
|
|
p := s.Prog(v.Op.Asm())
|
|
memIdx(&p.From, v)
|
|
o := v.Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = o
|
|
if v.AuxInt != 0 && v.Aux == nil {
|
|
// Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA.
|
|
switch v.Op {
|
|
case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
|
|
p = s.Prog(x86.ALEAQ)
|
|
case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
|
|
p = s.Prog(x86.ALEAL)
|
|
case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
|
|
p = s.Prog(x86.ALEAW)
|
|
}
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = o
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = o
|
|
}
|
|
ssagen.AddAux(&p.From, v)
|
|
case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
|
|
ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
|
|
ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
|
|
opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
|
|
case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
|
|
// Go assembler has swapped operands for UCOMISx relative to CMP,
|
|
// must account for that right here.
|
|
opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
|
|
case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[0].Reg()
|
|
p.To.Type = obj.TYPE_CONST
|
|
p.To.Offset = v.AuxInt
|
|
case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
|
|
ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
|
|
ssa.OpAMD64BTSQconst,
|
|
ssa.OpAMD64BTCQconst,
|
|
ssa.OpAMD64BTRQconst:
|
|
op := v.Op
|
|
if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
|
|
// Emit 32-bit version because it's shorter
|
|
op = ssa.OpAMD64BTLconst
|
|
}
|
|
p := s.Prog(op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Args[0].Reg()
|
|
case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Args[1].Reg()
|
|
case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
|
|
sc := v.AuxValAndOff()
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux2(&p.From, v, sc.Off64())
|
|
p.To.Type = obj.TYPE_CONST
|
|
p.To.Offset = sc.Val64()
|
|
case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1:
|
|
p := s.Prog(v.Op.Asm())
|
|
memIdx(&p.From, v)
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Args[2].Reg()
|
|
case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1:
|
|
sc := v.AuxValAndOff()
|
|
p := s.Prog(v.Op.Asm())
|
|
memIdx(&p.From, v)
|
|
ssagen.AddAux2(&p.From, v, sc.Off64())
|
|
p.To.Type = obj.TYPE_CONST
|
|
p.To.Offset = sc.Val64()
|
|
case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
|
|
x := v.Reg()
|
|
|
|
// If flags aren't live (indicated by v.Aux == nil),
|
|
// then we can rewrite MOV $0, AX into XOR AX, AX.
|
|
if v.AuxInt == 0 && v.Aux == nil {
|
|
opregreg(s, x86.AXORL, x, x)
|
|
break
|
|
}
|
|
|
|
asm := v.Op.Asm()
|
|
// Use MOVL to move a small constant into a register
|
|
// when the constant is positive and fits into 32 bits.
|
|
if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
|
|
// The upper 32 bits are zeroed automatically when using MOVL.
|
|
asm = x86.AMOVL
|
|
}
|
|
p := s.Prog(asm)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = x
|
|
|
|
case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
|
|
x := v.Reg()
|
|
if !isFPReg(x) && v.AuxInt == 0 && v.Aux == nil {
|
|
opregreg(s, x86.AXORL, x, x)
|
|
break
|
|
}
|
|
p := s.Prog(storeByRegWidth(x, v.Type.Size()))
|
|
p.From.Type = obj.TYPE_FCONST
|
|
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = x
|
|
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
|
|
ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
|
|
ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
|
|
ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2,
|
|
ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8:
|
|
p := s.Prog(v.Op.Asm())
|
|
memIdx(&p.From, v)
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
|
|
ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
|
|
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
|
|
ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[1].Reg()
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux(&p.To, v)
|
|
case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
|
|
ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
|
|
ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
|
|
ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
|
|
ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
|
|
ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
|
|
ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8,
|
|
ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[2].Reg()
|
|
memIdx(&p.To, v)
|
|
ssagen.AddAux(&p.To, v)
|
|
case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
|
|
sc := v.AuxValAndOff()
|
|
off := sc.Off64()
|
|
val := sc.Val()
|
|
if val == 1 || val == -1 {
|
|
var asm obj.As
|
|
if v.Op == ssa.OpAMD64ADDQconstmodify {
|
|
if val == 1 {
|
|
asm = x86.AINCQ
|
|
} else {
|
|
asm = x86.ADECQ
|
|
}
|
|
} else {
|
|
if val == 1 {
|
|
asm = x86.AINCL
|
|
} else {
|
|
asm = x86.ADECL
|
|
}
|
|
}
|
|
p := s.Prog(asm)
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux2(&p.To, v, off)
|
|
break
|
|
}
|
|
fallthrough
|
|
case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
|
|
ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
|
|
ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
|
|
sc := v.AuxValAndOff()
|
|
off := sc.Off64()
|
|
val := sc.Val64()
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = val
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux2(&p.To, v, off)
|
|
|
|
case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
sc := v.AuxValAndOff()
|
|
p.From.Offset = sc.Val64()
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux2(&p.To, v, sc.Off64())
|
|
case ssa.OpAMD64MOVOstoreconst:
|
|
sc := v.AuxValAndOff()
|
|
if sc.Val() != 0 {
|
|
v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString())
|
|
}
|
|
|
|
if s.ABI != obj.ABIInternal {
|
|
// zero X15 manually
|
|
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
|
|
}
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = x86.REG_X15
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux2(&p.To, v, sc.Off64())
|
|
|
|
case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
|
|
ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
|
|
ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
|
|
ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
|
|
ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
sc := v.AuxValAndOff()
|
|
p.From.Offset = sc.Val64()
|
|
switch {
|
|
case p.As == x86.AADDQ && p.From.Offset == 1:
|
|
p.As = x86.AINCQ
|
|
p.From.Type = obj.TYPE_NONE
|
|
case p.As == x86.AADDQ && p.From.Offset == -1:
|
|
p.As = x86.ADECQ
|
|
p.From.Type = obj.TYPE_NONE
|
|
case p.As == x86.AADDL && p.From.Offset == 1:
|
|
p.As = x86.AINCL
|
|
p.From.Type = obj.TYPE_NONE
|
|
case p.As == x86.AADDL && p.From.Offset == -1:
|
|
p.As = x86.ADECL
|
|
p.From.Type = obj.TYPE_NONE
|
|
}
|
|
memIdx(&p.To, v)
|
|
ssagen.AddAux2(&p.To, v, sc.Off64())
|
|
case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
|
|
ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
|
|
ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
|
|
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
|
|
case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
|
|
r := v.Reg()
|
|
// Break false dependency on destination register.
|
|
opregreg(s, x86.AXORPS, r, r)
|
|
opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
|
|
case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
|
|
var p *obj.Prog
|
|
switch v.Op {
|
|
case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
|
|
p = s.Prog(x86.AMOVQ)
|
|
case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
|
|
p = s.Prog(x86.AMOVL)
|
|
}
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[0].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
|
|
ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
|
|
ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
|
|
ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
|
|
ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Reg = v.Args[1].Reg()
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
|
|
ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
|
|
ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
|
|
ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
|
|
ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
|
|
ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
|
|
ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
|
|
ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
|
|
ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
|
|
p := s.Prog(v.Op.Asm())
|
|
|
|
r, i := v.Args[1].Reg(), v.Args[2].Reg()
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Scale = v.Op.Scale()
|
|
if p.From.Scale == 1 && i == x86.REG_SP {
|
|
r, i = i, r
|
|
}
|
|
p.From.Reg = r
|
|
p.From.Index = i
|
|
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64LoweredZero:
|
|
if s.ABI != obj.ABIInternal {
|
|
// zero X15 manually
|
|
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
|
|
}
|
|
ptrReg := v.Args[0].Reg()
|
|
n := v.AuxInt
|
|
if n < 16 {
|
|
v.Fatalf("Zero too small %d", n)
|
|
}
|
|
zero16 := func(off int64) {
|
|
zero16(s, ptrReg, off)
|
|
}
|
|
|
|
// Generate zeroing instructions.
|
|
var off int64
|
|
for n >= 16 {
|
|
zero16(off)
|
|
off += 16
|
|
n -= 16
|
|
}
|
|
if n != 0 {
|
|
// use partially overlapped write.
|
|
// TODO: n <= 8, use smaller write?
|
|
zero16(off + n - 16)
|
|
}
|
|
|
|
case ssa.OpAMD64LoweredZeroLoop:
|
|
if s.ABI != obj.ABIInternal {
|
|
// zero X15 manually
|
|
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
|
|
}
|
|
ptrReg := v.Args[0].Reg()
|
|
countReg := v.RegTmp()
|
|
n := v.AuxInt
|
|
loopSize := int64(64)
|
|
if n < 3*loopSize {
|
|
// - a loop count of 0 won't work.
|
|
// - a loop count of 1 is useless.
|
|
// - a loop count of 2 is a code size ~tie
|
|
// 4 instructions to implement the loop
|
|
// 4 instructions in the loop body
|
|
// vs
|
|
// 8 instructions in the straightline code
|
|
// Might as well use straightline code.
|
|
v.Fatalf("ZeroLoop size too small %d", n)
|
|
}
|
|
zero16 := func(off int64) {
|
|
zero16(s, ptrReg, off)
|
|
}
|
|
|
|
// Put iteration count in a register.
|
|
// MOVL $n, countReg
|
|
p := s.Prog(x86.AMOVL)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = n / loopSize
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = countReg
|
|
cntInit := p
|
|
|
|
// Zero loopSize bytes starting at ptrReg.
|
|
for i := range loopSize / 16 {
|
|
zero16(i * 16)
|
|
}
|
|
// ADDQ $loopSize, ptrReg
|
|
p = s.Prog(x86.AADDQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = loopSize
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = ptrReg
|
|
// DECL countReg
|
|
p = s.Prog(x86.ADECL)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = countReg
|
|
// Jump to first instruction in loop if we're not done yet.
|
|
// JNE head
|
|
p = s.Prog(x86.AJNE)
|
|
p.To.Type = obj.TYPE_BRANCH
|
|
p.To.SetTarget(cntInit.Link)
|
|
|
|
// Multiples of the loop size are now done.
|
|
n %= loopSize
|
|
|
|
// Write any fractional portion.
|
|
var off int64
|
|
for n >= 16 {
|
|
zero16(off)
|
|
off += 16
|
|
n -= 16
|
|
}
|
|
if n != 0 {
|
|
// Use partially-overlapping write.
|
|
// TODO: n <= 8, use smaller write?
|
|
zero16(off + n - 16)
|
|
}
|
|
|
|
case ssa.OpAMD64LoweredMove:
|
|
dstReg := v.Args[0].Reg()
|
|
srcReg := v.Args[1].Reg()
|
|
if dstReg == srcReg {
|
|
break
|
|
}
|
|
tmpReg := int16(x86.REG_X14)
|
|
n := v.AuxInt
|
|
if n < 16 {
|
|
v.Fatalf("Move too small %d", n)
|
|
}
|
|
// move 16 bytes from srcReg+off to dstReg+off.
|
|
move16 := func(off int64) {
|
|
move16(s, srcReg, dstReg, tmpReg, off)
|
|
}
|
|
|
|
// Generate copying instructions.
|
|
var off int64
|
|
for n >= 16 {
|
|
move16(off)
|
|
off += 16
|
|
n -= 16
|
|
}
|
|
if n != 0 {
|
|
// use partially overlapped read/write.
|
|
// TODO: use smaller operations when we can?
|
|
move16(off + n - 16)
|
|
}
|
|
|
|
case ssa.OpAMD64LoweredMoveLoop:
|
|
dstReg := v.Args[0].Reg()
|
|
srcReg := v.Args[1].Reg()
|
|
if dstReg == srcReg {
|
|
break
|
|
}
|
|
countReg := v.RegTmp()
|
|
tmpReg := int16(x86.REG_X14)
|
|
n := v.AuxInt
|
|
loopSize := int64(64)
|
|
if n < 3*loopSize {
|
|
// - a loop count of 0 won't work.
|
|
// - a loop count of 1 is useless.
|
|
// - a loop count of 2 is a code size ~tie
|
|
// 4 instructions to implement the loop
|
|
// 4 instructions in the loop body
|
|
// vs
|
|
// 8 instructions in the straightline code
|
|
// Might as well use straightline code.
|
|
v.Fatalf("ZeroLoop size too small %d", n)
|
|
}
|
|
// move 16 bytes from srcReg+off to dstReg+off.
|
|
move16 := func(off int64) {
|
|
move16(s, srcReg, dstReg, tmpReg, off)
|
|
}
|
|
|
|
// Put iteration count in a register.
|
|
// MOVL $n, countReg
|
|
p := s.Prog(x86.AMOVL)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = n / loopSize
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = countReg
|
|
cntInit := p
|
|
|
|
// Copy loopSize bytes starting at srcReg to dstReg.
|
|
for i := range loopSize / 16 {
|
|
move16(i * 16)
|
|
}
|
|
// ADDQ $loopSize, srcReg
|
|
p = s.Prog(x86.AADDQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = loopSize
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = srcReg
|
|
// ADDQ $loopSize, dstReg
|
|
p = s.Prog(x86.AADDQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = loopSize
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = dstReg
|
|
// DECL countReg
|
|
p = s.Prog(x86.ADECL)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = countReg
|
|
// Jump to loop header if we're not done yet.
|
|
// JNE head
|
|
p = s.Prog(x86.AJNE)
|
|
p.To.Type = obj.TYPE_BRANCH
|
|
p.To.SetTarget(cntInit.Link)
|
|
|
|
// Multiples of the loop size are now done.
|
|
n %= loopSize
|
|
|
|
// Copy any fractional portion.
|
|
var off int64
|
|
for n >= 16 {
|
|
move16(off)
|
|
off += 16
|
|
n -= 16
|
|
}
|
|
if n != 0 {
|
|
// Use partially-overlapping copy.
|
|
move16(off + n - 16)
|
|
}
|
|
|
|
case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
|
|
if v.Type.IsMemory() {
|
|
return
|
|
}
|
|
x := v.Args[0].Reg()
|
|
y := v.Reg()
|
|
if v.Type.IsSIMD() {
|
|
x = simdOrMaskReg(v.Args[0])
|
|
y = simdOrMaskReg(v)
|
|
}
|
|
if x != y {
|
|
opregreg(s, moveByRegsWidth(y, x, v.Type.Size()), y, x)
|
|
}
|
|
case ssa.OpLoadReg:
|
|
if v.Type.IsFlags() {
|
|
v.Fatalf("load flags not implemented: %v", v.LongString())
|
|
return
|
|
}
|
|
r := v.Reg()
|
|
p := s.Prog(loadByRegWidth(r, v.Type.Size()))
|
|
ssagen.AddrAuto(&p.From, v.Args[0])
|
|
p.To.Type = obj.TYPE_REG
|
|
if v.Type.IsSIMD() {
|
|
r = simdOrMaskReg(v)
|
|
}
|
|
p.To.Reg = r
|
|
|
|
case ssa.OpStoreReg:
|
|
if v.Type.IsFlags() {
|
|
v.Fatalf("store flags not implemented: %v", v.LongString())
|
|
return
|
|
}
|
|
r := v.Args[0].Reg()
|
|
if v.Type.IsSIMD() {
|
|
r = simdOrMaskReg(v.Args[0])
|
|
}
|
|
p := s.Prog(storeByRegWidth(r, v.Type.Size()))
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = r
|
|
ssagen.AddrAuto(&p.To, v)
|
|
case ssa.OpAMD64LoweredHasCPUFeature:
|
|
p := s.Prog(x86.AMOVBLZX)
|
|
p.From.Type = obj.TYPE_MEM
|
|
ssagen.AddAux(&p.From, v)
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpArgIntReg, ssa.OpArgFloatReg:
|
|
// The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill
|
|
// The loop only runs once.
|
|
for _, ap := range v.Block.Func.RegArgs {
|
|
// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
|
|
addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
|
|
reg := ap.Reg
|
|
t := ap.Type
|
|
sz := t.Size()
|
|
if t.IsSIMD() {
|
|
reg = simdRegBySize(reg, sz)
|
|
}
|
|
s.FuncInfo().AddSpill(
|
|
obj.RegSpill{Reg: reg, Addr: addr, Unspill: loadByRegWidth(reg, sz), Spill: storeByRegWidth(reg, sz)})
|
|
}
|
|
v.Block.Func.RegArgs = nil
|
|
ssagen.CheckArgReg(v)
|
|
case ssa.OpAMD64LoweredGetClosurePtr:
|
|
// Closure pointer is DX.
|
|
ssagen.CheckLoweredGetClosurePtr(v)
|
|
case ssa.OpAMD64LoweredGetG:
|
|
if s.ABI == obj.ABIInternal {
|
|
v.Fatalf("LoweredGetG should not appear in ABIInternal")
|
|
}
|
|
r := v.Reg()
|
|
getgFromTLS(s, r)
|
|
case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
|
|
if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
|
|
// zeroing X15 when entering ABIInternal from ABI0
|
|
zeroX15(s)
|
|
// set G register from TLS
|
|
getgFromTLS(s, x86.REG_R14)
|
|
}
|
|
if v.Op == ssa.OpAMD64CALLtail {
|
|
s.TailCall(v)
|
|
break
|
|
}
|
|
s.Call(v)
|
|
if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
|
|
// zeroing X15 when entering ABIInternal from ABI0
|
|
zeroX15(s)
|
|
// set G register from TLS
|
|
getgFromTLS(s, x86.REG_R14)
|
|
}
|
|
case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
|
|
s.Call(v)
|
|
|
|
case ssa.OpAMD64LoweredGetCallerPC:
|
|
p := s.Prog(x86.AMOVQ)
|
|
p.From.Type = obj.TYPE_MEM
|
|
p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
|
|
p.From.Name = obj.NAME_PARAM
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64LoweredGetCallerSP:
|
|
// caller's SP is the address of the first arg
|
|
mov := x86.AMOVQ
|
|
if types.PtrSize == 4 {
|
|
mov = x86.AMOVL
|
|
}
|
|
p := s.Prog(mov)
|
|
p.From.Type = obj.TYPE_ADDR
|
|
p.From.Offset = -base.Ctxt.Arch.FixedFrameSize // 0 on amd64, just to be consistent with other architectures
|
|
p.From.Name = obj.NAME_PARAM
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64LoweredWB:
|
|
p := s.Prog(obj.ACALL)
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Name = obj.NAME_EXTERN
|
|
// AuxInt encodes how many buffer entries we need.
|
|
p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
|
|
|
|
case ssa.OpAMD64LoweredPanicBoundsRR, ssa.OpAMD64LoweredPanicBoundsRC, ssa.OpAMD64LoweredPanicBoundsCR, ssa.OpAMD64LoweredPanicBoundsCC:
|
|
// Compute the constant we put in the PCData entry for this call.
|
|
code, signed := ssa.BoundsKind(v.AuxInt).Code()
|
|
xIsReg := false
|
|
yIsReg := false
|
|
xVal := 0
|
|
yVal := 0
|
|
switch v.Op {
|
|
case ssa.OpAMD64LoweredPanicBoundsRR:
|
|
xIsReg = true
|
|
xVal = int(v.Args[0].Reg() - x86.REG_AX)
|
|
yIsReg = true
|
|
yVal = int(v.Args[1].Reg() - x86.REG_AX)
|
|
case ssa.OpAMD64LoweredPanicBoundsRC:
|
|
xIsReg = true
|
|
xVal = int(v.Args[0].Reg() - x86.REG_AX)
|
|
c := v.Aux.(ssa.PanicBoundsC).C
|
|
if c >= 0 && c <= abi.BoundsMaxConst {
|
|
yVal = int(c)
|
|
} else {
|
|
// Move constant to a register
|
|
yIsReg = true
|
|
if yVal == xVal {
|
|
yVal = 1
|
|
}
|
|
p := s.Prog(x86.AMOVQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = c
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = x86.REG_AX + int16(yVal)
|
|
}
|
|
case ssa.OpAMD64LoweredPanicBoundsCR:
|
|
yIsReg = true
|
|
yVal = int(v.Args[0].Reg() - x86.REG_AX)
|
|
c := v.Aux.(ssa.PanicBoundsC).C
|
|
if c >= 0 && c <= abi.BoundsMaxConst {
|
|
xVal = int(c)
|
|
} else {
|
|
// Move constant to a register
|
|
xIsReg = true
|
|
if xVal == yVal {
|
|
xVal = 1
|
|
}
|
|
p := s.Prog(x86.AMOVQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = c
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = x86.REG_AX + int16(xVal)
|
|
}
|
|
case ssa.OpAMD64LoweredPanicBoundsCC:
|
|
c := v.Aux.(ssa.PanicBoundsCC).Cx
|
|
if c >= 0 && c <= abi.BoundsMaxConst {
|
|
xVal = int(c)
|
|
} else {
|
|
// Move constant to a register
|
|
xIsReg = true
|
|
p := s.Prog(x86.AMOVQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = c
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = x86.REG_AX + int16(xVal)
|
|
}
|
|
c = v.Aux.(ssa.PanicBoundsCC).Cy
|
|
if c >= 0 && c <= abi.BoundsMaxConst {
|
|
yVal = int(c)
|
|
} else {
|
|
// Move constant to a register
|
|
yIsReg = true
|
|
yVal = 1
|
|
p := s.Prog(x86.AMOVQ)
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = c
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = x86.REG_AX + int16(yVal)
|
|
}
|
|
}
|
|
c := abi.BoundsEncode(code, signed, xIsReg, yIsReg, xVal, yVal)
|
|
|
|
p := s.Prog(obj.APCDATA)
|
|
p.From.SetConst(abi.PCDATA_PanicBounds)
|
|
p.To.SetConst(int64(c))
|
|
p = s.Prog(obj.ACALL)
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Name = obj.NAME_EXTERN
|
|
p.To.Sym = ir.Syms.PanicBounds
|
|
|
|
case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
|
|
ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
|
|
ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64NEGLflags:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg0()
|
|
|
|
case ssa.OpAMD64ADDQconstflags, ssa.OpAMD64ADDLconstflags:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.From.Offset = v.AuxInt
|
|
// Note: the inc/dec instructions do not modify
|
|
// the carry flag like add$1 / sub$1 do.
|
|
// We currently never use the CF/OF flags from
|
|
// these instructions, so that is ok.
|
|
switch {
|
|
case p.As == x86.AADDQ && p.From.Offset == 1:
|
|
p.As = x86.AINCQ
|
|
p.From.Type = obj.TYPE_NONE
|
|
case p.As == x86.AADDQ && p.From.Offset == -1:
|
|
p.As = x86.ADECQ
|
|
p.From.Type = obj.TYPE_NONE
|
|
case p.As == x86.AADDL && p.From.Offset == 1:
|
|
p.As = x86.AINCL
|
|
p.From.Type = obj.TYPE_NONE
|
|
case p.As == x86.AADDL && p.From.Offset == -1:
|
|
p.As = x86.ADECL
|
|
p.From.Type = obj.TYPE_NONE
|
|
}
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg0()
|
|
|
|
case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[0].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
switch v.Op {
|
|
case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
|
|
p.To.Reg = v.Reg0()
|
|
case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
|
|
p.To.Reg = v.Reg()
|
|
}
|
|
case ssa.OpAMD64LoweredRound32F, ssa.OpAMD64LoweredRound64F:
|
|
// input is already rounded
|
|
case ssa.OpAMD64ROUNDSD:
|
|
p := s.Prog(v.Op.Asm())
|
|
val := v.AuxInt
|
|
// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
|
|
if val < 0 || val > 3 {
|
|
v.Fatalf("Invalid rounding mode")
|
|
}
|
|
p.From.Offset = val
|
|
p.From.Type = obj.TYPE_CONST
|
|
p.AddRestSourceReg(v.Args[0].Reg())
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
|
|
ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
|
|
ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
|
|
if v.Args[0].Reg() != v.Reg() {
|
|
// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel cpus.
|
|
// TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
|
|
// Xor register with itself to break the dependency.
|
|
opregreg(s, x86.AXORL, v.Reg(), v.Reg())
|
|
}
|
|
p := s.Prog(v.Op.Asm())
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = v.Args[0].Reg()
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
|
|
ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
|
|
ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
|
|
ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
|
|
ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
|
|
ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
|
|
ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
|
|
ssa.OpAMD64SETO:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
|
|
case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
|
|
ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
|
|
ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
|
|
ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
|
|
ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
|
|
p := s.Prog(v.Op.Asm())
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Reg = v.Args[0].Reg()
|
|
ssagen.AddAux(&p.To, v)
|
|
|
|
case ssa.OpAMD64SETEQstoreidx1, ssa.OpAMD64SETNEstoreidx1,
|
|
ssa.OpAMD64SETLstoreidx1, ssa.OpAMD64SETLEstoreidx1,
|
|
ssa.OpAMD64SETGstoreidx1, ssa.OpAMD64SETGEstoreidx1,
|
|
ssa.OpAMD64SETBstoreidx1, ssa.OpAMD64SETBEstoreidx1,
|
|
ssa.OpAMD64SETAstoreidx1, ssa.OpAMD64SETAEstoreidx1:
|
|
p := s.Prog(v.Op.Asm())
|
|
memIdx(&p.To, v)
|
|
ssagen.AddAux(&p.To, v)
|
|
|
|
case ssa.OpAMD64SETNEF:
|
|
t := v.RegTmp()
|
|
p := s.Prog(v.Op.Asm())
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
q := s.Prog(x86.ASETPS)
|
|
q.To.Type = obj.TYPE_REG
|
|
q.To.Reg = t
|
|
// ORL avoids partial register write and is smaller than ORQ, used by old compiler
|
|
opregreg(s, x86.AORL, v.Reg(), t)
|
|
|
|
case ssa.OpAMD64SETEQF:
|
|
t := v.RegTmp()
|
|
p := s.Prog(v.Op.Asm())
|
|
p.To.Type = obj.TYPE_REG
|
|
p.To.Reg = v.Reg()
|
|
q := s.Prog(x86.ASETPC)
|
|
q.To.Type = obj.TYPE_REG
|
|
q.To.Reg = t
|
|
// ANDL avoids partial register write and is smaller than ANDQ, used by old compiler
|
|
opregreg(s, x86.AANDL, v.Reg(), t)
|
|
|
|
case ssa.OpAMD64InvertFlags:
|
|
v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
|
|
case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
|
|
v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
|
|
case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
|
|
v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
|
|
case ssa.OpAMD64REPSTOSQ:
|
|
s.Prog(x86.AREP)
|
|
s.Prog(x86.ASTOSQ)
|
|
case ssa.OpAMD64REPMOVSQ:
|
|
s.Prog(x86.AREP)
|
|
s.Prog(x86.AMOVSQ)
|
|
case ssa.OpAMD64LoweredNilCheck:
|
|
// Issue a load which will fault if the input is nil.
|
|
// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
|
|
// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
|
|
// but it doesn't have false dependency on AX.
|
|
// Or maybe allocate an output register and use MOVL (reg),reg2 ?
|
|
// That trades clobbering flags for clobbering a register.
|
|
p := s.Prog(x86.ATESTB)
|
|
p.From.Type = obj.TYPE_REG
|
|
p.From.Reg = x86.REG_AX
|
|
p.To.Type = obj.TYPE_MEM
|
|
p.To.Reg = v.Args[0].Reg()
|
|
if logopt.Enabled() {
|
|
logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
|
|
}
|
|
if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
|
|
base.WarnfAt(v.Pos, "generated nil check")
|
|
}
|
|
	case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg0()
	case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Reg0()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[1].Reg()
		ssagen.AddAux(&p.To, v)
	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
		s.Prog(x86.ALOCK)
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Reg0()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[1].Reg()
		ssagen.AddAux(&p.To, v)
	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
		if v.Args[1].Reg() != x86.REG_AX {
			v.Fatalf("input[1] not in AX %s", v.LongString())
		}
		s.Prog(x86.ALOCK)
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[2].Reg()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.To, v)
		p = s.Prog(x86.ASETEQ)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg0()
	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock:
		// Atomic memory operations that don't need to return the old value.
		s.Prog(x86.ALOCK)
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[1].Reg()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.To, v)
	case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32:
		// Atomic memory operations that need to return the old value.
		// We need to do these with compare-and-exchange to get access to the old value.
		// loop:
		// MOVQ mask, tmp
		// MOVQ (addr), AX
		// ANDQ AX, tmp
		// LOCK CMPXCHGQ tmp, (addr) : note that AX is implicit old value to compare against
		// JNE loop
		// : result in AX
		mov := x86.AMOVQ
		op := x86.AANDQ
		cmpxchg := x86.ACMPXCHGQ
		switch v.Op {
		case ssa.OpAMD64LoweredAtomicOr64:
			op = x86.AORQ
		case ssa.OpAMD64LoweredAtomicAnd32:
			mov = x86.AMOVL
			op = x86.AANDL
			cmpxchg = x86.ACMPXCHGL
		case ssa.OpAMD64LoweredAtomicOr32:
			mov = x86.AMOVL
			op = x86.AORL
			cmpxchg = x86.ACMPXCHGL
		}
		addr := v.Args[0].Reg()
		mask := v.Args[1].Reg()
		tmp := v.RegTmp()
		p1 := s.Prog(mov)
		p1.From.Type = obj.TYPE_REG
		p1.From.Reg = mask
		p1.To.Type = obj.TYPE_REG
		p1.To.Reg = tmp
		p2 := s.Prog(mov)
		p2.From.Type = obj.TYPE_MEM
		p2.From.Reg = addr
		ssagen.AddAux(&p2.From, v)
		p2.To.Type = obj.TYPE_REG
		p2.To.Reg = x86.REG_AX
		p3 := s.Prog(op)
		p3.From.Type = obj.TYPE_REG
		p3.From.Reg = x86.REG_AX
		p3.To.Type = obj.TYPE_REG
		p3.To.Reg = tmp
		s.Prog(x86.ALOCK)
		p5 := s.Prog(cmpxchg)
		p5.From.Type = obj.TYPE_REG
		p5.From.Reg = tmp
		p5.To.Type = obj.TYPE_MEM
		p5.To.Reg = addr
		ssagen.AddAux(&p5.To, v)
		p6 := s.Prog(x86.AJNE)
		p6.To.Type = obj.TYPE_BRANCH
		p6.To.SetTarget(p1)
	case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
	case ssa.OpClobber:
		p := s.Prog(x86.AMOVL)
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = 0xdeaddead
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = x86.REG_SP
		ssagen.AddAux(&p.To, v)
		p = s.Prog(x86.AMOVL)
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = 0xdeaddead
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = x86.REG_SP
		ssagen.AddAux(&p.To, v)
		p.To.Offset += 4
	case ssa.OpClobberReg:
		x := uint64(0xdeaddeaddeaddead)
		p := s.Prog(x86.AMOVQ)
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = int64(x)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	// SIMD ops
	case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
		s.Prog(v.Op.Asm())

	case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted

	case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
		// These are for initializing the low 32/64 bits of a SIMD register from a "float".
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.AddRestSourceReg(x86.REG_X15)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)

	case ssa.OpAMD64VMOVQload, ssa.OpAMD64VMOVDload,
		ssa.OpAMD64VMOVSSload, ssa.OpAMD64VMOVSDload:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)

	case ssa.OpAMD64VMOVSSconst, ssa.OpAMD64VMOVSDconst:
		// for loading constants directly into SIMD registers
		x := simdReg(v)
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_FCONST
		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
		p.To.Type = obj.TYPE_REG
		p.To.Reg = x

	case ssa.OpAMD64VMOVD, ssa.OpAMD64VMOVQ:
		// These are for initializing the low 32/64 bits of a SIMD register from an "int".
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)

	case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512,
		ssa.OpAMD64KMOVBload, ssa.OpAMD64KMOVWload, ssa.OpAMD64KMOVDload, ssa.OpAMD64KMOVQload:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdOrMaskReg(v)
	case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512,
		ssa.OpAMD64KMOVBstore, ssa.OpAMD64KMOVWstore, ssa.OpAMD64KMOVDstore, ssa.OpAMD64KMOVQstore:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = simdOrMaskReg(v.Args[1])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.To, v)

	case ssa.OpAMD64VPMASK32load128, ssa.OpAMD64VPMASK64load128, ssa.OpAMD64VPMASK32load256, ssa.OpAMD64VPMASK64load256:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)
		p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg

	case ssa.OpAMD64VPMASK32store128, ssa.OpAMD64VPMASK64store128, ssa.OpAMD64VPMASK32store256, ssa.OpAMD64VPMASK64store256:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = simdReg(v.Args[2])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.To, v)
		p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg

	case ssa.OpAMD64VPMASK64load512, ssa.OpAMD64VPMASK32load512, ssa.OpAMD64VPMASK16load512, ssa.OpAMD64VPMASK8load512:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)
		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
		x86.ParseSuffix(p, "Z")             // must be zero if not in mask

	case ssa.OpAMD64VPMASK64store512, ssa.OpAMD64VPMASK32store512, ssa.OpAMD64VPMASK16store512, ssa.OpAMD64VPMASK8store512:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = simdReg(v.Args[2])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.To, v)
		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg

	case ssa.OpAMD64VPMOVMToVec8x16,
		ssa.OpAMD64VPMOVMToVec8x32,
		ssa.OpAMD64VPMOVMToVec8x64,
		ssa.OpAMD64VPMOVMToVec16x8,
		ssa.OpAMD64VPMOVMToVec16x16,
		ssa.OpAMD64VPMOVMToVec16x32,
		ssa.OpAMD64VPMOVMToVec32x4,
		ssa.OpAMD64VPMOVMToVec32x8,
		ssa.OpAMD64VPMOVMToVec32x16,
		ssa.OpAMD64VPMOVMToVec64x2,
		ssa.OpAMD64VPMOVMToVec64x4,
		ssa.OpAMD64VPMOVMToVec64x8:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)

	case ssa.OpAMD64VPMOVVec8x16ToM,
		ssa.OpAMD64VPMOVVec8x32ToM,
		ssa.OpAMD64VPMOVVec8x64ToM,
		ssa.OpAMD64VPMOVVec16x8ToM,
		ssa.OpAMD64VPMOVVec16x16ToM,
		ssa.OpAMD64VPMOVVec16x32ToM,
		ssa.OpAMD64VPMOVVec32x4ToM,
		ssa.OpAMD64VPMOVVec32x8ToM,
		ssa.OpAMD64VPMOVVec32x16ToM,
		ssa.OpAMD64VPMOVVec64x2ToM,
		ssa.OpAMD64VPMOVVec64x4ToM,
		ssa.OpAMD64VPMOVVec64x8ToM:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = simdReg(v.Args[0])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpAMD64KMOVQk, ssa.OpAMD64KMOVDk, ssa.OpAMD64KMOVWk, ssa.OpAMD64KMOVBk,
		ssa.OpAMD64KMOVQi, ssa.OpAMD64KMOVDi, ssa.OpAMD64KMOVWi, ssa.OpAMD64KMOVBi:
		// See also ssa.OpAMD64KMOVQload
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
	case ssa.OpAMD64VPTEST:
		// Some instructions setting flags put their second operand into the destination reg.
		// See also CMP[BWDQ].
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = simdReg(v.Args[0])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v.Args[1])

	default:
		if !ssaGenSIMDValue(s, v) {
			v.Fatalf("genValue not implemented: %s", v.LongString())
		}
	}
}

// zeroX15 zeroes the X15 register.
func zeroX15(s *ssagen.State) {
	vxorps := func(s *ssagen.State) {
		p := s.Prog(x86.AVXORPS)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = x86.REG_X15
		p.AddRestSourceReg(x86.REG_X15)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = x86.REG_X15
	}
	if buildcfg.GOAMD64 >= 3 {
		vxorps(s)
		return
	}
	// AVX may not be available, check before zeroing the high bits.
	p := s.Prog(x86.ACMPB)
	p.From.Type = obj.TYPE_MEM
	p.From.Name = obj.NAME_EXTERN
	p.From.Sym = ir.Syms.X86HasAVX
	p.To.Type = obj.TYPE_CONST
	p.To.Offset = 1
	jmp := s.Prog(x86.AJNE)
	jmp.To.Type = obj.TYPE_BRANCH
	vxorps(s)
	sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
	jmp.To.SetTarget(sse)
}

// Example instruction: VRSQRTPS X1, X1
func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[0])
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPSUBD X1, X2, X3
func simdV21(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	// Vector register operands follow a right-to-left order,
	// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
	p.From.Reg = simdReg(v.Args[1])
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// This function accommodates the shift operations.
// The 2nd arg is an XMM register, and this function merely checks that.
// Example instruction: VPSLLQ Z1, X1, Z2
func simdVfpv(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	// Vector register operands follow a right-to-left order,
	// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
	p.From.Reg = v.Args[1].Reg()
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPCMPEQW Z26, Z30, K4
func simdV2k(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[1])
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = maskReg(v)
	return p
}

// Example instruction: VPMINUQ X21, X3, K3, X31
func simdV2kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[1])
	p.AddRestSourceReg(simdReg(v.Args[0]))
	// These "simd*" functions assume that
	// any "K" register that serves as the write-mask
	// or "predicate" for predicated AVX512 instructions
	// sits right at the end of the operand list.
	// TODO: verify this assumption.
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPABSB X1, X2, K3 (merging-masking)
func simdV2kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[1])
	// These "simd*" functions assume that
	// any "K" register that serves as the write-mask
	// or "predicate" for predicated AVX512 instructions
	// sits right at the end of the operand list.
	// TODO: verify this assumption.
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// This function accommodates the shift operations.
// The 2nd arg is an XMM register, and this function merely checks that.
// Example instruction: VPSLLQ Z1, X1, K1, Z2
func simdVfpkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = v.Args[1].Reg()
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPCMPEQW Z26, Z30, K1, K4
func simdV2kk(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[1])
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = maskReg(v)
	return p
}

// Example instruction: VPOPCNTB X14, K4, X16
func simdVkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[0])
	p.AddRestSourceReg(maskReg(v.Args[1]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VROUNDPD $7, X2, X2
func simdV11Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VREDUCEPD $126, X1, K3, X31
func simdVkvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[1]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VCMPPS $7, X2, X9, X2
func simdV21Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPINSRB $3, DX, X0, X0
func simdVgpvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(v.Args[1].Reg())
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPCMPD $1, Z1, Z2, K1
func simdV2kImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = maskReg(v)
	return p
}

// Example instruction: VPCMPD $1, Z1, Z2, K2, K1
func simdV2kkImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = maskReg(v)
	return p
}

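// simdV2kvImm8 emits an imm8-carrying op with two vector sources, a K
// write-mask, and a vector destination, mirroring simdV2kv; the operand
// order below follows the same mask-at-the-end convention (the original
// carries no example instruction for this shape).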
func simdV2kvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VFMADD213PD Z2, Z1, Z0
func simdV31ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[2])
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

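// simdV31ResultInArg0Imm8 is the imm8 variant of simdV31ResultInArg0:
// an immediate followed by the two non-result vector sources, with the
// result written to the register shared with Args[0] (shape taken from
// the code below; the original has no example instruction here).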
func simdV31ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST

	p.AddRestSourceReg(simdReg(v.Args[2]))
	p.AddRestSourceReg(simdReg(v.Args[1]))
	// p.AddRestSourceReg(x86.REG_K0)
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// simdV31loadResultInArg0Imm8 handles patterns such as
// (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem).
func simdV31loadResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	sc := v.AuxValAndOff()
	p := s.Prog(v.Op.Asm())

	p.From.Type = obj.TYPE_CONST
	p.From.Offset = sc.Val64()

	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[2].Reg()}
	ssagen.AddAux2(&m, v, sc.Off64())
	p.AddRestSource(m)

	p.AddRestSourceReg(simdReg(v.Args[1]))
	// Destination register; for resultInArg0 ops this is the register
	// already holding Args[0].
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VFMADD213PD Z2, Z1, K1, Z0
func simdV3kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[2])
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(maskReg(v.Args[3]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

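// simdVgpImm8 emits an imm8 plus a vector source, writing the result to a
// general-purpose register; judging from the operand shape this serves
// VPEXTR*-style element extracts, though the original gives no example.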
func simdVgpImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = v.Reg()
	return p
}

// Currently unused
func simdV31(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[2])
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Currently unused
func simdV3kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[2])
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[3]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VRCP14PS (DI), K6, X22
func simdVkvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = v.Args[0].Reg()
	ssagen.AddAux(&p.From, v)
	p.AddRestSourceReg(maskReg(v.Args[1]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPSLLVD (DX), X7, X18
func simdV21load(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = v.Args[1].Reg()
	ssagen.AddAux(&p.From, v)
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPDPWSSD (SI), X24, X18
func simdV31loadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = v.Args[2].Reg()
	ssagen.AddAux(&p.From, v)
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPDPWSSD (SI), X24, K1, X18
func simdV3kvloadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = v.Args[2].Reg()
	ssagen.AddAux(&p.From, v)
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.AddRestSourceReg(maskReg(v.Args[3]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPSLLVD (SI), X1, K1, X2
func simdV2kvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = v.Args[1].Reg()
	ssagen.AddAux(&p.From, v)
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPCMPEQD (SI), X1, K1
func simdV2kload(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = v.Args[1].Reg()
	ssagen.AddAux(&p.From, v)
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = maskReg(v)
	return p
}

// Example instruction: VCVTTPS2DQ (BX), X2
func simdV11load(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = v.Args[0].Reg()
	ssagen.AddAux(&p.From, v)
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPSHUFD $7, (BX), X11
func simdV11loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	sc := v.AuxValAndOff()
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_CONST
	p.From.Offset = sc.Val64()
	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
	ssagen.AddAux2(&m, v, sc.Off64())
	p.AddRestSource(m)
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPRORD $81, -15(R14), K7, Y1
func simdVkvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	sc := v.AuxValAndOff()
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_CONST
	p.From.Offset = sc.Val64()
	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
	ssagen.AddAux2(&m, v, sc.Off64())
	p.AddRestSource(m)
	p.AddRestSourceReg(maskReg(v.Args[1]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VPSHLDD $82, 7(SI), Y21, Y3
func simdV21loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	sc := v.AuxValAndOff()
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_CONST
	p.From.Offset = sc.Val64()
	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
	ssagen.AddAux2(&m, v, sc.Off64())
	p.AddRestSource(m)
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: VCMPPS $81, -7(DI), Y16, K3
func simdV2kloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	sc := v.AuxValAndOff()
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_CONST
	p.From.Offset = sc.Val64()
	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
	ssagen.AddAux2(&m, v, sc.Off64())
	p.AddRestSource(m)
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = maskReg(v)
	return p
}

// Example instruction: VCMPPS $81, -7(DI), Y16, K1, K3
func simdV2kkloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	sc := v.AuxValAndOff()
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_CONST
	p.From.Offset = sc.Val64()
	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
	ssagen.AddAux2(&m, v, sc.Off64())
	p.AddRestSource(m)
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = maskReg(v)
	return p
}

// Example instruction: VGF2P8AFFINEINVQB $64, -17(BP), X31, K3, X26
func simdV2kvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	sc := v.AuxValAndOff()
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_CONST
	p.From.Offset = sc.Val64()
	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
	ssagen.AddAux2(&m, v, sc.Off64())
	p.AddRestSource(m)
	p.AddRestSourceReg(simdReg(v.Args[0]))
	p.AddRestSourceReg(maskReg(v.Args[2]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: SHA1NEXTE X2, X2
func simdV21ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Type = obj.TYPE_REG
	p.From.Reg = simdReg(v.Args[1])
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: SHA1RNDS4 $1, X2, X2
func simdV21ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	p := s.Prog(v.Op.Asm())
	p.From.Offset = int64(v.AuxUInt8())
	p.From.Type = obj.TYPE_CONST
	p.AddRestSourceReg(simdReg(v.Args[1]))
	p.To.Type = obj.TYPE_REG
	p.To.Reg = simdReg(v)
	return p
}

// Example instruction: SHA256RNDS2 X0, X11, X2
func simdV31x0AtIn2ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	return simdV31ResultInArg0(s, v)
}

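// blockJump gives, for each conditional block kind, the jump taken to reach
// Succs[0] (asm) and the inverted jump used when Succs[0] is the fallthrough
// (invasm).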
var blockJump = [...]struct {
	asm, invasm obj.As
}{
	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
	ssa.BlockAMD64OS:  {x86.AJOS, x86.AJOC},
	ssa.BlockAMD64OC:  {x86.AJOC, x86.AJOS},
	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
}

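// eqfJumps and nefJumps give the two-jump sequences used for the EQF/NEF
// blocks: an unordered (NaN) comparison sets PF, so floating-point equality
// requires ZF set with PF clear, and inequality is the complement.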
var eqfJumps = [2][2]ssagen.IndexJump{
	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
}
var nefJumps = [2][2]ssagen.IndexJump{
	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
}

func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
	switch b.Kind {
	case ssa.BlockPlain, ssa.BlockDefer:
		if b.Succs[0].Block() != next {
			p := s.Prog(obj.AJMP)
			p.To.Type = obj.TYPE_BRANCH
			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
		}
	case ssa.BlockExit, ssa.BlockRetJmp:
	case ssa.BlockRet:
		s.Prog(obj.ARET)

	case ssa.BlockAMD64EQF:
		s.CombJump(b, next, &eqfJumps)

	case ssa.BlockAMD64NEF:
		s.CombJump(b, next, &nefJumps)

	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
		jmp := blockJump[b.Kind]
		switch next {
		case b.Succs[0].Block():
			s.Br(jmp.invasm, b.Succs[1].Block())
		case b.Succs[1].Block():
			s.Br(jmp.asm, b.Succs[0].Block())
		default:
			if b.Likely != ssa.BranchUnlikely {
				s.Br(jmp.asm, b.Succs[0].Block())
				s.Br(obj.AJMP, b.Succs[1].Block())
			} else {
				s.Br(jmp.invasm, b.Succs[1].Block())
				s.Br(obj.AJMP, b.Succs[0].Block())
			}
		}

	case ssa.BlockAMD64JUMPTABLE:
		// JMP *(TABLE)(INDEX*8)
		p := s.Prog(obj.AJMP)
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = b.Controls[1].Reg()
		p.To.Index = b.Controls[0].Reg()
		p.To.Scale = 8
		// Save jump tables for later resolution of the target blocks.
		s.JumpTables = append(s.JumpTables, b)

	default:
		b.Fatalf("branch not implemented: %s", b.LongString())
	}
}

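// loadRegResult loads a result value from its stack slot (an auto at
// n.FrameOffset()+off) into reg, using a width-appropriate move for that
// register class.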
func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
	p := s.Prog(loadByRegWidth(reg, t.Size()))
	p.From.Type = obj.TYPE_MEM
	p.From.Name = obj.NAME_AUTO
	p.From.Sym = n.Linksym()
	p.From.Offset = n.FrameOffset() + off
	p.To.Type = obj.TYPE_REG
	p.To.Reg = reg
	return p
}

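// spillArgReg stores an argument register into the parameter's stack slot,
// appending the store after p; the spill is marked as not a statement.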
func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
	p = pp.Append(p, storeByRegWidth(reg, t.Size()), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
	p.To.Name = obj.NAME_PARAM
	p.To.Sym = n.Linksym()
	p.Pos = p.Pos.WithNotStmt()
	return p
}

// zero 16 bytes at reg+off.
func zero16(s *ssagen.State, reg int16, off int64) {
	// MOVUPS X15, off(ptrReg)
	p := s.Prog(x86.AMOVUPS)
	p.From.Type = obj.TYPE_REG
	p.From.Reg = x86.REG_X15
	p.To.Type = obj.TYPE_MEM
	p.To.Reg = reg
	p.To.Offset = off
}

// move 16 bytes from src+off to dst+off using temporary register tmp.
func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
	// MOVUPS off(srcReg), tmpReg
	// MOVUPS tmpReg, off(dstReg)
	p := s.Prog(x86.AMOVUPS)
	p.From.Type = obj.TYPE_MEM
	p.From.Reg = src
	p.From.Offset = off
	p.To.Type = obj.TYPE_REG
	p.To.Reg = tmp
	p = s.Prog(x86.AMOVUPS)
	p.From.Type = obj.TYPE_REG
	p.From.Reg = tmp
	p.To.Type = obj.TYPE_MEM
	p.To.Reg = dst
	p.To.Offset = off
}

// XXX maybe make this part of v.Reg?
// On the other hand, it is architecture-specific.
func simdReg(v *ssa.Value) int16 {
	t := v.Type
	if !t.IsSIMD() {
		base.Fatalf("simdReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
	}
	return simdRegBySize(v.Reg(), t.Size())
}

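// simdRegBySize maps an X register number to the X, Y, or Z register of the
// given vector size in bytes (16, 32, or 64).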
func simdRegBySize(reg int16, size int64) int16 {
	switch size {
	case 16:
		return reg
	case 32:
		return reg + (x86.REG_Y0 - x86.REG_X0)
	case 64:
		return reg + (x86.REG_Z0 - x86.REG_X0)
	}
	panic("simdRegBySize: bad size")
}

// XXX k mask
func maskReg(v *ssa.Value) int16 {
	t := v.Type
	if !t.IsSIMD() {
		base.Fatalf("maskReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
	}
	switch t.Size() {
	case 8:
		return v.Reg()
	}
	panic("unreachable")
}

// XXX k mask + vec
func simdOrMaskReg(v *ssa.Value) int16 {
	t := v.Type
	if t.Size() <= 8 {
		return maskReg(v)
	}
	return simdReg(v)
}

// XXX this is used for shift operations only.
// regalloc will issue OpCopy with incorrect type, but the assigned
// register should be correct, and this function is merely checking
// the sanity of this part.
func simdCheckRegOnly(v *ssa.Value, regStart, regEnd int16) int16 {
	if v.Reg() > regEnd || v.Reg() < regStart {
		panic("simdCheckRegOnly: not the desired register")
	}
	return v.Reg()
}