// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
	"cmd/compile/internal/abi"
	"cmd/compile/internal/base"
	"cmd/compile/internal/ir"
	"cmd/compile/internal/types"
	"cmd/internal/obj"
	"cmd/internal/src"
)

// A Config holds readonly compilation information.
// It is created once, early during compilation,
// and shared across all compilations.
type Config struct {
	arch           string // "amd64", etc.
	PtrSize        int64  // 4 or 8; copy of cmd/internal/sys.Arch.PtrSize
	RegSize        int64  // 4 or 8; copy of cmd/internal/sys.Arch.RegSize
	Types          Types
	lowerBlock     blockRewriter  // block lowering function, first round
	lowerValue     valueRewriter  // value lowering function, first round
	lateLowerBlock blockRewriter  // block lowering function that needs to be run after the first round; only used on some architectures
	lateLowerValue valueRewriter  // value lowering function that needs to be run after the first round; only used on some architectures
	splitLoad      valueRewriter  // function for splitting merged load ops; only used on some architectures
	registers      []Register     // machine registers
	gpRegMask      regMask        // general purpose integer register mask
	fpRegMask      regMask        // floating point register mask
	fp32RegMask    regMask        // 32-bit floating point register mask
	fp64RegMask    regMask        // 64-bit floating point register mask
	specialRegMask regMask        // special register mask
	intParamRegs   []int8         // register numbers of integer param (in/out) registers
	floatParamRegs []int8         // register numbers of floating param (in/out) registers
	ABI1           *abi.ABIConfig // "ABIInternal" under development // TODO change comment when this becomes current
	ABI0           *abi.ABIConfig
	FPReg          int8      // register number of frame pointer, -1 if not used
	LinkReg        int8      // register number of link register if it is a general purpose register, -1 if not used
	hasGReg        bool      // has hardware g register
	ctxt           *obj.Link // Generic arch information
	optimize       bool      // Do optimization
	useAvg         bool      // Use optimizations that need Avg* operations
	useHmul        bool      // Use optimizations that need Hmul* operations
	SoftFloat      bool      // use software floating point
	Race           bool      // race detector enabled
	BigEndian      bool      // target is big-endian
	unalignedOK    bool      // Unaligned loads/stores are ok
	haveBswap64    bool      // architecture implements Bswap64
	haveBswap32    bool      // architecture implements Bswap32
	haveBswap16    bool      // architecture implements Bswap16
	haveCondSelect bool      // architecture implements CondSelect

	// mulRecipes[x] = function to build v * x from v.
	mulRecipes map[int64]mulRecipe
}

type mulRecipe struct {
	cost  int
	build func(*Value, *Value) *Value // build(m, v) returns v * x built at m.
}
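
// For example, on amd64 mulRecipes[5] holds a recipe that builds
// v + 4*v with a single LEAQ4 (registered in buildRecipes below);
// rewrite rules consult this table to strength-reduce multiplication
// by a constant.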

type (
	blockRewriter func(*Block) bool
	valueRewriter func(*Value) bool
)

type Types struct {
	Bool       *types.Type
	Int8       *types.Type
	Int16      *types.Type
	Int32      *types.Type
	Int64      *types.Type
	UInt8      *types.Type
	UInt16     *types.Type
	UInt32     *types.Type
	UInt64     *types.Type
	Int        *types.Type
	Float32    *types.Type
	Float64    *types.Type
	UInt       *types.Type
	Uintptr    *types.Type
	String     *types.Type
	BytePtr    *types.Type // TODO: use unsafe.Pointer instead?
	Int32Ptr   *types.Type
	UInt32Ptr  *types.Type
	IntPtr     *types.Type
	UintptrPtr *types.Type
	Float32Ptr *types.Type
	Float64Ptr *types.Type
	BytePtrPtr *types.Type
}

// NewTypes creates and populates a Types.
func NewTypes() *Types {
	t := new(Types)
	t.SetTypPtrs()
	return t
}

// SetTypPtrs populates t.
func (t *Types) SetTypPtrs() {
	t.Bool = types.Types[types.TBOOL]
	t.Int8 = types.Types[types.TINT8]
	t.Int16 = types.Types[types.TINT16]
	t.Int32 = types.Types[types.TINT32]
	t.Int64 = types.Types[types.TINT64]
	t.UInt8 = types.Types[types.TUINT8]
	t.UInt16 = types.Types[types.TUINT16]
	t.UInt32 = types.Types[types.TUINT32]
	t.UInt64 = types.Types[types.TUINT64]
	t.Int = types.Types[types.TINT]
	t.Float32 = types.Types[types.TFLOAT32]
	t.Float64 = types.Types[types.TFLOAT64]
	t.UInt = types.Types[types.TUINT]
	t.Uintptr = types.Types[types.TUINTPTR]
	t.String = types.Types[types.TSTRING]
	t.BytePtr = types.NewPtr(types.Types[types.TUINT8])
	t.Int32Ptr = types.NewPtr(types.Types[types.TINT32])
	t.UInt32Ptr = types.NewPtr(types.Types[types.TUINT32])
	t.IntPtr = types.NewPtr(types.Types[types.TINT])
	t.UintptrPtr = types.NewPtr(types.Types[types.TUINTPTR])
	t.Float32Ptr = types.NewPtr(types.Types[types.TFLOAT32])
	t.Float64Ptr = types.NewPtr(types.Types[types.TFLOAT64])
	t.BytePtrPtr = types.NewPtr(types.NewPtr(types.Types[types.TUINT8]))
}

type Logger interface {
	// Logf logs a message from the compiler.
	Logf(string, ...interface{})

	// Log reports whether logging is not a no-op;
	// some logging calls account for more than a few heap allocations.
	Log() bool

	// Fatalf reports a compiler error and exits.
	Fatalf(pos src.XPos, msg string, args ...interface{})

	// Warnl writes compiler messages in the form expected by "errorcheck" tests.
	Warnl(pos src.XPos, fmt_ string, args ...interface{})

	// Debug_checknil forwards the Debug flags from gc.
	Debug_checknil() bool
}

type Frontend interface {
	Logger

	// StringData returns a symbol pointing to the given string's contents.
	StringData(string) *obj.LSym

	// Given the name for a compound type, SplitSlot returns the name
	// we should use for the parts of that compound type.
	SplitSlot(parent *LocalSlot, suffix string, offset int64, t *types.Type) LocalSlot

	// Syslook returns a symbol of the runtime function/variable with the
	// given name.
	Syslook(string) *obj.LSym

	// UseWriteBarrier reports whether the write barrier is enabled.
	UseWriteBarrier() bool

	// Func returns the ir.Func of the function being compiled.
	Func() *ir.Func
}

// NewConfig returns a new configuration object for the given architecture.
func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat bool) *Config {
	c := &Config{arch: arch, Types: types}
	c.useAvg = true
	c.useHmul = true
	switch arch {
	case "amd64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockAMD64
		c.lowerValue = rewriteValueAMD64
		c.lateLowerBlock = rewriteBlockAMD64latelower
		c.lateLowerValue = rewriteValueAMD64latelower
		c.splitLoad = rewriteValueAMD64splitload
		c.registers = registersAMD64[:]
		c.gpRegMask = gpRegMaskAMD64
		c.fpRegMask = fpRegMaskAMD64
		c.specialRegMask = specialRegMaskAMD64
		c.intParamRegs = paramIntRegAMD64
		c.floatParamRegs = paramFloatRegAMD64
		c.FPReg = framepointerRegAMD64
		c.LinkReg = linkRegAMD64
		c.hasGReg = true
		c.unalignedOK = true
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true
		c.haveCondSelect = true
	case "386":
		c.PtrSize = 4
		c.RegSize = 4
		c.lowerBlock = rewriteBlock386
		c.lowerValue = rewriteValue386
		c.splitLoad = rewriteValue386splitload
		c.registers = registers386[:]
		c.gpRegMask = gpRegMask386
		c.fpRegMask = fpRegMask386
		c.FPReg = framepointerReg386
		c.LinkReg = linkReg386
		c.hasGReg = false
		c.unalignedOK = true
		c.haveBswap32 = true
		c.haveBswap16 = true
	case "arm":
		c.PtrSize = 4
		c.RegSize = 4
		c.lowerBlock = rewriteBlockARM
		c.lowerValue = rewriteValueARM
		c.registers = registersARM[:]
		c.gpRegMask = gpRegMaskARM
		c.fpRegMask = fpRegMaskARM
		c.FPReg = framepointerRegARM
		c.LinkReg = linkRegARM
		c.hasGReg = true
	case "arm64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockARM64
		c.lowerValue = rewriteValueARM64
		c.lateLowerBlock = rewriteBlockARM64latelower
		c.lateLowerValue = rewriteValueARM64latelower
		c.registers = registersARM64[:]
		c.gpRegMask = gpRegMaskARM64
		c.fpRegMask = fpRegMaskARM64
		c.intParamRegs = paramIntRegARM64
		c.floatParamRegs = paramFloatRegARM64
		c.FPReg = framepointerRegARM64
		c.LinkReg = linkRegARM64
		c.hasGReg = true
		c.unalignedOK = true
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true
		c.haveCondSelect = true
	case "ppc64":
		c.BigEndian = true
		fallthrough
	case "ppc64le":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockPPC64
		c.lowerValue = rewriteValuePPC64
		c.lateLowerBlock = rewriteBlockPPC64latelower
		c.lateLowerValue = rewriteValuePPC64latelower
		c.registers = registersPPC64[:]
		c.gpRegMask = gpRegMaskPPC64
		c.fpRegMask = fpRegMaskPPC64
		c.specialRegMask = specialRegMaskPPC64
		c.intParamRegs = paramIntRegPPC64
		c.floatParamRegs = paramFloatRegPPC64
		c.FPReg = framepointerRegPPC64
		c.LinkReg = linkRegPPC64
		c.hasGReg = true
		c.unalignedOK = true
		// Note: ppc64 has register bswap ops only when GOPPC64>=10.
		// But it has bswap+load and bswap+store ops for all ppc64 variants.
		// That is the sense we're using them here - they are only used
		// in contexts where they can be merged with a load or store.
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true
		c.haveCondSelect = true
	case "mips64":
		c.BigEndian = true
		fallthrough
	case "mips64le":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockMIPS64
		c.lowerValue = rewriteValueMIPS64
		c.lateLowerBlock = rewriteBlockMIPS64latelower
		c.lateLowerValue = rewriteValueMIPS64latelower
		c.registers = registersMIPS64[:]
		c.gpRegMask = gpRegMaskMIPS64
		c.fpRegMask = fpRegMaskMIPS64
		c.specialRegMask = specialRegMaskMIPS64
		c.FPReg = framepointerRegMIPS64
		c.LinkReg = linkRegMIPS64
		c.hasGReg = true
	case "loong64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockLOONG64
		c.lowerValue = rewriteValueLOONG64
		c.lateLowerBlock = rewriteBlockLOONG64latelower
		c.lateLowerValue = rewriteValueLOONG64latelower
		c.registers = registersLOONG64[:]
		c.gpRegMask = gpRegMaskLOONG64
		c.fpRegMask = fpRegMaskLOONG64
		c.intParamRegs = paramIntRegLOONG64
		c.floatParamRegs = paramFloatRegLOONG64
		c.FPReg = framepointerRegLOONG64
		c.LinkReg = linkRegLOONG64
		c.hasGReg = true
		c.unalignedOK = true
		c.haveCondSelect = true
	case "s390x":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockS390X
		c.lowerValue = rewriteValueS390X
		c.registers = registersS390X[:]
		c.gpRegMask = gpRegMaskS390X
		c.fpRegMask = fpRegMaskS390X
		c.FPReg = framepointerRegS390X
		c.LinkReg = linkRegS390X
		c.hasGReg = true
		c.BigEndian = true
		c.unalignedOK = true
		c.haveBswap64 = true
		c.haveBswap32 = true
		c.haveBswap16 = true // only for loads&stores, see ppc64 comment
	case "mips":
		c.BigEndian = true
		fallthrough
	case "mipsle":
		c.PtrSize = 4
		c.RegSize = 4
		c.lowerBlock = rewriteBlockMIPS
		c.lowerValue = rewriteValueMIPS
		c.registers = registersMIPS[:]
		c.gpRegMask = gpRegMaskMIPS
		c.fpRegMask = fpRegMaskMIPS
		c.specialRegMask = specialRegMaskMIPS
		c.FPReg = framepointerRegMIPS
		c.LinkReg = linkRegMIPS
		c.hasGReg = true
	case "riscv64":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockRISCV64
		c.lowerValue = rewriteValueRISCV64
		c.lateLowerBlock = rewriteBlockRISCV64latelower
		c.lateLowerValue = rewriteValueRISCV64latelower
		c.registers = registersRISCV64[:]
		c.gpRegMask = gpRegMaskRISCV64
		c.fpRegMask = fpRegMaskRISCV64
		c.intParamRegs = paramIntRegRISCV64
		c.floatParamRegs = paramFloatRegRISCV64
		c.FPReg = framepointerRegRISCV64
		c.hasGReg = true
	case "wasm":
		c.PtrSize = 8
		c.RegSize = 8
		c.lowerBlock = rewriteBlockWasm
		c.lowerValue = rewriteValueWasm
		c.registers = registersWasm[:]
		c.gpRegMask = gpRegMaskWasm
		c.fpRegMask = fpRegMaskWasm
		c.fp32RegMask = fp32RegMaskWasm
		c.fp64RegMask = fp64RegMaskWasm
		c.FPReg = framepointerRegWasm
		c.LinkReg = linkRegWasm
		c.hasGReg = true
		c.useAvg = false
		c.useHmul = false
		c.unalignedOK = true
		c.haveCondSelect = true
	default:
		ctxt.Diag("arch %s not implemented", arch)
	}
	c.ctxt = ctxt
	c.optimize = optimize
	c.SoftFloat = softfloat
	if softfloat {
		c.floatParamRegs = nil // no FP registers in softfloat mode
	}

	c.ABI0 = abi.NewABIConfig(0, 0, ctxt.Arch.FixedFrameSize, 0)
	c.ABI1 = abi.NewABIConfig(len(c.intParamRegs), len(c.floatParamRegs), ctxt.Arch.FixedFrameSize, 1)

	if ctxt.Flag_shared {
		// LoweredWB is secretly a CALL and CALLs on 386 in
		// shared mode get rewritten by obj6.go to go through
		// the GOT, which clobbers BX.
		opcodeTable[Op386LoweredWB].reg.clobbers |= 1 << 3 // BX
	}

	c.buildRecipes(arch)

	return c
}
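
// Typical construction, roughly as done in cmd/compile/internal/ssagen
// (a sketch; the exact call site may differ across Go versions):
//
//	ssaConfig = ssa.NewConfig(base.Ctxt.Arch.Name, *types_, base.Ctxt,
//		base.Flag.N == 0, Arch.SoftFloat)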

func (c *Config) Ctxt() *obj.Link { return c.ctxt }

// haveByteSwap reports whether the architecture supports a byte swap
// of the given size (8, 4, or 2 bytes).
func (c *Config) haveByteSwap(size int64) bool {
	switch size {
	case 8:
		return c.haveBswap64
	case 4:
		return c.haveBswap32
	case 2:
		return c.haveBswap16
	default:
		base.Fatalf("bad size %d\n", size)
		return false
	}
}

func (c *Config) buildRecipes(arch string) {
	// Information for strength-reducing multiplies.
	type linearCombo struct {
		// we can compute a*x+b*y in one instruction
		a, b int64
		// cost, in arbitrary units (tenths of cycles, usually)
		cost int
		// builds SSA value for a*x+b*y. Use the position
		// information from m.
		build func(m, x, y *Value) *Value
	}
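
	// For example, the amd64 entry r(1, 2, 10, ...) below registers
	// LEAQ2, which computes x + 2*y: a=1, b=2, at cost 10.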

	// List all the linear combination instructions we have.
	var linearCombos []linearCombo
	r := func(a, b int64, cost int, build func(m, x, y *Value) *Value) {
		linearCombos = append(linearCombos, linearCombo{a: a, b: b, cost: cost, build: build})
	}
	var mulCost int
	switch arch {
	case "amd64":
		// Assumes the following costs, taken from https://gmplib.org/~tege/x86-timing.pdf:
		// 1 - addq, shlq, leaq, negq, subq
		// 3 - imulq
		// These costs limit the rewrites to two instructions.
		// Operations which have to happen in place (and thus
		// may require a reg-reg move) score slightly higher.
		mulCost = 30
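		// For instance, v*9 becomes a single LEAQ8 (v + 8*v) at
		// cost 10, well under mulCost, so imulq is never used for it.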
		// add
		r(1, 1, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64ADDQ, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64ADDL
				}
				return v
			})
		// neg
		r(-1, 0, 11,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue1(m.Pos, OpAMD64NEGQ, m.Type, x)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64NEGL
				}
				return v
			})
		// sub
		r(1, -1, 11,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64SUBQ, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64SUBL
				}
				return v
			})
		// lea
		r(1, 2, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ2, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64LEAL2
				}
				return v
			})
		r(1, 4, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ4, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64LEAL4
				}
				return v
			})
		r(1, 8, 10,
			func(m, x, y *Value) *Value {
				v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ8, m.Type, x, y)
				if m.Type.Size() == 4 {
					v.Op = OpAMD64LEAL8
				}
				return v
			})
		// regular shifts
		for i := 2; i < 64; i++ {
			r(1<<i, 0, 11,
				func(m, x, y *Value) *Value {
					v := m.Block.NewValue1I(m.Pos, OpAMD64SHLQconst, m.Type, int64(i), x)
					if m.Type.Size() == 4 {
						v.Op = OpAMD64SHLLconst
					}
					return v
				})
		}

	case "arm64":
		// Rationale (for M2 ultra):
		// - multiply is 3 cycles.
		// - add/neg/sub/shift are 1 cycle.
		// - add/neg/sub+shiftLL are 2 cycles.
		// We break ties against the multiply because using a
		// multiply also needs to load the constant into a register.
		// (It's 3 cycles and 2 instructions either way, but the
		// linear combo one might use 1 less register.)
		// The multiply constant might get lifted out of a loop though. Hmm....
		// Other arm64 chips have different tradeoffs.
		// Some chips' add+shift instructions are 1 cycle for shifts up to 4
		// and 2 cycles for shifts bigger than 4. So weight the larger shifts
		// a bit more.
		// TODO: figure out a happy medium.
		mulCost = 35
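		// For instance, v*3 becomes ADDshiftLL (x + y<<1) at cost 20,
		// which beats the multiply's cost of 35.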
		// add
		r(1, 1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpARM64ADD, m.Type, x, y)
			})
		// neg
		r(-1, 0, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue1(m.Pos, OpARM64NEG, m.Type, x)
			})
		// sub
		r(1, -1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpARM64SUB, m.Type, x, y)
			})
		// regular shifts
		for i := 1; i < 64; i++ {
			c := 10
			if i == 1 {
				// Prefer x<<1 over x+x.
				// Note that we eventually reverse this decision in ARM64latelower.rules,
				// but this makes shift combining rules in ARM64.rules simpler.
				c--
			}
			r(1<<i, 0, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue1I(m.Pos, OpARM64SLLconst, m.Type, int64(i), x)
				})
		}
		// ADDshiftLL
		for i := 1; i < 64; i++ {
			c := 20
			if i > 4 {
				c++
			}
			r(1, 1<<i, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue2I(m.Pos, OpARM64ADDshiftLL, m.Type, int64(i), x, y)
				})
		}
		// NEGshiftLL
		for i := 1; i < 64; i++ {
			c := 20
			if i > 4 {
				c++
			}
			r(-1<<i, 0, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue1I(m.Pos, OpARM64NEGshiftLL, m.Type, int64(i), x)
				})
		}
		// SUBshiftLL
		for i := 1; i < 64; i++ {
			c := 20
			if i > 4 {
				c++
			}
			r(1, -1<<i, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue2I(m.Pos, OpARM64SUBshiftLL, m.Type, int64(i), x, y)
				})
		}
case "loong64":
|
|
// - multiply is 4 cycles.
|
|
// - add/sub/shift/alsl are 1 cycle.
|
|
// On loong64, using a multiply also needs to load the constant into a register.
|
|
// TODO: figure out a happy medium.
|
|
mulCost = 45
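
		// For instance, v*5 becomes a single ADDshiftLLV (v + v<<2)
		// at cost 10, well under the multiply's 45.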

		// add
		r(1, 1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpLOONG64ADDV, m.Type, x, y)
			})
		// neg
		r(-1, 0, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue1(m.Pos, OpLOONG64NEGV, m.Type, x)
			})
		// sub
		r(1, -1, 10,
			func(m, x, y *Value) *Value {
				return m.Block.NewValue2(m.Pos, OpLOONG64SUBV, m.Type, x, y)
			})

		// regular shifts
		for i := 1; i < 64; i++ {
			c := 10
			if i == 1 {
				// Prefer x<<1 over x+x.
				// Note that we eventually reverse this decision in LOONG64latelower.rules,
				// but this makes shift combining rules in LOONG64.rules simpler.
				c--
			}
			r(1<<i, 0, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue1I(m.Pos, OpLOONG64SLLVconst, m.Type, int64(i), x)
				})
		}

		// ADDshiftLLV
		for i := 1; i < 5; i++ {
			c := 10
			r(1, 1<<i, c,
				func(m, x, y *Value) *Value {
					return m.Block.NewValue2I(m.Pos, OpLOONG64ADDshiftLLV, m.Type, int64(i), x, y)
				})
		}
	}

	c.mulRecipes = map[int64]mulRecipe{}

	// Single-instruction recipes.
	// The only option for the input value(s) is v.
	for _, combo := range linearCombos {
		x := combo.a + combo.b
		cost := combo.cost
		old := c.mulRecipes[x]
		if (old.build == nil || cost < old.cost) && cost < mulCost {
			c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
				return combo.build(m, v, v)
			}}
		}
	}
	// Two-instruction recipes.
	// A: Both of the outer's inputs are from the same single-instruction recipe.
	// B: First input is v and the second is from a single-instruction recipe.
	// C: Second input is v and the first is from a single-instruction recipe.
	// A is slightly preferred because it often needs 1 less register, so it
	// goes first.
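	//
	// For example, pattern A can build v*10 on amd64: the inner add
	// gives 2*v and the outer LEAQ4 gives (2*v) + 4*(2*v) = 10*v,
	// for a total cost of 20, under mulCost.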

	// A
	for _, inner := range linearCombos {
		for _, outer := range linearCombos {
			x := (inner.a + inner.b) * (outer.a + outer.b)
			cost := inner.cost + outer.cost
			old := c.mulRecipes[x]
			if (old.build == nil || cost < old.cost) && cost < mulCost {
				c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
					v = inner.build(m, v, v)
					return outer.build(m, v, v)
				}}
			}
		}
	}

	// B
	for _, inner := range linearCombos {
		for _, outer := range linearCombos {
			x := outer.a + outer.b*(inner.a+inner.b)
			cost := inner.cost + outer.cost
			old := c.mulRecipes[x]
			if (old.build == nil || cost < old.cost) && cost < mulCost {
				c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
					return outer.build(m, v, inner.build(m, v, v))
				}}
			}
		}
	}

	// C
	for _, inner := range linearCombos {
		for _, outer := range linearCombos {
			x := outer.a*(inner.a+inner.b) + outer.b
			cost := inner.cost + outer.cost
			old := c.mulRecipes[x]
			if (old.build == nil || cost < old.cost) && cost < mulCost {
				c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
					return outer.build(m, inner.build(m, v, v), v)
				}}
			}
		}
	}

	// Currently we only build three-instruction recipes for loong64.
	if arch == "loong64" {
		// Three-instruction recipes.
		// D: The first and second are both single-instruction recipes,
		// and their results are the third's two inputs.
		// E: A chain: the first's result feeds the second, and the
		// second's result feeds the third.
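		//
		// For example, pattern D can build v*14: the first recipe,
		// SLLVconst(2), gives 4*v; the second, ADDshiftLLV(2), gives
		// 5*v; and the third, ADDshiftLLV(1), computes
		// 4*v + (5*v)<<1 = 14*v, at cost 30, under mulCost (45).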

		// D
		for _, first := range linearCombos {
			for _, second := range linearCombos {
				for _, third := range linearCombos {
					x := third.a*(first.a+first.b) + third.b*(second.a+second.b)
					cost := first.cost + second.cost + third.cost
					old := c.mulRecipes[x]
					if (old.build == nil || cost < old.cost) && cost < mulCost {
						c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
							v1 := first.build(m, v, v)
							v2 := second.build(m, v, v)
							return third.build(m, v1, v2)
						}}
					}
				}
			}
		}

		// E
		for _, first := range linearCombos {
			for _, second := range linearCombos {
				for _, third := range linearCombos {
					x := third.a*(second.a*(first.a+first.b)+second.b) + third.b
					cost := first.cost + second.cost + third.cost
					old := c.mulRecipes[x]
					if (old.build == nil || cost < old.cost) && cost < mulCost {
						c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
							v1 := first.build(m, v, v)
							v2 := second.build(m, v1, v)
							return third.build(m, v2, v)
						}}
					}
				}
			}
		}
	}

	// These cases should be handled specially by rewrite rules.
	// (Otherwise v * 1 would be built as (neg (neg v)).)
	delete(c.mulRecipes, 0)
	delete(c.mulRecipes, 1)

	// Currently:
	// len(c.mulRecipes) == 5984 on arm64
	//                       680 on amd64
	//                      9738 on loong64
	// This function takes ~2.5ms on arm64.
	//println(len(c.mulRecipes))
}