go/src/cmd/compile/internal/ssa/config.go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
"cmd/compile/internal/abi"
"cmd/compile/internal/base"
"cmd/compile/internal/ir"
"cmd/compile/internal/types"
"cmd/internal/obj"
"cmd/internal/src"
)
// A Config holds readonly compilation information.
// It is created once, early during compilation,
// and shared across all compilations.
type Config struct {
arch string // "amd64", etc.
PtrSize int64 // 4 or 8; copy of cmd/internal/sys.Arch.PtrSize
RegSize int64 // 4 or 8; copy of cmd/internal/sys.Arch.RegSize
Types Types
lowerBlock blockRewriter // block lowering function, first round
lowerValue valueRewriter // value lowering function, first round
lateLowerBlock blockRewriter // block lowering function that needs to be run after the first round; only used on some architectures
lateLowerValue valueRewriter // value lowering function that needs to be run after the first round; only used on some architectures
splitLoad valueRewriter // function for splitting merged load ops; only used on some architectures
registers []Register // machine registers
gpRegMask regMask // general purpose integer register mask
fpRegMask regMask // floating point register mask
fp32RegMask regMask // 32-bit floating point register mask
fp64RegMask regMask // 64-bit floating point register mask
specialRegMask regMask // special register mask
intParamRegs []int8 // register numbers of integer param (in/out) registers
floatParamRegs []int8 // register numbers of floating param (in/out) registers
ABI1 *abi.ABIConfig // "ABIInternal" under development // TODO change comment when this becomes current
ABI0 *abi.ABIConfig
FPReg int8 // register number of frame pointer, -1 if not used
LinkReg int8 // register number of link register if it is a general purpose register, -1 if not used
hasGReg bool // has hardware g register
ctxt *obj.Link // Generic arch information
optimize bool // Do optimization
useAvg bool // Use optimizations that need Avg* operations
useHmul bool // Use optimizations that need Hmul* operations
SoftFloat bool // use software floating point (no hardware FP registers)
Race bool // race detector enabled
BigEndian bool // target is big-endian
unalignedOK bool // Unaligned loads/stores are ok
haveBswap64 bool // architecture implements Bswap64
haveBswap32 bool // architecture implements Bswap32
haveBswap16 bool // architecture implements Bswap16
haveCondSelect bool // architecture implements CondSelect
// mulRecipes[x] = function to build v * x from v.
mulRecipes map[int64]mulRecipe
}
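// For example, on amd64 a single LEAQ4 computes v + 4*v, so the recipe
// stored at mulRecipes[5] could look roughly like this (an illustration,
// not the verbatim table entry; see buildRecipes below):
//
//	mulRecipe{cost: 10, build: func(m, v *Value) *Value {
//		return m.Block.NewValue2(m.Pos, OpAMD64LEAQ4, m.Type, v, v)
//	}}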
type mulRecipe struct {
cost int
build func(*Value, *Value) *Value // build(m, v) returns v * x built at m.
}
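// The lower passes drive these rewriters to a fixed point: applyRewrite
// (see rewrite.go) keeps invoking the value and block rewriter over the
// whole function until neither reports a change.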
type (
blockRewriter func(*Block) bool
valueRewriter func(*Value) bool
)
type Types struct {
Bool *types.Type
Int8 *types.Type
Int16 *types.Type
Int32 *types.Type
Int64 *types.Type
UInt8 *types.Type
UInt16 *types.Type
UInt32 *types.Type
UInt64 *types.Type
Int *types.Type
Float32 *types.Type
Float64 *types.Type
UInt *types.Type
Uintptr *types.Type
String *types.Type
BytePtr *types.Type // TODO: use unsafe.Pointer instead?
Int32Ptr *types.Type
UInt32Ptr *types.Type
IntPtr *types.Type
UintptrPtr *types.Type
Float32Ptr *types.Type
Float64Ptr *types.Type
BytePtrPtr *types.Type
}
// NewTypes creates and populates a Types.
func NewTypes() *Types {
t := new(Types)
t.SetTypPtrs()
return t
}
// SetTypPtrs populates t.
func (t *Types) SetTypPtrs() {
t.Bool = types.Types[types.TBOOL]
t.Int8 = types.Types[types.TINT8]
t.Int16 = types.Types[types.TINT16]
t.Int32 = types.Types[types.TINT32]
t.Int64 = types.Types[types.TINT64]
t.UInt8 = types.Types[types.TUINT8]
t.UInt16 = types.Types[types.TUINT16]
t.UInt32 = types.Types[types.TUINT32]
t.UInt64 = types.Types[types.TUINT64]
t.Int = types.Types[types.TINT]
t.Float32 = types.Types[types.TFLOAT32]
t.Float64 = types.Types[types.TFLOAT64]
t.UInt = types.Types[types.TUINT]
t.Uintptr = types.Types[types.TUINTPTR]
t.String = types.Types[types.TSTRING]
t.BytePtr = types.NewPtr(types.Types[types.TUINT8])
t.Int32Ptr = types.NewPtr(types.Types[types.TINT32])
t.UInt32Ptr = types.NewPtr(types.Types[types.TUINT32])
t.IntPtr = types.NewPtr(types.Types[types.TINT])
t.UintptrPtr = types.NewPtr(types.Types[types.TUINTPTR])
t.Float32Ptr = types.NewPtr(types.Types[types.TFLOAT32])
t.Float64Ptr = types.NewPtr(types.Types[types.TFLOAT64])
t.BytePtrPtr = types.NewPtr(types.NewPtr(types.Types[types.TUINT8]))
}
type Logger interface {
// Logf logs a message from the compiler.
Logf(string, ...interface{})
// Log reports whether logging is not a no-op;
// some logging calls account for more than a few heap allocations.
Log() bool
// Fatalf reports a compiler error and exits.
Fatalf(pos src.XPos, msg string, args ...interface{})
// Warnl writes compiler messages in the form expected by "errorcheck" tests.
Warnl(pos src.XPos, fmt_ string, args ...interface{})
// Debug_checknil forwards the corresponding Debug flag from gc.
Debug_checknil() bool
}
type Frontend interface {
Logger
// StringData returns a symbol pointing to the given string's contents.
StringData(string) *obj.LSym
// Given the name for a compound type, returns the name we should use
// for the parts of that compound type.
SplitSlot(parent *LocalSlot, suffix string, offset int64, t *types.Type) LocalSlot
// Syslook returns a symbol of the runtime function/variable with the
// given name.
Syslook(string) *obj.LSym
// UseWriteBarrier reports whether the write barrier is enabled
UseWriteBarrier() bool
// Func returns the ir.Func of the function being compiled.
Func() *ir.Func
}
// NewConfig returns a new configuration object for the given architecture.
func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat bool) *Config {
c := &Config{arch: arch, Types: types}
c.useAvg = true
c.useHmul = true
switch arch {
case "amd64":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockAMD64
c.lowerValue = rewriteValueAMD64
c.lateLowerBlock = rewriteBlockAMD64latelower
c.lateLowerValue = rewriteValueAMD64latelower
c.splitLoad = rewriteValueAMD64splitload
c.registers = registersAMD64[:]
c.gpRegMask = gpRegMaskAMD64
c.fpRegMask = fpRegMaskAMD64
c.specialRegMask = specialRegMaskAMD64
c.intParamRegs = paramIntRegAMD64
c.floatParamRegs = paramFloatRegAMD64
c.FPReg = framepointerRegAMD64
c.LinkReg = linkRegAMD64
c.hasGReg = true
c.unalignedOK = true
c.haveBswap64 = true
c.haveBswap32 = true
c.haveBswap16 = true
c.haveCondSelect = true
case "386":
c.PtrSize = 4
c.RegSize = 4
c.lowerBlock = rewriteBlock386
c.lowerValue = rewriteValue386
c.splitLoad = rewriteValue386splitload
c.registers = registers386[:]
c.gpRegMask = gpRegMask386
c.fpRegMask = fpRegMask386
c.FPReg = framepointerReg386
c.LinkReg = linkReg386
c.hasGReg = false
c.unalignedOK = true
c.haveBswap32 = true
c.haveBswap16 = true
case "arm":
c.PtrSize = 4
c.RegSize = 4
c.lowerBlock = rewriteBlockARM
c.lowerValue = rewriteValueARM
c.registers = registersARM[:]
c.gpRegMask = gpRegMaskARM
c.fpRegMask = fpRegMaskARM
c.FPReg = framepointerRegARM
c.LinkReg = linkRegARM
c.hasGReg = true
case "arm64":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockARM64
c.lowerValue = rewriteValueARM64
c.lateLowerBlock = rewriteBlockARM64latelower
c.lateLowerValue = rewriteValueARM64latelower
c.registers = registersARM64[:]
c.gpRegMask = gpRegMaskARM64
c.fpRegMask = fpRegMaskARM64
c.intParamRegs = paramIntRegARM64
c.floatParamRegs = paramFloatRegARM64
c.FPReg = framepointerRegARM64
c.LinkReg = linkRegARM64
c.hasGReg = true
c.unalignedOK = true
c.haveBswap64 = true
c.haveBswap32 = true
c.haveBswap16 = true
c.haveCondSelect = true
case "ppc64":
c.BigEndian = true
fallthrough
case "ppc64le":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockPPC64
c.lowerValue = rewriteValuePPC64
c.lateLowerBlock = rewriteBlockPPC64latelower
c.lateLowerValue = rewriteValuePPC64latelower
c.registers = registersPPC64[:]
c.gpRegMask = gpRegMaskPPC64
c.fpRegMask = fpRegMaskPPC64
c.specialRegMask = specialRegMaskPPC64
c.intParamRegs = paramIntRegPPC64
c.floatParamRegs = paramFloatRegPPC64
c.FPReg = framepointerRegPPC64
c.LinkReg = linkRegPPC64
c.hasGReg = true
c.unalignedOK = true
// Note: ppc64 has register bswap ops only when GOPPC64>=10.
// But it has bswap+load and bswap+store ops for all ppc64 variants.
// That is the sense we're using them here - they are only used
// in contexts where they can be merged with a load or store.
c.haveBswap64 = true
c.haveBswap32 = true
c.haveBswap16 = true
c.haveCondSelect = true
case "mips64":
c.BigEndian = true
fallthrough
case "mips64le":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockMIPS64
c.lowerValue = rewriteValueMIPS64
c.lateLowerBlock = rewriteBlockMIPS64latelower
c.lateLowerValue = rewriteValueMIPS64latelower
c.registers = registersMIPS64[:]
c.gpRegMask = gpRegMaskMIPS64
c.fpRegMask = fpRegMaskMIPS64
c.specialRegMask = specialRegMaskMIPS64
c.FPReg = framepointerRegMIPS64
c.LinkReg = linkRegMIPS64
c.hasGReg = true
case "loong64":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockLOONG64
c.lowerValue = rewriteValueLOONG64
c.lateLowerBlock = rewriteBlockLOONG64latelower
c.lateLowerValue = rewriteValueLOONG64latelower
c.registers = registersLOONG64[:]
c.gpRegMask = gpRegMaskLOONG64
c.fpRegMask = fpRegMaskLOONG64
c.intParamRegs = paramIntRegLOONG64
c.floatParamRegs = paramFloatRegLOONG64
c.FPReg = framepointerRegLOONG64
c.LinkReg = linkRegLOONG64
c.hasGReg = true
c.unalignedOK = true
c.haveCondSelect = true
case "s390x":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockS390X
c.lowerValue = rewriteValueS390X
c.registers = registersS390X[:]
c.gpRegMask = gpRegMaskS390X
c.fpRegMask = fpRegMaskS390X
c.FPReg = framepointerRegS390X
c.LinkReg = linkRegS390X
c.hasGReg = true
c.BigEndian = true
c.unalignedOK = true
c.haveBswap64 = true
c.haveBswap32 = true
c.haveBswap16 = true // only for loads&stores, see ppc64 comment
case "mips":
c.BigEndian = true
fallthrough
case "mipsle":
c.PtrSize = 4
c.RegSize = 4
c.lowerBlock = rewriteBlockMIPS
c.lowerValue = rewriteValueMIPS
c.registers = registersMIPS[:]
c.gpRegMask = gpRegMaskMIPS
c.fpRegMask = fpRegMaskMIPS
c.specialRegMask = specialRegMaskMIPS
c.FPReg = framepointerRegMIPS
c.LinkReg = linkRegMIPS
c.hasGReg = true
case "riscv64":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockRISCV64
c.lowerValue = rewriteValueRISCV64
c.lateLowerBlock = rewriteBlockRISCV64latelower
c.lateLowerValue = rewriteValueRISCV64latelower
c.registers = registersRISCV64[:]
c.gpRegMask = gpRegMaskRISCV64
c.fpRegMask = fpRegMaskRISCV64
c.intParamRegs = paramIntRegRISCV64
c.floatParamRegs = paramFloatRegRISCV64
c.FPReg = framepointerRegRISCV64
c.hasGReg = true
case "wasm":
c.PtrSize = 8
c.RegSize = 8
c.lowerBlock = rewriteBlockWasm
c.lowerValue = rewriteValueWasm
c.registers = registersWasm[:]
c.gpRegMask = gpRegMaskWasm
c.fpRegMask = fpRegMaskWasm
c.fp32RegMask = fp32RegMaskWasm
c.fp64RegMask = fp64RegMaskWasm
c.FPReg = framepointerRegWasm
c.LinkReg = linkRegWasm
c.hasGReg = true
c.useAvg = false
c.useHmul = false
c.unalignedOK = true
c.haveCondSelect = true
default:
ctxt.Diag("arch %s not implemented", arch)
}
c.ctxt = ctxt
c.optimize = optimize
c.SoftFloat = softfloat
if softfloat {
c.floatParamRegs = nil // no FP registers in softfloat mode
}
c.ABI0 = abi.NewABIConfig(0, 0, ctxt.Arch.FixedFrameSize, 0)
c.ABI1 = abi.NewABIConfig(len(c.intParamRegs), len(c.floatParamRegs), ctxt.Arch.FixedFrameSize, 1)
if ctxt.Flag_shared {
// LoweredWB is secretly a CALL and CALLs on 386 in
// shared mode get rewritten by obj6.go to go through
// the GOT, which clobbers BX.
opcodeTable[Op386LoweredWB].reg.clobbers |= 1 << 3 // BX
}
c.buildRecipes(arch)
return c
}
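// The single shared Config is built early by ssagen; the call looks
// roughly like this (shown for illustration, not verbatim):
//
//	ssaConfig = ssa.NewConfig(base.Ctxt.Arch.Name, *types_, base.Ctxt, base.Flag.N == 0, Arch.SoftFloat)
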
func (c *Config) Ctxt() *obj.Link { return c.ctxt }
func (c *Config) haveByteSwap(size int64) bool {
switch size {
case 8:
return c.haveBswap64
case 4:
return c.haveBswap32
case 2:
return c.haveBswap16
default:
base.Fatalf("bad size %d\n", size)
return false
}
}
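// haveByteSwap gates byte-reversal rewrites; memcombine, for example,
// only reassembles a byte-reversed load or store when
// c.haveByteSwap(size) reports true.
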
func (c *Config) buildRecipes(arch string) {
// Information for strength-reducing multiplies.
type linearCombo struct {
// we can compute a*x+b*y in one instruction
a, b int64
// cost, in arbitrary units (tenths of cycles, usually)
cost int
// builds SSA value for a*x+b*y. Use the position
// information from m.
build func(m, x, y *Value) *Value
}
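// For instance, amd64's LEAQ2 computes x + 2*y in a single instruction,
// so it is registered below as {a: 1, b: 2}; arm64's ADDshiftLL with
// shift amount i is {a: 1, b: 1 << i}.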
// List all the linear combination instructions we have.
var linearCombos []linearCombo
r := func(a, b int64, cost int, build func(m, x, y *Value) *Value) {
linearCombos = append(linearCombos, linearCombo{a: a, b: b, cost: cost, build: build})
}
var mulCost int
switch arch {
case "amd64":
// Assumes the following instruction costs, taken from https://gmplib.org/~tege/x86-timing.pdf:
// 1 - addq, shlq, leaq, negq, subq
// 3 - imulq
// These costs limit the rewrites to two instructions.
// Operations which have to happen in place (and thus
// may require a reg-reg move) score slightly higher.
mulCost = 30 // imulq is 3 cycles = 30 cost units; recipes must cost less
// add
r(1, 1, 10,
func(m, x, y *Value) *Value {
v := m.Block.NewValue2(m.Pos, OpAMD64ADDQ, m.Type, x, y)
if m.Type.Size() == 4 {
v.Op = OpAMD64ADDL
}
return v
})
// neg
r(-1, 0, 11,
func(m, x, y *Value) *Value {
v := m.Block.NewValue1(m.Pos, OpAMD64NEGQ, m.Type, x)
if m.Type.Size() == 4 {
v.Op = OpAMD64NEGL
}
return v
})
// sub
r(1, -1, 11,
func(m, x, y *Value) *Value {
v := m.Block.NewValue2(m.Pos, OpAMD64SUBQ, m.Type, x, y)
if m.Type.Size() == 4 {
v.Op = OpAMD64SUBL
}
return v
})
// lea
r(1, 2, 10,
func(m, x, y *Value) *Value {
v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ2, m.Type, x, y)
if m.Type.Size() == 4 {
v.Op = OpAMD64LEAL2
}
return v
})
r(1, 4, 10,
func(m, x, y *Value) *Value {
v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ4, m.Type, x, y)
if m.Type.Size() == 4 {
v.Op = OpAMD64LEAL4
}
return v
})
r(1, 8, 10,
func(m, x, y *Value) *Value {
v := m.Block.NewValue2(m.Pos, OpAMD64LEAQ8, m.Type, x, y)
if m.Type.Size() == 4 {
v.Op = OpAMD64LEAL8
}
return v
})
// regular shifts
for i := 2; i < 64; i++ {
r(1<<i, 0, 11,
func(m, x, y *Value) *Value {
v := m.Block.NewValue1I(m.Pos, OpAMD64SHLQconst, m.Type, int64(i), x)
if m.Type.Size() == 4 {
v.Op = OpAMD64SHLLconst
}
return v
})
}
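// With the combos above, e.g. v*3 is one LEAQ2 (v + 2*v, cost 10), v*5
// one LEAQ4, and v*10 two instructions (LEAQ4 then ADDQ, cost 20), all
// well under the 30-unit imulq.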
case "arm64":
// Rationale (for M2 ultra):
// - multiply is 3 cycles.
// - add/neg/sub/shift are 1 cycle.
// - add/neg/sub+shiftLL are 2 cycles.
// We break ties against the multiply because using a
// multiply also needs to load the constant into a register.
// (It's 3 cycles and 2 instructions either way, but the
// linear combo one might use 1 less register.)
// The multiply constant might get lifted out of a loop though. Hmm....
// Other arm64 chips have different tradeoffs.
// Some chips' add+shift instructions are 1 cycle for shifts up to 4
// and 2 cycles for shifts bigger than 4. So weight the larger shifts
// a bit more.
// TODO: figure out a happy medium.
mulCost = 35 // 3-cycle multiply = 30, plus a margin to break ties against it
// add
r(1, 1, 10,
func(m, x, y *Value) *Value {
return m.Block.NewValue2(m.Pos, OpARM64ADD, m.Type, x, y)
})
// neg
r(-1, 0, 10,
func(m, x, y *Value) *Value {
return m.Block.NewValue1(m.Pos, OpARM64NEG, m.Type, x)
})
// sub
r(1, -1, 10,
func(m, x, y *Value) *Value {
return m.Block.NewValue2(m.Pos, OpARM64SUB, m.Type, x, y)
})
// regular shifts
for i := 1; i < 64; i++ {
c := 10
if i == 1 {
// Prefer x<<1 over x+x.
// Note that we eventually reverse this decision in ARM64latelower.rules,
// but this makes shift combining rules in ARM64.rules simpler.
c--
}
r(1<<i, 0, c,
func(m, x, y *Value) *Value {
return m.Block.NewValue1I(m.Pos, OpARM64SLLconst, m.Type, int64(i), x)
})
}
// ADDshiftLL
for i := 1; i < 64; i++ {
c := 20
if i > 4 {
c++
}
r(1, 1<<i, c,
func(m, x, y *Value) *Value {
return m.Block.NewValue2I(m.Pos, OpARM64ADDshiftLL, m.Type, int64(i), x, y)
})
}
// NEGshiftLL
for i := 1; i < 64; i++ {
c := 20
if i > 4 {
c++
}
r(-1<<i, 0, c,
func(m, x, y *Value) *Value {
return m.Block.NewValue1I(m.Pos, OpARM64NEGshiftLL, m.Type, int64(i), x)
})
}
// SUBshiftLL
for i := 1; i < 64; i++ {
c := 20
if i > 4 {
c++
}
r(1, -1<<i, c,
func(m, x, y *Value) *Value {
return m.Block.NewValue2I(m.Pos, OpARM64SUBshiftLL, m.Type, int64(i), x, y)
})
}
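// With these combos, e.g. v*5 is a single ADDshiftLL (v + v<<2, cost 20)
// and v*10 is that plus an ADD (total cost 30), both under the 35-unit
// multiply.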
case "loong64":
// - multiply is 4 cycles.
// - add/sub/shift/alsl are 1 cycle.
// On loong64, using a multiply also needs to load the constant into a register.
// TODO: figure out a happy medium.
mulCost = 45 // 4-cycle multiply = 40, plus a margin for the constant load
// add
r(1, 1, 10,
func(m, x, y *Value) *Value {
return m.Block.NewValue2(m.Pos, OpLOONG64ADDV, m.Type, x, y)
})
// neg
r(-1, 0, 10,
func(m, x, y *Value) *Value {
return m.Block.NewValue1(m.Pos, OpLOONG64NEGV, m.Type, x)
})
// sub
r(1, -1, 10,
func(m, x, y *Value) *Value {
return m.Block.NewValue2(m.Pos, OpLOONG64SUBV, m.Type, x, y)
})
// regular shifts
for i := 1; i < 64; i++ {
c := 10
if i == 1 {
// Prefer x<<1 over x+x.
// Note that we eventually reverse this decision in LOONG64latelower.rules,
// but this makes shift combining rules in LOONG64.rules simpler.
c--
}
r(1<<i, 0, c,
func(m, x, y *Value) *Value {
return m.Block.NewValue1I(m.Pos, OpLOONG64SLLVconst, m.Type, int64(i), x)
})
}
// ADDshiftLLV
for i := 1; i < 5; i++ {
c := 10
r(1, 1<<i, c,
func(m, x, y *Value) *Value {
return m.Block.NewValue2I(m.Pos, OpLOONG64ADDshiftLLV, m.Type, int64(i), x, y)
})
}
}
c.mulRecipes = map[int64]mulRecipe{}
// Single-instruction recipes.
// The only option for the input value(s) is v.
for _, combo := range linearCombos {
x := combo.a + combo.b
cost := combo.cost
old := c.mulRecipes[x]
if (old.build == nil || cost < old.cost) && cost < mulCost {
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
return combo.build(m, v, v)
}}
}
}
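// After this pass mulRecipes[combo.a+combo.b] holds, for each reachable
// sum, the cheapest single instruction; e.g. on amd64 mulRecipes[3] is
// LEAQ2 applied to (v, v).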
// Two-instruction recipes.
// A: Both of the outer's inputs are from the same single-instruction recipe.
// B: First input is v and the second is from a single-instruction recipe.
// C: Second input is v and the first is from a single-instruction recipe.
// A is slightly preferred because it often needs 1 less register, so it
// goes first.
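// For example, pattern B can build v*11 on amd64 with inner LEAQ4 and
// outer LEAQ2: v + 2*(v + 4*v) = 11*v, i.e.
// x = outer.a + outer.b*(inner.a+inner.b) = 1 + 2*5 = 11.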
// A
for _, inner := range linearCombos {
for _, outer := range linearCombos {
x := (inner.a + inner.b) * (outer.a + outer.b)
cost := inner.cost + outer.cost
old := c.mulRecipes[x]
if (old.build == nil || cost < old.cost) && cost < mulCost {
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
v = inner.build(m, v, v)
return outer.build(m, v, v)
}}
}
}
}
// B
for _, inner := range linearCombos {
for _, outer := range linearCombos {
x := outer.a + outer.b*(inner.a+inner.b)
cost := inner.cost + outer.cost
old := c.mulRecipes[x]
if (old.build == nil || cost < old.cost) && cost < mulCost {
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
return outer.build(m, v, inner.build(m, v, v))
}}
}
}
}
// C
for _, inner := range linearCombos {
for _, outer := range linearCombos {
x := outer.a*(inner.a+inner.b) + outer.b
cost := inner.cost + outer.cost
old := c.mulRecipes[x]
if (old.build == nil || cost < old.cost) && cost < mulCost {
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
return outer.build(m, inner.build(m, v, v), v)
}}
}
}
}
// Currently we only build three-instruction recipes for loong64.
if arch == "loong64" {
// Three-instruction recipes.
// D: The first and second are both single-instruction recipes, and their results are the third's two inputs.
// E: The first recipe's result feeds the second, and the second's result feeds the third; v supplies the other input at each step.
// D
for _, first := range linearCombos {
for _, second := range linearCombos {
for _, third := range linearCombos {
x := third.a*(first.a+first.b) + third.b*(second.a+second.b)
cost := first.cost + second.cost + third.cost
old := c.mulRecipes[x]
if (old.build == nil || cost < old.cost) && cost < mulCost {
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
v1 := first.build(m, v, v)
v2 := second.build(m, v, v)
return third.build(m, v1, v2)
}}
}
}
}
}
// E
for _, first := range linearCombos {
for _, second := range linearCombos {
for _, third := range linearCombos {
x := third.a*(second.a*(first.a+first.b)+second.b) + third.b
cost := first.cost + second.cost + third.cost
old := c.mulRecipes[x]
if (old.build == nil || cost < old.cost) && cost < mulCost {
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
v1 := first.build(m, v, v)
v2 := second.build(m, v1, v)
return third.build(m, v2, v)
}}
}
}
}
}
}
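// For example, v*23 has no one- or two-instruction recipe from the combos
// above, but pattern D reaches it with three alsl ops:
// v1 = v + 4*v, v2 = v + 8*v, then v1 + 2*v2 = 23*v, at cost 30 < mulCost.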
// These cases should be handled specially by rewrite rules.
// (Otherwise v * 1 == (neg (neg v)))
delete(c.mulRecipes, 0)
delete(c.mulRecipes, 1)
// Currently:
// len(c.mulRecipes) == 5984 on arm64
// 680 on amd64
// 9738 on loong64
// This function takes ~2.5ms on arm64.
//println(len(c.mulRecipes))
}