cmd/compile: intrinsify TrailingZeros{8,32,64} for 386

This CL adds support for intrinsifying the TrailingZeros{8,32,64}
functions for the 386 architecture. We need to handle the case when the
input is 0, which could lead to undefined output from the BSFL instruction.

The next CL will remove the assembly code in the runtime/internal/sys package.

Change-Id: Ic168edf68e81bf69a536102100fdd3f56f0f4a1b
Reviewed-on: https://go-review.googlesource.com/c/go/+/475735
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Wayne Zuo <wdvxdr@golangcn.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Wayne Zuo 2023-03-12 15:34:20 +08:00
parent 82bf12902f
commit cedfcba3e8
7 changed files with 77 additions and 3 deletions

View file

@ -56,8 +56,12 @@
(Sqrt ...) => (SQRTSD ...)
(Sqrt32 ...) => (SQRTSS ...)
(Ctz8 x) => (BSFL (ORLconst <typ.UInt32> [0x100] x))
(Ctz8NonZero ...) => (BSFL ...)
(Ctz16 x) => (BSFL (ORLconst <typ.UInt32> [0x10000] x))
(Ctz16NonZero ...) => (BSFL ...)
(Ctz32 ...) => (LoweredCtz32 ...)
(Ctz32NonZero ...) => (BSFL ...)
// Lowering extension
(SignExt8to16 ...) => (MOVBLSX ...)

View file

@ -302,6 +302,7 @@ func init() {
{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
{name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes
{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero

View file

@ -456,6 +456,7 @@ const (
Op386NOTL
Op386BSFL
Op386BSFW
Op386LoweredCtz32
Op386BSRL
Op386BSRW
Op386BSWAPL
@ -5034,6 +5035,20 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredCtz32",
argLen: 1,
clobberFlags: true,
asm: x86.ABSFL,
reg: regInfo{
inputs: []inputInfo{
{0, 239}, // AX CX DX BX BP SI DI
},
outputs: []outputInfo{
{0, 239}, // AX CX DX BX BP SI DI
},
},
},
{
name: "BSRL",
argLen: 1,

View file

@ -315,6 +315,17 @@ func rewriteValue386(v *Value) bool {
case OpCtz16NonZero:
v.Op = Op386BSFL
return true
case OpCtz32:
v.Op = Op386LoweredCtz32
return true
case OpCtz32NonZero:
v.Op = Op386BSFL
return true
case OpCtz8:
return rewriteValue386_OpCtz8(v)
case OpCtz8NonZero:
v.Op = Op386BSFL
return true
case OpCvt32Fto32:
v.Op = Op386CVTTSS2SL
return true
@ -8527,6 +8538,22 @@ func rewriteValue386_OpCtz16(v *Value) bool {
return true
}
}
// rewriteValue386_OpCtz8 lowers a generic Ctz8 value in place and always
// reports that a rewrite happened.
//
// match: (Ctz8 x)
// result: (BSFL (ORLconst <typ.UInt32> [0x100] x))
//
// ORing in bit 8 guarantees that BSFL never sees a zero operand (its result
// is undefined for zero) and makes the count come out as 8 when the low
// eight bits of x are all zero.
func rewriteValue386_OpCtz8(v *Value) bool {
	operand := v.Args[0]
	blk := v.Block
	cfgTypes := &blk.Func.Config.Types

	// Turn v itself into the BSFL; this drops its old argument list, which is
	// why the operand was captured above.
	v.reset(Op386BSFL)

	// Build the guarded operand: x | 0x100, typed as a 32-bit unsigned value.
	guarded := blk.NewValue0(v.Pos, Op386ORLconst, cfgTypes.UInt32)
	guarded.AuxInt = int32ToAuxInt(0x100)
	guarded.AddArg(operand)

	v.AddArg(guarded)
	return true
}
func rewriteValue386_OpDiv8(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]

View file

@ -4492,12 +4492,12 @@ func InitTables() {
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
addF("math/bits", "TrailingZeros32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
addF("math/bits", "TrailingZeros16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
@ -4531,7 +4531,7 @@ func InitTables() {
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.Wasm)
sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
addF("math/bits", "TrailingZeros8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])

View file

@ -831,6 +831,29 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
base.WarnfAt(v.Pos, "generated nil check")
}
case ssa.Op386LoweredCtz32:
// Emit a count-trailing-zeros sequence that yields 32 for a zero input.
// BSFL by itself leaves its result undefined when the input is zero, so
// the result register is overwritten with 32 in that case.
// NOTE(review): the conditional branch relies on BSFL setting ZF exactly
// when its source operand is zero — confirm against the x86 BSF definition.
// BSFL in, out
p := s.Prog(x86.ABSFL)
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
// JNZ 2(PC): skip the fix-up MOVL below; the branch target is filled in
// once the landing NOP has been created.
p1 := s.Prog(x86.AJNE)
p1.To.Type = obj.TYPE_BRANCH
// MOVL $32, out (fall-through path: replace the undefined BSFL result)
p2 := s.Prog(x86.AMOVL)
p2.From.Type = obj.TYPE_CONST
p2.From.Offset = 32
p2.To.Type = obj.TYPE_REG
p2.To.Reg = v.Reg()
// NOP (so the JNZ has somewhere to land)
nop := s.Prog(obj.ANOP)
p1.To.SetTarget(nop)
case ssa.OpClobber:
p := s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST

View file

@ -293,6 +293,7 @@ func RotateLeftVariable32(n uint32, m int) uint32 {
func TrailingZeros(n uint) int {
// amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
// amd64/v3:"TZCNTQ"
// 386:"BSFL"
// arm:"CLZ"
// arm64:"RBIT","CLZ"
// s390x:"FLOGR"
@ -305,6 +306,7 @@ func TrailingZeros(n uint) int {
func TrailingZeros64(n uint64) int {
// amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
// amd64/v3:"TZCNTQ"
// 386:"BSFL"
// arm64:"RBIT","CLZ"
// s390x:"FLOGR"
// ppc64x/power8:"ANDN","POPCNTD"
@ -322,6 +324,7 @@ func TrailingZeros64Subtract(n uint64) int {
func TrailingZeros32(n uint32) int {
// amd64/v1,amd64/v2:"BTSQ\\t\\$32","BSFQ"
// amd64/v3:"TZCNTL"
// 386:"BSFL"
// arm:"CLZ"
// arm64:"RBITW","CLZW"
// s390x:"FLOGR","MOVWZ"
@ -345,6 +348,7 @@ func TrailingZeros16(n uint16) int {
func TrailingZeros8(n uint8) int {
// amd64:"BSFL","BTSL\\t\\$8"
// 386:"BSFL"
// arm:"ORR\t\\$256","CLZ",-"MOVBU\tR"
// arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t"
// s390x:"FLOGR","OR\t\\$256"