[dev.simd] cmd/compile: optimize VPTEST for 2-operand cases

Change-Id: Ica2d5ee48082c69e86b12b519ba8df7a2556392f
Reviewed-on: https://go-review.googlesource.com/c/go/+/704355
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Junyang Shao 2025-09-16 17:27:36 +00:00
parent f1e3651c33
commit e34ad6de42
3 changed files with 407 additions and 0 deletions

View file

@ -1802,3 +1802,13 @@
(VMOVDQUstore(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 => (VMOVDQUstore(128|256|512) [off1+off2] {sym} ptr val mem) (VMOVDQUstore(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 => (VMOVDQUstore(128|256|512) [off1+off2] {sym} ptr val mem)
(VMOVDQUload(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUload(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base mem) (VMOVDQUload(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUload(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base mem)
(VMOVDQUstore(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUstore(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base val mem) (VMOVDQUstore(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUstore(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base val mem)
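// 2-op VPTEST optimizations: when both VPTEST operands are the same AND/ANDN
// result x and x has no uses besides those two operand slots (x.Uses == 2),
// run VPTEST on the inputs of x directly so x itself is no longer needed.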
(SETEQ (VPTEST x:(VPAND(128|256) j k) y)) && x == y && x.Uses == 2 => (SETEQ (VPTEST j k))
(SETEQ (VPTEST x:(VPAND(D|Q)512 j k) y)) && x == y && x.Uses == 2 => (SETEQ (VPTEST j k))
(SETEQ (VPTEST x:(VPANDN(128|256) j k) y)) && x == y && x.Uses == 2 => (SETB (VPTEST k j)) // AndNot has swapped its operand order
(SETEQ (VPTEST x:(VPANDN(D|Q)512 j k) y)) && x == y && x.Uses == 2 => (SETB (VPTEST k j)) // AndNot has swapped its operand order
(EQ (VPTEST x:(VPAND(128|256) j k) y) yes no) && x == y && x.Uses == 2 => (EQ (VPTEST j k) yes no)
(EQ (VPTEST x:(VPAND(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (EQ (VPTEST j k) yes no)
(EQ (VPTEST x:(VPANDN(128|256) j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order
(EQ (VPTEST x:(VPANDN(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order

View file

@ -22607,6 +22607,190 @@ func rewriteValueAMD64_OpAMD64SETEQ(v *Value) bool {
} }
break break
} }
// match: (SETEQ (VPTEST x:(VPAND128 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETEQ (VPTEST j k))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPAND128 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETEQ)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
v.AddArg(v0)
return true
}
// match: (SETEQ (VPTEST x:(VPAND256 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETEQ (VPTEST j k))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPAND256 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETEQ)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
v.AddArg(v0)
return true
}
// match: (SETEQ (VPTEST x:(VPANDD512 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETEQ (VPTEST j k))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDD512 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETEQ)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
v.AddArg(v0)
return true
}
// match: (SETEQ (VPTEST x:(VPANDQ512 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETEQ (VPTEST j k))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDQ512 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETEQ)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
v.AddArg(v0)
return true
}
// match: (SETEQ (VPTEST x:(VPANDN128 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETB (VPTEST k j))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDN128 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETB)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
v.AddArg(v0)
return true
}
// match: (SETEQ (VPTEST x:(VPANDN256 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETB (VPTEST k j))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDN256 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETB)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
v.AddArg(v0)
return true
}
// match: (SETEQ (VPTEST x:(VPANDND512 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETB (VPTEST k j))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDND512 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETB)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
v.AddArg(v0)
return true
}
// match: (SETEQ (VPTEST x:(VPANDNQ512 j k) y))
// cond: x == y && x.Uses == 2
// result: (SETB (VPTEST k j))
for {
if v_0.Op != OpAMD64VPTEST {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDNQ512 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v.reset(OpAMD64SETB)
v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
v.AddArg(v0)
return true
}
return false return false
} }
func rewriteValueAMD64_OpAMD64SETEQstore(v *Value) bool { func rewriteValueAMD64_OpAMD64SETEQstore(v *Value) bool {
@ -61066,6 +61250,190 @@ func rewriteBlockAMD64(b *Block) bool {
} }
break break
} }
// match: (EQ (VPTEST x:(VPAND128 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (EQ (VPTEST j k) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPAND128 {
break
}
_ = x.Args[1]
x_0 := x.Args[0]
x_1 := x.Args[1]
for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
j := x_0
k := x_1
if !(x == y && x.Uses == 2) {
continue
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
b.resetWithControl(BlockAMD64EQ, v0)
return true
}
break
}
// match: (EQ (VPTEST x:(VPAND256 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (EQ (VPTEST j k) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPAND256 {
break
}
_ = x.Args[1]
x_0 := x.Args[0]
x_1 := x.Args[1]
for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
j := x_0
k := x_1
if !(x == y && x.Uses == 2) {
continue
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
b.resetWithControl(BlockAMD64EQ, v0)
return true
}
break
}
// match: (EQ (VPTEST x:(VPANDD512 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (EQ (VPTEST j k) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDD512 {
break
}
_ = x.Args[1]
x_0 := x.Args[0]
x_1 := x.Args[1]
for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
j := x_0
k := x_1
if !(x == y && x.Uses == 2) {
continue
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
b.resetWithControl(BlockAMD64EQ, v0)
return true
}
break
}
// match: (EQ (VPTEST x:(VPANDQ512 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (EQ (VPTEST j k) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDQ512 {
break
}
_ = x.Args[1]
x_0 := x.Args[0]
x_1 := x.Args[1]
for _i0 := 0; _i0 <= 1; _i0, x_0, x_1 = _i0+1, x_1, x_0 {
j := x_0
k := x_1
if !(x == y && x.Uses == 2) {
continue
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(j, k)
b.resetWithControl(BlockAMD64EQ, v0)
return true
}
break
}
// match: (EQ (VPTEST x:(VPANDN128 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (ULT (VPTEST k j) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDN128 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
b.resetWithControl(BlockAMD64ULT, v0)
return true
}
// match: (EQ (VPTEST x:(VPANDN256 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (ULT (VPTEST k j) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDN256 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
b.resetWithControl(BlockAMD64ULT, v0)
return true
}
// match: (EQ (VPTEST x:(VPANDND512 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (ULT (VPTEST k j) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDND512 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
b.resetWithControl(BlockAMD64ULT, v0)
return true
}
// match: (EQ (VPTEST x:(VPANDNQ512 j k) y) yes no)
// cond: x == y && x.Uses == 2
// result: (ULT (VPTEST k j) yes no)
for b.Controls[0].Op == OpAMD64VPTEST {
v_0 := b.Controls[0]
y := v_0.Args[1]
x := v_0.Args[0]
if x.Op != OpAMD64VPANDNQ512 {
break
}
k := x.Args[1]
j := x.Args[0]
if !(x == y && x.Uses == 2) {
break
}
v0 := b.NewValue0(v_0.Pos, OpAMD64VPTEST, types.TypeFlags)
v0.AddArg2(k, j)
b.resetWithControl(BlockAMD64ULT, v0)
return true
}
case BlockAMD64GE: case BlockAMD64GE:
// match: (GE c:(CMPQconst [128] z) yes no) // match: (GE c:(CMPQconst [128] z) yes no)
// cond: c.Uses == 1 // cond: c.Uses == 1

29
test/codegen/simd.go Normal file
View file

@ -0,0 +1,29 @@
// asmcheck
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// These tests check code generation of simd peephole optimizations.
//go:build goexperiment.simd
package codegen
import "simd"
// vptest1 reports whether the AndNot of two loaded Uint64x2 vectors is all
// zero. The amd64 directives ahead of the return statement require the
// generated code to contain a VPTEST and a SETCS instruction.
func vptest1() bool {
	lhs := simd.LoadUint64x2Slice([]uint64{0, 1})
	rhs := simd.LoadUint64x2Slice([]uint64{0, 0})
	// amd64:`VPTEST\s(.*)(.*)$`
	// amd64:`SETCS\s(.*)$`
	return lhs.AndNot(rhs).IsZero()
}
// vptest2 reports whether the And of two loaded Uint64x2 vectors is all
// zero. The amd64 directives ahead of the return statement require the
// generated code to contain a VPTEST and a SETEQ instruction.
func vptest2() bool {
	lhs := simd.LoadUint64x2Slice([]uint64{0, 1})
	rhs := simd.LoadUint64x2Slice([]uint64{0, 0})
	// amd64:`VPTEST\s(.*)(.*)$`
	// amd64:`SETEQ\s(.*)$`
	return lhs.And(rhs).IsZero()
}