[dev.simd] cmd/compile: fix holes in mask peepholes

It turns out that ".Masked" is implemented by VPANDQ *and* VPANDD.
The shape of the bitwise AND doesn't matter; the correctness of the
rules is guaranteed by the way the mask is generated.

This CL fixes the holes in the peephole rules.
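
Why the lane width of the AND is irrelevant: VPMOVM2*-style mask
expansion produces lanes that are entirely ones or entirely zeros, so
ANDing x with the expanded mask yields the same bits at any element
size, and the result is exactly a zeroing masked move of x at the
mask's own granularity. A minimal host-side sketch of that argument
(illustrative only, not part of this CL; the helper names are made up):

package main

import "fmt"

// expand8x64 models VPMOVM2Q-style mask expansion for an 8-lane mask:
// each set bit becomes an all-ones 64-bit lane, each clear bit an
// all-zeros lane. (Scalar modeling for illustration, not compiler code.)
func expand8x64(m uint8) [8]uint64 {
	var v [8]uint64
	for i := range v {
		if m&(1<<i) != 0 {
			v[i] = ^uint64(0)
		}
	}
	return v
}

// maskedMoveZ models a zeroing masked move (e.g. VMOVDQU64 with a
// zeroing mask): lane i is kept when mask bit i is set, zeroed otherwise.
func maskedMoveZ(x [8]uint64, m uint8) [8]uint64 {
	var r [8]uint64
	for i := range x {
		if m&(1<<i) != 0 {
			r[i] = x[i]
		}
	}
	return r
}

func main() {
	x := [8]uint64{0x0123456789abcdef, 0xfedcba9876543210, 3, 4, 5, 6, 7, 8}
	m := uint8(0b10110101)
	e := expand8x64(m)
	want := maskedMoveZ(x, m)
	for i := range x {
		// Bitwise AND produces the same bits whether the hardware works
		// in 32-bit or 64-bit lanes, so VPANDD512 and VPANDQ512 are
		// interchangeable here, and both equal the masked move.
		fmt.Println(x[i]&e[i] == want[i]) // always true
	}
}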

Change-Id: I2d15c4d17afed6fdbb2f3905a51b2c5c2f673348
Reviewed-on: https://go-review.googlesource.com/c/go/+/703257
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Author: Junyang Shao
Date: 2025-09-14 20:17:55 +00:00
parent 3ec0b25ab7
commit dabe2bb4fb
2 changed files with 64 additions and 0 deletions

@@ -1768,6 +1768,10 @@
(VPANDQ512 x (VPMOVMToVec32x16 k)) => (VMOVDQU32Masked512 x k)
(VPANDQ512 x (VPMOVMToVec16x32 k)) => (VMOVDQU16Masked512 x k)
(VPANDQ512 x (VPMOVMToVec8x64 k)) => (VMOVDQU8Masked512 x k)
(VPANDD512 x (VPMOVMToVec64x8 k)) => (VMOVDQU64Masked512 x k)
(VPANDD512 x (VPMOVMToVec32x16 k)) => (VMOVDQU32Masked512 x k)
(VPANDD512 x (VPMOVMToVec16x32 k)) => (VMOVDQU16Masked512 x k)
(VPANDD512 x (VPMOVMToVec8x64 k)) => (VMOVDQU8Masked512 x k)
// Insert to zero of 32/64 bit floats and ints to a zero is just MOVS[SD]
(VPINSRQ128 [0] (Zero128 <t>) y) && y.Type.IsFloat() => (VMOVSDf2v <types.TypeVec128> y)
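
For readers skimming the second hunk: each rule above is compiled by the
SSA rulegen tool into a matcher in rewriteAMD64.go. The toy, self-contained
sketch below mirrors that shape (the Value type here is a made-up stand-in;
the real one lives in cmd/compile/internal/ssa). The point of interest is
the _i0 loop, which retries the match with the two operands swapped,
because AND is commutative and the mask expansion may be either operand:

package main

import "fmt"

// Value is a toy stand-in for an SSA value, just to show the shape of
// the generated matcher. (Hypothetical type, not the compiler's.)
type Value struct {
	Op   string
	Args []*Value
}

// rewriteVPANDD512 mirrors the structure of the generated matcher: on the
// second iteration _i0 swaps v_0 and v_1, so the mask expansion is found
// in either operand position of the commutative AND.
func rewriteVPANDD512(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
		x := v_0
		if v_1.Op != "VPMOVMToVec64x8" {
			continue // try the swapped operand order
		}
		k := v_1.Args[0]            // the original mask value
		v.Op = "VMOVDQU64Masked512" // replace the AND with a masked move
		v.Args = []*Value{x, k}
		return true
	}
	return false
}

func main() {
	k := &Value{Op: "Mask"}
	x := &Value{Op: "X"}
	// The mask expansion is the *first* operand; the swap still matches it.
	v := &Value{Op: "VPANDD512", Args: []*Value{
		{Op: "VPMOVMToVec64x8", Args: []*Value{k}},
		x,
	}}
	fmt.Println(rewriteVPANDD512(v), v.Op) // true VMOVDQU64Masked512
}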

@@ -34681,6 +34681,66 @@ func rewriteValueAMD64_OpAMD64VPADDQMasked512(v *Value) bool {
func rewriteValueAMD64_OpAMD64VPANDD512(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (VPANDD512 x (VPMOVMToVec64x8 k))
	// result: (VMOVDQU64Masked512 x k)
	for {
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			x := v_0
			if v_1.Op != OpAMD64VPMOVMToVec64x8 {
				continue
			}
			k := v_1.Args[0]
			v.reset(OpAMD64VMOVDQU64Masked512)
			v.AddArg2(x, k)
			return true
		}
		break
	}
	// match: (VPANDD512 x (VPMOVMToVec32x16 k))
	// result: (VMOVDQU32Masked512 x k)
	for {
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			x := v_0
			if v_1.Op != OpAMD64VPMOVMToVec32x16 {
				continue
			}
			k := v_1.Args[0]
			v.reset(OpAMD64VMOVDQU32Masked512)
			v.AddArg2(x, k)
			return true
		}
		break
	}
	// match: (VPANDD512 x (VPMOVMToVec16x32 k))
	// result: (VMOVDQU16Masked512 x k)
	for {
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			x := v_0
			if v_1.Op != OpAMD64VPMOVMToVec16x32 {
				continue
			}
			k := v_1.Args[0]
			v.reset(OpAMD64VMOVDQU16Masked512)
			v.AddArg2(x, k)
			return true
		}
		break
	}
	// match: (VPANDD512 x (VPMOVMToVec8x64 k))
	// result: (VMOVDQU8Masked512 x k)
	for {
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			x := v_0
			if v_1.Op != OpAMD64VPMOVMToVec8x64 {
				continue
			}
			k := v_1.Args[0]
			v.reset(OpAMD64VMOVDQU8Masked512)
			v.AddArg2(x, k)
			return true
		}
		break
	}
	// match: (VPANDD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
	// cond: canMergeLoad(v, l) && clobber(l)
	// result: (VPANDD512load {sym} [off] x ptr mem)