[dev.simd] cmd/compile, simd: remove mask loads and stores

We already have conversions between masks and bit patterns; the mask
load and store APIs are inconsistent with them, and mask loads and
stores can instead be recovered by peephole optimizations. So this CL
removes them; the next CL will add the corresponding peepholes.
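For callers, the removed Load*FromBits and StoreToBits helpers can be
expressed with the surviving value-based FromBits/ToBits conversions
plus an ordinary integer load or store, which the planned peephole is
then expected to fuse back into a single KMOV memory operation. A
minimal sketch in the style of the test removed below, assuming the
Mask64x2 conversions use a small unsigned integer (uint8 here; the
exact widths are not confirmed by this diff):

    package main

    import "simd" // experimental package used by the tests in this CL

    func main() {
        // Requires AVX512, as the removed Load*FromBits docs noted.
        var bits uint64 = 0b10

        // Before this CL: m := simd.LoadMask64x2FromBits(&bits)
        m := simd.Mask64x2FromBits(uint8(bits)) // ordinary load of bits, then value conversion

        x := simd.LoadInt64x2Slice([]int64{1, 2})
        y := simd.LoadInt64x2Slice([]int64{3, 4})
        var out [2]int64
        x.Add(y).Masked(m).Store(&out) // out == {0, 6}, as in the removed TestBitMaskLoad

        // Before this CL: m.StoreToBits(&bits)
        bits = uint64(m.ToBits()) // value conversion, then ordinary store
    }

With the follow-up peephole in place, a FromBits/ToBits conversion fed
directly from or into memory is expected to compile back to the same
KMOVQload/KMOVQstore sequences that the removed rules produced.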

Change-Id: Ifa7d23fb52bb0efd1785935ead4d703927f16d2b
Reviewed-on: https://go-review.googlesource.com/c/go/+/710915
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Junyang Shao 2025-10-10 17:42:59 +00:00
parent c4fbf3b4cf
commit 2e71cf1a2a
10 changed files with 1 addition and 1014 deletions

@ -1641,41 +1641,6 @@
// SIMD lowering rules
// Mask loads
(LoadMask8x16 <t> ptr mem) => (VPMOVMToVec8x16 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask8x32 <t> ptr mem) => (VPMOVMToVec8x32 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask8x64 <t> ptr mem) => (VPMOVMToVec8x64 <types.TypeVec512> (KMOVQload <t> ptr mem))
(LoadMask16x8 <t> ptr mem) => (VPMOVMToVec16x8 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask16x16 <t> ptr mem) => (VPMOVMToVec16x16 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask16x32 <t> ptr mem) => (VPMOVMToVec16x32 <types.TypeVec512> (KMOVQload <t> ptr mem))
(LoadMask32x4 <t> ptr mem) => (VPMOVMToVec32x4 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask32x8 <t> ptr mem) => (VPMOVMToVec32x8 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask32x16 <t> ptr mem) => (VPMOVMToVec32x16 <types.TypeVec512> (KMOVQload <t> ptr mem))
(LoadMask64x2 <t> ptr mem) => (VPMOVMToVec64x2 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
(StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
(StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
(StoreMask8x64 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
(StoreMask16x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
(StoreMask16x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
(StoreMask16x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
(StoreMask32x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
(StoreMask32x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
(StoreMask32x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
(StoreMask64x2 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
(StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
// TODO is this correct? Should we just do it all from 64-bits?
// Mask conversions
// integers to masks
(Cvt16toMask8x16 <t> x) => (VPMOVMToVec8x16 <types.TypeVec128> (KMOVWk <t> x))

@ -677,31 +677,6 @@ var genericOps = []opData{
// SIMD
{name: "ZeroSIMD", argLength: 0}, // zero value of a vector
{name: "LoadMask8x16", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask8x32", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask8x64", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask16x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask16x16", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask16x32", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask32x4", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask32x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask32x16", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "StoreMask8x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x64", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
// Convert integers to masks
{name: "Cvt16toMask8x16", argLength: 1}, // arg0 = integer mask value

@ -5364,30 +5364,6 @@ const (
OpPrefetchCache
OpPrefetchCacheStreamed
OpZeroSIMD
OpLoadMask8x16
OpLoadMask8x32
OpLoadMask8x64
OpLoadMask16x8
OpLoadMask16x16
OpLoadMask16x32
OpLoadMask32x4
OpLoadMask32x8
OpLoadMask32x16
OpLoadMask64x2
OpLoadMask64x4
OpLoadMask64x8
OpStoreMask8x16
OpStoreMask8x32
OpStoreMask8x64
OpStoreMask16x8
OpStoreMask16x16
OpStoreMask16x32
OpStoreMask32x4
OpStoreMask32x8
OpStoreMask32x16
OpStoreMask64x2
OpStoreMask64x4
OpStoreMask64x8
OpCvt16toMask8x16
OpCvt32toMask8x32
OpCvt64toMask8x64
@ -75965,138 +75941,6 @@ var opcodeTable = [...]opInfo{
argLen: 0,
generic: true,
},
{
name: "LoadMask8x16",
argLen: 2,
generic: true,
},
{
name: "LoadMask8x32",
argLen: 2,
generic: true,
},
{
name: "LoadMask8x64",
argLen: 2,
generic: true,
},
{
name: "LoadMask16x8",
argLen: 2,
generic: true,
},
{
name: "LoadMask16x16",
argLen: 2,
generic: true,
},
{
name: "LoadMask16x32",
argLen: 2,
generic: true,
},
{
name: "LoadMask32x4",
argLen: 2,
generic: true,
},
{
name: "LoadMask32x8",
argLen: 2,
generic: true,
},
{
name: "LoadMask32x16",
argLen: 2,
generic: true,
},
{
name: "LoadMask64x2",
argLen: 2,
generic: true,
},
{
name: "LoadMask64x4",
argLen: 2,
generic: true,
},
{
name: "LoadMask64x8",
argLen: 2,
generic: true,
},
{
name: "StoreMask8x16",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask8x32",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask8x64",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask16x8",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask16x16",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask16x32",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask32x4",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask32x8",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask32x16",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask64x2",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask64x4",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask64x8",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "Cvt16toMask8x16",
argLen: 1,

@ -3769,30 +3769,6 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpLessUint8x64(v)
case OpLoad:
return rewriteValueAMD64_OpLoad(v)
case OpLoadMask16x16:
return rewriteValueAMD64_OpLoadMask16x16(v)
case OpLoadMask16x32:
return rewriteValueAMD64_OpLoadMask16x32(v)
case OpLoadMask16x8:
return rewriteValueAMD64_OpLoadMask16x8(v)
case OpLoadMask32x16:
return rewriteValueAMD64_OpLoadMask32x16(v)
case OpLoadMask32x4:
return rewriteValueAMD64_OpLoadMask32x4(v)
case OpLoadMask32x8:
return rewriteValueAMD64_OpLoadMask32x8(v)
case OpLoadMask64x2:
return rewriteValueAMD64_OpLoadMask64x2(v)
case OpLoadMask64x4:
return rewriteValueAMD64_OpLoadMask64x4(v)
case OpLoadMask64x8:
return rewriteValueAMD64_OpLoadMask64x8(v)
case OpLoadMask8x16:
return rewriteValueAMD64_OpLoadMask8x16(v)
case OpLoadMask8x32:
return rewriteValueAMD64_OpLoadMask8x32(v)
case OpLoadMask8x64:
return rewriteValueAMD64_OpLoadMask8x64(v)
case OpLoadMasked16:
return rewriteValueAMD64_OpLoadMasked16(v)
case OpLoadMasked32:
@ -5636,30 +5612,6 @@ func rewriteValueAMD64(v *Value) bool {
return true
case OpStore:
return rewriteValueAMD64_OpStore(v)
case OpStoreMask16x16:
return rewriteValueAMD64_OpStoreMask16x16(v)
case OpStoreMask16x32:
return rewriteValueAMD64_OpStoreMask16x32(v)
case OpStoreMask16x8:
return rewriteValueAMD64_OpStoreMask16x8(v)
case OpStoreMask32x16:
return rewriteValueAMD64_OpStoreMask32x16(v)
case OpStoreMask32x4:
return rewriteValueAMD64_OpStoreMask32x4(v)
case OpStoreMask32x8:
return rewriteValueAMD64_OpStoreMask32x8(v)
case OpStoreMask64x2:
return rewriteValueAMD64_OpStoreMask64x2(v)
case OpStoreMask64x4:
return rewriteValueAMD64_OpStoreMask64x4(v)
case OpStoreMask64x8:
return rewriteValueAMD64_OpStoreMask64x8(v)
case OpStoreMask8x16:
return rewriteValueAMD64_OpStoreMask8x16(v)
case OpStoreMask8x32:
return rewriteValueAMD64_OpStoreMask8x32(v)
case OpStoreMask8x64:
return rewriteValueAMD64_OpStoreMask8x64(v)
case OpStoreMasked16:
return rewriteValueAMD64_OpStoreMasked16(v)
case OpStoreMasked32:
@ -54997,222 +54949,6 @@ func rewriteValueAMD64_OpLoad(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpLoadMask16x16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask16x16 <t> ptr mem)
// result: (VPMOVMToVec16x16 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec16x16)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask16x32(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask16x32 <t> ptr mem)
// result: (VPMOVMToVec16x32 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec16x32)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask16x8(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask16x8 <t> ptr mem)
// result: (VPMOVMToVec16x8 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec16x8)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask32x16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask32x16 <t> ptr mem)
// result: (VPMOVMToVec32x16 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec32x16)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask32x4(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask32x4 <t> ptr mem)
// result: (VPMOVMToVec32x4 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec32x4)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask32x8(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask32x8 <t> ptr mem)
// result: (VPMOVMToVec32x8 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec32x8)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask64x2(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask64x2 <t> ptr mem)
// result: (VPMOVMToVec64x2 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec64x2)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask64x4(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask64x4 <t> ptr mem)
// result: (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec64x4)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask64x8(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask64x8 <t> ptr mem)
// result: (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec64x8)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask8x16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask8x16 <t> ptr mem)
// result: (VPMOVMToVec8x16 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec8x16)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask8x32(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask8x32 <t> ptr mem)
// result: (VPMOVMToVec8x32 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec8x32)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask8x64 <t> ptr mem)
// result: (VPMOVMToVec8x64 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec8x64)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMasked16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -59830,234 +59566,6 @@ func rewriteValueAMD64_OpStore(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpStoreMask16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask16x32(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x32 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask16x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x4(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x4 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x2(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x2 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x4(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x4 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x32(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x32 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x64 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMasked16(v *Value) bool {
v_3 := v.Args[3]
v_2 := v.Args[2]

@ -2024,13 +2024,6 @@ func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
}
}
var loadMaskOpcodes = map[int]map[int]ssa.Op{
8: {16: ssa.OpLoadMask8x16, 32: ssa.OpLoadMask8x32, 64: ssa.OpLoadMask8x64},
16: {8: ssa.OpLoadMask16x8, 16: ssa.OpLoadMask16x16, 32: ssa.OpLoadMask16x32},
32: {4: ssa.OpLoadMask32x4, 8: ssa.OpLoadMask32x8, 16: ssa.OpLoadMask32x16},
64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8},
}
var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{
8: {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64},
16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32},
@ -2045,33 +2038,6 @@ var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{
64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8},
}
func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
op := loadMaskOpcodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
return s.newValue2(op, types.TypeMask, args[0], s.mem())
}
}
func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
opCodes := map[int]map[int]ssa.Op{
8: {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64},
16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32},
32: {4: ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16},
64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8},
}
op := opCodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem())
return nil
}
}
func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
op := cvtVToMaskOpcodes[elemBits][lanes]

@ -1685,96 +1685,72 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Int8x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16FromBits", simdCvtVToMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16.ToBits", simdCvtMaskToV(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x32.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32FromBits", simdCvtVToMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32.ToBits", simdCvtMaskToV(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x64.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64FromBits", simdCvtVToMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64.ToBits", simdCvtMaskToV(8, 64), sys.AMD64)
addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8FromBits", simdCvtVToMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8.ToBits", simdCvtMaskToV(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16FromBits", simdCvtVToMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16.ToBits", simdCvtMaskToV(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x32.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32FromBits", simdCvtVToMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32.ToBits", simdCvtMaskToV(16, 32), sys.AMD64)
addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x4.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4FromBits", simdCvtVToMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4.ToBits", simdCvtMaskToV(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8FromBits", simdCvtVToMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8.ToBits", simdCvtMaskToV(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16FromBits", simdCvtVToMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16.ToBits", simdCvtMaskToV(32, 16), sys.AMD64)
addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x2.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2FromBits", simdCvtVToMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2.ToBits", simdCvtMaskToV(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x4.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4FromBits", simdCvtVToMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4.ToBits", simdCvtMaskToV(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8FromBits", simdCvtVToMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.ToBits", simdCvtMaskToV(64, 8), sys.AMD64)
}

@ -80,8 +80,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "{{.VectorCounterpart}}.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "{{.Name}}.And", opLen2(ssa.OpAnd{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
addF(simdPackage, "{{.Name}}.Or", opLen2(ssa.OpOr{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
addF(simdPackage, "Load{{.Name}}FromBits", simdLoadMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
addF(simdPackage, "{{.Name}}.StoreToBits", simdStoreMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
addF(simdPackage, "{{.Name}}FromBits", simdCvtVToMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
addF(simdPackage, "{{.Name}}.ToBits", simdCvtMaskToV({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
{{end}}

@ -180,22 +180,6 @@ func Load{{.Name}}(y *[{{.Lanes}}]{{.Base}}) {{.Name}}
func (x {{.Name}}) Store(y *[{{.Lanes}}]{{.Base}})
`
const simdMaskFromBitsTemplate = `
// Load{{.Name}}FromBits constructs a {{.Name}} from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower {{.Lanes}} bits of y are used.
//
// CPU Features: AVX512
//go:noescape
func Load{{.Name}}FromBits(y *uint64) {{.Name}}
// StoreToBits stores a {{.Name}} as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower {{.Lanes}} bits of y are used.
//
// CPU Features: AVX512
//go:noescape
func (x {{.Name}}) StoreToBits(y *uint64)
`
const simdMaskFromValTemplate = `
// {{.Name}}FromBits constructs a {{.Name}} from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower {{.Lanes}} bits of y are used.
@ -503,7 +487,6 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
t := templateOf(simdTypesTemplates, "types_amd64")
loadStore := templateOf(simdLoadStoreTemplate, "loadstore_amd64")
maskedLoadStore := templateOf(simdMaskedLoadStoreTemplate, "maskedloadstore_amd64")
maskFromBits := templateOf(simdMaskFromBitsTemplate, "maskFromBits_amd64")
maskFromVal := templateOf(simdMaskFromValTemplate, "maskFromVal_amd64")
buffer := new(bytes.Buffer)
@ -542,9 +525,6 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
}
}
} else {
if err := maskFromBits.ExecuteTemplate(buffer, "maskFromBits_amd64", typeDef); err != nil {
panic(fmt.Errorf("failed to execute maskFromBits template for type %s: %w", typeDef.Name, err))
}
if err := maskFromVal.ExecuteTemplate(buffer, "maskFromVal_amd64", typeDef); err != nil {
panic(fmt.Errorf("failed to execute maskFromVal template for type %s: %w", typeDef.Name, err))
}

@ -332,39 +332,6 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si
}
}
func TestBitMaskLoad(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
var bits uint64 = 0b10
results := [2]int64{}
want := [2]int64{0, 6}
m := simd.LoadMask64x2FromBits(&bits)
simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
for i := range 2 {
if results[i] != want[i] {
t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
}
}
}
func TestBitMaskStore(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
var want uint64 = 0b101
var got uint64
x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0})
m := y.Greater(x)
m.StoreToBits(&got)
if got != want {
t.Errorf("Result incorrect: want %b, got %b", want, got)
}
}
func TestBitMaskFromBits(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")

@ -301,22 +301,6 @@ type Mask8x16 struct {
vals [16]int8
}
// LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x16FromBits(y *uint64) Mask8x16
// StoreToBits stores a Mask8x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x16) StoreToBits(y *uint64)
// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
@ -335,22 +319,6 @@ type Mask16x8 struct {
vals [8]int16
}
// LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x8FromBits(y *uint64) Mask16x8
// StoreToBits stores a Mask16x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x8) StoreToBits(y *uint64)
// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
@ -369,22 +337,6 @@ type Mask32x4 struct {
vals [4]int32
}
// LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x4FromBits(y *uint64) Mask32x4
// StoreToBits stores a Mask32x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x4) StoreToBits(y *uint64)
// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
@ -403,22 +355,6 @@ type Mask64x2 struct {
vals [2]int64
}
// LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x2FromBits(y *uint64) Mask64x2
// StoreToBits stores a Mask64x2 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x2) StoreToBits(y *uint64)
// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
@ -728,22 +664,6 @@ type Mask8x32 struct {
vals [32]int8
}
// LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x32FromBits(y *uint64) Mask8x32
// StoreToBits stores a Mask8x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x32) StoreToBits(y *uint64)
// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
@ -762,22 +682,6 @@ type Mask16x16 struct {
vals [16]int16
}
// LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x16FromBits(y *uint64) Mask16x16
// StoreToBits stores a Mask16x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x16) StoreToBits(y *uint64)
// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
@ -796,22 +700,6 @@ type Mask32x8 struct {
vals [8]int32
}
// LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x8FromBits(y *uint64) Mask32x8
// StoreToBits stores a Mask32x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x8) StoreToBits(y *uint64)
// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
@ -830,22 +718,6 @@ type Mask64x4 struct {
vals [4]int64
}
// LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x4FromBits(y *uint64) Mask64x4
// StoreToBits stores a Mask64x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x4) StoreToBits(y *uint64)
// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
@ -1219,22 +1091,6 @@ type Mask8x64 struct {
vals [64]int8
}
// LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x64FromBits(y *uint64) Mask8x64
// StoreToBits stores a Mask8x64 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x64) StoreToBits(y *uint64)
// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
@ -1253,22 +1109,6 @@ type Mask16x32 struct {
vals [32]int16
}
// LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x32FromBits(y *uint64) Mask16x32
// StoreToBits stores a Mask16x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x32) StoreToBits(y *uint64)
// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
@ -1287,22 +1127,6 @@ type Mask32x16 struct {
vals [16]int32
}
// LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x16FromBits(y *uint64) Mask32x16
// StoreToBits stores a Mask32x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x16) StoreToBits(y *uint64)
// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
@ -1321,22 +1145,6 @@ type Mask64x8 struct {
vals [8]int64
}
// LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x8FromBits(y *uint64) Mask64x8
// StoreToBits stores a Mask64x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x8) StoreToBits(y *uint64)
// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//