[dev.simd] cmd/compile, simd: remove mask loads and stores

We already have conversions between masks and bit patterns; the mask
load and store APIs are inconsistent with them, and mask loads and
stores can instead be recovered by peephole optimizations. So this CL
removes them; the next CL will add the corresponding peepholes.
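For callers, the removed Load*FromBits and StoreToBits helpers can be
expressed with the surviving value-based FromBits/ToBits conversions
plus an ordinary integer load or store, which the planned peephole is
then expected to fuse back into a single KMOV memory operation. A
minimal sketch in the style of the test removed below, assuming the
Mask64x2 conversions use a small unsigned integer (uint8 here; the
exact widths are not confirmed by this diff):

    package main

    import "simd" // experimental package used by the tests in this CL

    func main() {
        // Requires AVX512, as the removed Load*FromBits docs noted.
        var bits uint64 = 0b10

        // Before this CL: m := simd.LoadMask64x2FromBits(&bits)
        m := simd.Mask64x2FromBits(uint8(bits)) // ordinary load of bits, then value conversion

        x := simd.LoadInt64x2Slice([]int64{1, 2})
        y := simd.LoadInt64x2Slice([]int64{3, 4})
        var out [2]int64
        x.Add(y).Masked(m).Store(&out) // out == {0, 6}, as in the removed TestBitMaskLoad

        // Before this CL: m.StoreToBits(&bits)
        bits = uint64(m.ToBits()) // value conversion, then ordinary store
    }

With the follow-up peephole in place, a FromBits/ToBits conversion fed
directly from or into memory is expected to compile back to the same
KMOVQload/KMOVQstore sequences that the removed rules produced.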

Change-Id: Ifa7d23fb52bb0efd1785935ead4d703927f16d2b
Reviewed-on: https://go-review.googlesource.com/c/go/+/710915
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Junyang Shao 2025-10-10 17:42:59 +00:00
parent c4fbf3b4cf
commit 2e71cf1a2a
10 changed files with 1 addition and 1014 deletions

@ -1641,41 +1641,6 @@
// SIMD lowering rules
// Mask loads
(LoadMask8x16 <t> ptr mem) => (VPMOVMToVec8x16 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask8x32 <t> ptr mem) => (VPMOVMToVec8x32 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask8x64 <t> ptr mem) => (VPMOVMToVec8x64 <types.TypeVec512> (KMOVQload <t> ptr mem))
(LoadMask16x8 <t> ptr mem) => (VPMOVMToVec16x8 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask16x16 <t> ptr mem) => (VPMOVMToVec16x16 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask16x32 <t> ptr mem) => (VPMOVMToVec16x32 <types.TypeVec512> (KMOVQload <t> ptr mem))
(LoadMask32x4 <t> ptr mem) => (VPMOVMToVec32x4 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask32x8 <t> ptr mem) => (VPMOVMToVec32x8 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask32x16 <t> ptr mem) => (VPMOVMToVec32x16 <types.TypeVec512> (KMOVQload <t> ptr mem))
(LoadMask64x2 <t> ptr mem) => (VPMOVMToVec64x2 <types.TypeVec128> (KMOVQload <t> ptr mem))
(LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
(StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
(StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
(StoreMask8x64 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
(StoreMask16x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
(StoreMask16x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
(StoreMask16x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
(StoreMask32x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
(StoreMask32x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
(StoreMask32x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
(StoreMask64x2 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
(StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
// TODO is this correct? Should we just do it all from 64-bits?
// Mask conversions
// integers to masks
(Cvt16toMask8x16 <t> x) => (VPMOVMToVec8x16 <types.TypeVec128> (KMOVWk <t> x))

@ -677,31 +677,6 @@ var genericOps = []opData{
// SIMD
{name: "ZeroSIMD", argLength: 0}, // zero value of a vector
{name: "LoadMask8x16", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask8x32", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask8x64", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask16x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask16x16", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask16x32", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask32x4", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask32x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask32x16", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "StoreMask8x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x64", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
// Convert integers to masks
{name: "Cvt16toMask8x16", argLength: 1}, // arg0 = integer mask value

@ -5364,30 +5364,6 @@ const (
OpPrefetchCache
OpPrefetchCacheStreamed
OpZeroSIMD
OpLoadMask8x16
OpLoadMask8x32
OpLoadMask8x64
OpLoadMask16x8
OpLoadMask16x16
OpLoadMask16x32
OpLoadMask32x4
OpLoadMask32x8
OpLoadMask32x16
OpLoadMask64x2
OpLoadMask64x4
OpLoadMask64x8
OpStoreMask8x16
OpStoreMask8x32
OpStoreMask8x64
OpStoreMask16x8
OpStoreMask16x16
OpStoreMask16x32
OpStoreMask32x4
OpStoreMask32x8
OpStoreMask32x16
OpStoreMask64x2
OpStoreMask64x4
OpStoreMask64x8
OpCvt16toMask8x16
OpCvt32toMask8x32
OpCvt64toMask8x64
@ -75965,138 +75941,6 @@ var opcodeTable = [...]opInfo{
argLen: 0,
generic: true,
},
{
name: "LoadMask8x16",
argLen: 2,
generic: true,
},
{
name: "LoadMask8x32",
argLen: 2,
generic: true,
},
{
name: "LoadMask8x64",
argLen: 2,
generic: true,
},
{
name: "LoadMask16x8",
argLen: 2,
generic: true,
},
{
name: "LoadMask16x16",
argLen: 2,
generic: true,
},
{
name: "LoadMask16x32",
argLen: 2,
generic: true,
},
{
name: "LoadMask32x4",
argLen: 2,
generic: true,
},
{
name: "LoadMask32x8",
argLen: 2,
generic: true,
},
{
name: "LoadMask32x16",
argLen: 2,
generic: true,
},
{
name: "LoadMask64x2",
argLen: 2,
generic: true,
},
{
name: "LoadMask64x4",
argLen: 2,
generic: true,
},
{
name: "LoadMask64x8",
argLen: 2,
generic: true,
},
{
name: "StoreMask8x16",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask8x32",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask8x64",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask16x8",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask16x16",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask16x32",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask32x4",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask32x8",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask32x16",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask64x2",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask64x4",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "StoreMask64x8",
auxType: auxTyp,
argLen: 3,
generic: true,
},
{
name: "Cvt16toMask8x16",
argLen: 1,

@ -3769,30 +3769,6 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpLessUint8x64(v)
case OpLoad:
return rewriteValueAMD64_OpLoad(v)
case OpLoadMask16x16:
return rewriteValueAMD64_OpLoadMask16x16(v)
case OpLoadMask16x32:
return rewriteValueAMD64_OpLoadMask16x32(v)
case OpLoadMask16x8:
return rewriteValueAMD64_OpLoadMask16x8(v)
case OpLoadMask32x16:
return rewriteValueAMD64_OpLoadMask32x16(v)
case OpLoadMask32x4:
return rewriteValueAMD64_OpLoadMask32x4(v)
case OpLoadMask32x8:
return rewriteValueAMD64_OpLoadMask32x8(v)
case OpLoadMask64x2:
return rewriteValueAMD64_OpLoadMask64x2(v)
case OpLoadMask64x4:
return rewriteValueAMD64_OpLoadMask64x4(v)
case OpLoadMask64x8:
return rewriteValueAMD64_OpLoadMask64x8(v)
case OpLoadMask8x16:
return rewriteValueAMD64_OpLoadMask8x16(v)
case OpLoadMask8x32:
return rewriteValueAMD64_OpLoadMask8x32(v)
case OpLoadMask8x64:
return rewriteValueAMD64_OpLoadMask8x64(v)
case OpLoadMasked16:
return rewriteValueAMD64_OpLoadMasked16(v)
case OpLoadMasked32:
@ -5636,30 +5612,6 @@ func rewriteValueAMD64(v *Value) bool {
return true
case OpStore:
return rewriteValueAMD64_OpStore(v)
case OpStoreMask16x16:
return rewriteValueAMD64_OpStoreMask16x16(v)
case OpStoreMask16x32:
return rewriteValueAMD64_OpStoreMask16x32(v)
case OpStoreMask16x8:
return rewriteValueAMD64_OpStoreMask16x8(v)
case OpStoreMask32x16:
return rewriteValueAMD64_OpStoreMask32x16(v)
case OpStoreMask32x4:
return rewriteValueAMD64_OpStoreMask32x4(v)
case OpStoreMask32x8:
return rewriteValueAMD64_OpStoreMask32x8(v)
case OpStoreMask64x2:
return rewriteValueAMD64_OpStoreMask64x2(v)
case OpStoreMask64x4:
return rewriteValueAMD64_OpStoreMask64x4(v)
case OpStoreMask64x8:
return rewriteValueAMD64_OpStoreMask64x8(v)
case OpStoreMask8x16:
return rewriteValueAMD64_OpStoreMask8x16(v)
case OpStoreMask8x32:
return rewriteValueAMD64_OpStoreMask8x32(v)
case OpStoreMask8x64:
return rewriteValueAMD64_OpStoreMask8x64(v)
case OpStoreMasked16:
return rewriteValueAMD64_OpStoreMasked16(v)
case OpStoreMasked32:
@ -54997,222 +54949,6 @@ func rewriteValueAMD64_OpLoad(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpLoadMask16x16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask16x16 <t> ptr mem)
// result: (VPMOVMToVec16x16 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec16x16)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask16x32(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask16x32 <t> ptr mem)
// result: (VPMOVMToVec16x32 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec16x32)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask16x8(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask16x8 <t> ptr mem)
// result: (VPMOVMToVec16x8 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec16x8)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask32x16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask32x16 <t> ptr mem)
// result: (VPMOVMToVec32x16 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec32x16)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask32x4(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask32x4 <t> ptr mem)
// result: (VPMOVMToVec32x4 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec32x4)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask32x8(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask32x8 <t> ptr mem)
// result: (VPMOVMToVec32x8 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec32x8)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask64x2(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask64x2 <t> ptr mem)
// result: (VPMOVMToVec64x2 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec64x2)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask64x4(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask64x4 <t> ptr mem)
// result: (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec64x4)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask64x8(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask64x8 <t> ptr mem)
// result: (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec64x8)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask8x16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask8x16 <t> ptr mem)
// result: (VPMOVMToVec8x16 <types.TypeVec128> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec8x16)
v.Type = types.TypeVec128
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask8x32(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask8x32 <t> ptr mem)
// result: (VPMOVMToVec8x32 <types.TypeVec256> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec8x32)
v.Type = types.TypeVec256
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (LoadMask8x64 <t> ptr mem)
// result: (VPMOVMToVec8x64 <types.TypeVec512> (KMOVQload <t> ptr mem))
for {
t := v.Type
ptr := v_0
mem := v_1
v.reset(OpAMD64VPMOVMToVec8x64)
v.Type = types.TypeVec512
v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
v0.AddArg2(ptr, mem)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpLoadMasked16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -59830,234 +59566,6 @@ func rewriteValueAMD64_OpStore(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpStoreMask16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask16x32(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x32 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask16x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x4(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x4 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x2(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x2 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x4(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x4 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x32(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x32 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x64 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMasked16(v *Value) bool {
v_3 := v.Args[3]
v_2 := v.Args[2]

@ -2024,13 +2024,6 @@ func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
}
}
var loadMaskOpcodes = map[int]map[int]ssa.Op{
8: {16: ssa.OpLoadMask8x16, 32: ssa.OpLoadMask8x32, 64: ssa.OpLoadMask8x64},
16: {8: ssa.OpLoadMask16x8, 16: ssa.OpLoadMask16x16, 32: ssa.OpLoadMask16x32},
32: {4: ssa.OpLoadMask32x4, 8: ssa.OpLoadMask32x8, 16: ssa.OpLoadMask32x16},
64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8},
}
var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{
8: {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64},
16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32},
@ -2045,33 +2038,6 @@ var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{
64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8},
}
func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
op := loadMaskOpcodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
return s.newValue2(op, types.TypeMask, args[0], s.mem())
}
}
func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
opCodes := map[int]map[int]ssa.Op{
8: {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64},
16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32},
32: {4: ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16},
64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8},
}
op := opCodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem())
return nil
}
}
func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
op := cvtVToMaskOpcodes[elemBits][lanes]

@ -1685,96 +1685,72 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Int8x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16FromBits", simdCvtVToMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16.ToBits", simdCvtMaskToV(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x32.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32FromBits", simdCvtVToMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32.ToBits", simdCvtMaskToV(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x64.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64FromBits", simdCvtVToMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64.ToBits", simdCvtMaskToV(8, 64), sys.AMD64)
addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8FromBits", simdCvtVToMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8.ToBits", simdCvtMaskToV(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16FromBits", simdCvtVToMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16.ToBits", simdCvtMaskToV(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x32.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32FromBits", simdCvtVToMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32.ToBits", simdCvtMaskToV(16, 32), sys.AMD64)
addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x4.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4FromBits", simdCvtVToMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4.ToBits", simdCvtMaskToV(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8FromBits", simdCvtVToMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8.ToBits", simdCvtMaskToV(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x16.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16FromBits", simdCvtVToMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16.ToBits", simdCvtMaskToV(32, 16), sys.AMD64)
addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x2.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2FromBits", simdCvtVToMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2.ToBits", simdCvtMaskToV(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x4.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4FromBits", simdCvtVToMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4.ToBits", simdCvtMaskToV(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x8.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8FromBits", simdCvtVToMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.ToBits", simdCvtMaskToV(64, 8), sys.AMD64)
}

@ -80,8 +80,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "{{.VectorCounterpart}}.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "{{.Name}}.And", opLen2(ssa.OpAnd{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
addF(simdPackage, "{{.Name}}.Or", opLen2(ssa.OpOr{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
addF(simdPackage, "Load{{.Name}}FromBits", simdLoadMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
addF(simdPackage, "{{.Name}}.StoreToBits", simdStoreMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
addF(simdPackage, "{{.Name}}FromBits", simdCvtVToMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
addF(simdPackage, "{{.Name}}.ToBits", simdCvtMaskToV({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
{{end}}

@ -180,22 +180,6 @@ func Load{{.Name}}(y *[{{.Lanes}}]{{.Base}}) {{.Name}}
func (x {{.Name}}) Store(y *[{{.Lanes}}]{{.Base}})
`
const simdMaskFromBitsTemplate = `
// Load{{.Name}}FromBits constructs a {{.Name}} from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower {{.Lanes}} bits of y are used.
//
// CPU Features: AVX512
//go:noescape
func Load{{.Name}}FromBits(y *uint64) {{.Name}}
// StoreToBits stores a {{.Name}} as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower {{.Lanes}} bits of y are used.
//
// CPU Features: AVX512
//go:noescape
func (x {{.Name}}) StoreToBits(y *uint64)
`
const simdMaskFromValTemplate = `
// {{.Name}}FromBits constructs a {{.Name}} from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower {{.Lanes}} bits of y are used.
@ -503,7 +487,6 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
t := templateOf(simdTypesTemplates, "types_amd64")
loadStore := templateOf(simdLoadStoreTemplate, "loadstore_amd64")
maskedLoadStore := templateOf(simdMaskedLoadStoreTemplate, "maskedloadstore_amd64")
maskFromBits := templateOf(simdMaskFromBitsTemplate, "maskFromBits_amd64")
maskFromVal := templateOf(simdMaskFromValTemplate, "maskFromVal_amd64")
buffer := new(bytes.Buffer)
@ -542,9 +525,6 @@ func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
}
}
} else {
if err := maskFromBits.ExecuteTemplate(buffer, "maskFromBits_amd64", typeDef); err != nil {
panic(fmt.Errorf("failed to execute maskFromBits template for type %s: %w", typeDef.Name, err))
}
if err := maskFromVal.ExecuteTemplate(buffer, "maskFromVal_amd64", typeDef); err != nil {
panic(fmt.Errorf("failed to execute maskFromVal template for type %s: %w", typeDef.Name, err))
}

@ -332,39 +332,6 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si
}
}
func TestBitMaskLoad(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
var bits uint64 = 0b10
results := [2]int64{}
want := [2]int64{0, 6}
m := simd.LoadMask64x2FromBits(&bits)
simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
for i := range 2 {
if results[i] != want[i] {
t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
}
}
}
func TestBitMaskStore(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
var want uint64 = 0b101
var got uint64
x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0})
m := y.Greater(x)
m.StoreToBits(&got)
if got != want {
t.Errorf("Result incorrect: want %b, got %b", want, got)
}
}
func TestBitMaskFromBits(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")

@ -301,22 +301,6 @@ type Mask8x16 struct {
vals [16]int8
}
// LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x16FromBits(y *uint64) Mask8x16
// StoreToBits stores a Mask8x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x16) StoreToBits(y *uint64)
// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
@ -335,22 +319,6 @@ type Mask16x8 struct {
vals [8]int16
}
// LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x8FromBits(y *uint64) Mask16x8
// StoreToBits stores a Mask16x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x8) StoreToBits(y *uint64)
// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
@ -369,22 +337,6 @@ type Mask32x4 struct {
vals [4]int32
}
// LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x4FromBits(y *uint64) Mask32x4
// StoreToBits stores a Mask32x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x4) StoreToBits(y *uint64)
// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
@ -403,22 +355,6 @@ type Mask64x2 struct {
vals [2]int64
}
// LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x2FromBits(y *uint64) Mask64x2
// StoreToBits stores a Mask64x2 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x2) StoreToBits(y *uint64)
// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
@ -728,22 +664,6 @@ type Mask8x32 struct {
vals [32]int8
}
// LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x32FromBits(y *uint64) Mask8x32
// StoreToBits stores a Mask8x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x32) StoreToBits(y *uint64)
// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
@ -762,22 +682,6 @@ type Mask16x16 struct {
vals [16]int16
}
// LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x16FromBits(y *uint64) Mask16x16
// StoreToBits stores a Mask16x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x16) StoreToBits(y *uint64)
// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
@ -796,22 +700,6 @@ type Mask32x8 struct {
vals [8]int32
}
// LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x8FromBits(y *uint64) Mask32x8
// StoreToBits stores a Mask32x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x8) StoreToBits(y *uint64)
// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
@ -830,22 +718,6 @@ type Mask64x4 struct {
vals [4]int64
}
// LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x4FromBits(y *uint64) Mask64x4
// StoreToBits stores a Mask64x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x4) StoreToBits(y *uint64)
// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
@ -1219,22 +1091,6 @@ type Mask8x64 struct {
vals [64]int8
}
// LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x64FromBits(y *uint64) Mask8x64
// StoreToBits stores a Mask8x64 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x64) StoreToBits(y *uint64)
// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
@ -1253,22 +1109,6 @@ type Mask16x32 struct {
vals [32]int16
}
// LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x32FromBits(y *uint64) Mask16x32
// StoreToBits stores a Mask16x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x32) StoreToBits(y *uint64)
// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
@ -1287,22 +1127,6 @@ type Mask32x16 struct {
vals [16]int32
}
// LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x16FromBits(y *uint64) Mask32x16
// StoreToBits stores a Mask32x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x16) StoreToBits(y *uint64)
// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
@ -1321,22 +1145,6 @@ type Mask64x8 struct {
vals [8]int64
}
// LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x8FromBits(y *uint64) Mask64x8
// StoreToBits stores a Mask64x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x8) StoreToBits(y *uint64)
// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//