diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 5b2df50b13a..9e772a71693 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -1494,6 +1494,25 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		ssagen.AddAux(&p.To, v)
 		p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
 
+	case ssa.OpAMD64VPMASK64load512, ssa.OpAMD64VPMASK32load512, ssa.OpAMD64VPMASK16load512, ssa.OpAMD64VPMASK8load512:
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = v.Args[0].Reg()
+		ssagen.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = simdReg(v)
+		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
+		x86.ParseSuffix(p, "Z") // must be zero if not in mask
+
+	case ssa.OpAMD64VPMASK64store512, ssa.OpAMD64VPMASK32store512, ssa.OpAMD64VPMASK16store512, ssa.OpAMD64VPMASK8store512:
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = simdReg(v.Args[2])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = v.Args[0].Reg()
+		ssagen.AddAux(&p.To, v)
+		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
+
 	case ssa.OpAMD64VPMOVMToVec8x16,
 		ssa.OpAMD64VPMOVMToVec8x32,
 		ssa.OpAMD64VPMOVMToVec8x64,
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 1195c0de7f2..5dafc4b563b 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1756,6 +1756,18 @@
 (StoreMasked64 {t} ptr mask val mem) && t.Size() == 16 => (VPMASK64store128 ptr mask val mem)
 (StoreMasked64 {t} ptr mask val mem) && t.Size() == 32 => (VPMASK64store256 ptr mask val mem)
 
+// SIMD vector K-masked loads and stores
+
+(LoadMasked64 ptr mask mem) && t.Size() == 64 => (VPMASK64load512 ptr (VPMOVVec64x8ToM mask) mem)
+(LoadMasked32 ptr mask mem) && t.Size() == 64 => (VPMASK32load512 ptr (VPMOVVec32x16ToM mask) mem)
+(LoadMasked16 ptr mask mem) && t.Size() == 64 => (VPMASK16load512 ptr (VPMOVVec16x32ToM mask) mem)
+(LoadMasked8 ptr mask mem) && t.Size() == 64 => (VPMASK8load512 ptr (VPMOVVec8x64ToM mask) mem)
+
+(StoreMasked64 {t} ptr mask val mem) && t.Size() == 64 => (VPMASK64store512 ptr (VPMOVVec64x8ToM mask) val mem)
+(StoreMasked32 {t} ptr mask val mem) && t.Size() == 64 => (VPMASK32store512 ptr (VPMOVVec32x16ToM mask) val mem)
+(StoreMasked16 {t} ptr mask val mem) && t.Size() == 64 => (VPMASK16store512 ptr (VPMOVVec16x32ToM mask) val mem)
+(StoreMasked8 {t} ptr mask val mem) && t.Size() == 64 => (VPMASK8store512 ptr (VPMOVVec8x64ToM mask) val mem)
+
 (ZeroSIMD ) && t.Size() == 16 => (Zero128 )
 (ZeroSIMD ) && t.Size() == 32 => (Zero256 )
 (ZeroSIMD ) && t.Size() == 64 => (Zero512 )
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index 8ab0b823511..402f50bfc2c 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -205,8 +205,8 @@ func init() {
 		// masked loads/stores, vector register or mask register
 		vloadv = regInfo{inputs: []regMask{gpspsb, v, 0}, outputs: vonly}
 		vstorev = regInfo{inputs: []regMask{gpspsb, v, v, 0}}
-		// vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
-		// vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
+		vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
+		vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
 
 		v01 = regInfo{inputs: nil, outputs: vonly}
 		v11 = regInfo{inputs: vonly, outputs: vonly}
@@ -1286,7 +1286,7 @@ func init() {
 		{name: "VMOVDQUload512", argLength: 2, reg: fpload, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
 		{name: "VMOVDQUstore512", argLength: 3, reg: fpstore, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
 
-		// AVX2 32 and 64-bit element masked moves.
+		// AVX2 32 and 64-bit element int-vector masked moves.
 		{name: "VPMASK32load128", argLength: 3, reg: vloadv, asm: "VPMASKMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=integer mask, arg2 = mem
 		{name: "VPMASK32store128", argLength: 4, reg: vstorev, asm: "VPMASKMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=integer mask, arg3 = mem
 		{name: "VPMASK64load128", argLength: 3, reg: vloadv, asm: "VPMASKMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=integer mask, arg2 = mem
@@ -1297,6 +1297,16 @@
 		{name: "VPMASK64load256", argLength: 3, reg: vloadv, asm: "VPMASKMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=integer mask, arg2 = mem
 		{name: "VPMASK64store256", argLength: 4, reg: vstorev, asm: "VPMASKMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=integer mask, arg3 = mem
 
+		// AVX512 8-64-bit element mask-register masked moves
+		{name: "VPMASK8load512", argLength: 3, reg: vloadk, asm: "VMOVDQU8", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=k mask, arg2 = mem
+		{name: "VPMASK8store512", argLength: 4, reg: vstorek, asm: "VMOVDQU8", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem
+		{name: "VPMASK16load512", argLength: 3, reg: vloadk, asm: "VMOVDQU16", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=k mask, arg2 = mem
+		{name: "VPMASK16store512", argLength: 4, reg: vstorek, asm: "VMOVDQU16", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem
+		{name: "VPMASK32load512", argLength: 3, reg: vloadk, asm: "VMOVDQU32", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=k mask, arg2 = mem
+		{name: "VPMASK32store512", argLength: 4, reg: vstorek, asm: "VMOVDQU32", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem
+		{name: "VPMASK64load512", argLength: 3, reg: vloadk, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=k mask, arg2 = mem
+		{name: "VPMASK64store512", argLength: 4, reg: vstorek, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem
+
 		{name: "VPMOVMToVec8x16", argLength: 1, reg: kv, asm: "VPMOVM2B"},
 		{name: "VPMOVMToVec8x32", argLength: 1, reg: kv, asm: "VPMOVM2B"},
 		{name: "VPMOVMToVec8x64", argLength: 1, reg: kw, asm: "VPMOVM2B"},
diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go
index e714e347e2b..34514abc92f 100644
--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go
@@ -375,8 +375,12 @@ var genericOps = []opData{
 
 	// masked memory operations.
 	// TODO add 16 and 8
+	{name: "LoadMasked8", argLength: 3},  // Load from arg0, arg1 = mask of 8-bits, arg2 = memory
+	{name: "LoadMasked16", argLength: 3}, // Load from arg0, arg1 = mask of 16-bits, arg2 = memory
 	{name: "LoadMasked32", argLength: 3}, // Load from arg0, arg1 = mask of 32-bits, arg2 = memory
 	{name: "LoadMasked64", argLength: 3}, // Load from arg0, arg1 = mask of 64-bits, arg2 = memory
+	{name: "StoreMasked8", argLength: 4, typ: "Mem", aux: "Typ"},  // Store arg2 to arg0, arg1=mask of 8-bits, arg3 = memory
+	{name: "StoreMasked16", argLength: 4, typ: "Mem", aux: "Typ"}, // Store arg2 to arg0, arg1=mask of 16-bits, arg3 = memory
 	{name: "StoreMasked32", argLength: 4, typ: "Mem", aux: "Typ"}, // Store arg2 to arg0, arg1=mask of 32-bits, arg3 = memory
 	{name: "StoreMasked64", argLength: 4, typ: "Mem", aux: "Typ"}, // Store arg2 to arg0, arg1=mask of 64-bits, arg3 = memory
 
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 61ce06203ab..ed0203b6390 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1177,6 +1177,14 @@ const (
 	OpAMD64VPMASK32store256
 	OpAMD64VPMASK64load256
 	OpAMD64VPMASK64store256
+	OpAMD64VPMASK8load512
+	OpAMD64VPMASK8store512
+	OpAMD64VPMASK16load512
+	OpAMD64VPMASK16store512
+	OpAMD64VPMASK32load512
+	OpAMD64VPMASK32store512
+	OpAMD64VPMASK64load512
+	OpAMD64VPMASK64store512
 	OpAMD64VPMOVMToVec8x16
 	OpAMD64VPMOVMToVec8x32
 	OpAMD64VPMOVMToVec8x64
@@ -4270,8 +4278,12 @@ const (
 	OpLoad
 	OpDereference
 	OpStore
+	OpLoadMasked8
+	OpLoadMasked16
 	OpLoadMasked32
 	OpLoadMasked64
+	OpStoreMasked8
+	OpStoreMasked16
 	OpStoreMasked32
 	OpStoreMasked64
 	OpMove
@@ -18661,6 +18673,134 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:           "VPMASK8load512",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AVMOVDQU8,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:           "VPMASK8store512",
+		auxType:        auxSymOff,
+		argLen:         4,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            x86.AVMOVDQU8,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+		},
+	},
+	{
+		name:           "VPMASK16load512",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AVMOVDQU16,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:           "VPMASK16store512",
+		auxType:        auxSymOff,
+		argLen:         4,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            x86.AVMOVDQU16,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+		},
+	},
+	{
+		name:           "VPMASK32load512",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AVMOVDQU32,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:           "VPMASK32store512",
+		auxType:        auxSymOff,
+		argLen:         4,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            x86.AVMOVDQU32,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+		},
+	},
+	{
+		name:           "VPMASK64load512",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AVMOVDQU64,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:           "VPMASK64store512",
+		auxType:        auxSymOff,
+		argLen:         4,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            x86.AVMOVDQU64,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+		},
+	},
 	{
 		name:   "VPMOVMToVec8x16",
 		argLen: 1,
@@ -60363,6 +60503,16 @@
 		argLen:  3,
 		generic: true,
 	},
+	{
+		name:    "LoadMasked8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "LoadMasked16",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "LoadMasked32",
 		argLen:  3,
@@ -60373,6 +60523,18 @@
 		argLen:  3,
 		generic: true,
 	},
+	{
+		name:    "StoreMasked8",
+		auxType: auxTyp,
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "StoreMasked16",
+		auxType: auxTyp,
+		argLen:  4,
+		generic: true,
+	},
 	{
 		name:    "StoreMasked32",
 		auxType: auxTyp,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index d79c856ae8d..986f2568875 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2516,10 +2516,14 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpLoadMask8x32(v)
 	case OpLoadMask8x64:
 		return rewriteValueAMD64_OpLoadMask8x64(v)
+	case OpLoadMasked16:
+		return rewriteValueAMD64_OpLoadMasked16(v)
 	case OpLoadMasked32:
 		return rewriteValueAMD64_OpLoadMasked32(v)
 	case OpLoadMasked64:
 		return rewriteValueAMD64_OpLoadMasked64(v)
+	case OpLoadMasked8:
+		return rewriteValueAMD64_OpLoadMasked8(v)
 	case OpLocalAddr:
 		return rewriteValueAMD64_OpLocalAddr(v)
 	case OpLsh16x16:
@@ -5266,10 +5270,14 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpStoreMask8x32(v)
 	case OpStoreMask8x64:
 		return rewriteValueAMD64_OpStoreMask8x64(v)
+	case OpStoreMasked16:
+		return rewriteValueAMD64_OpStoreMasked16(v)
 	case OpStoreMasked32:
 		return rewriteValueAMD64_OpStoreMasked32(v)
 	case OpStoreMasked64:
 		return rewriteValueAMD64_OpStoreMasked64(v)
+	case OpStoreMasked8:
+		return rewriteValueAMD64_OpStoreMasked8(v)
 	case OpSub16:
 		v.Op = OpAMD64SUBL
 		return true
@@ -40881,10 +40889,35 @@ func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpLoadMasked16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMasked16 ptr mask mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK16load512 ptr (VPMOVVec16x32ToM mask) mem)
+	for {
+		t := v.Type
+		ptr := v_0
+		mask := v_1
+		mem := v_2
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK16load512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpLoadMasked32(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
 	// match: (LoadMasked32 ptr mask mem)
 	// cond: t.Size() == 16
 	// result: (VPMASK32load128 ptr mask mem)
@@ -40915,12 +40948,30 @@ func rewriteValueAMD64_OpLoadMasked32(v *Value) bool {
 		v.AddArg3(ptr, mask, mem)
 		return true
 	}
+	// match: (LoadMasked32 ptr mask mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK32load512 ptr (VPMOVVec32x16ToM mask) mem)
+	for {
+		t := v.Type
+		ptr := v_0
+		mask := v_1
+		mem := v_2
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK32load512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpLoadMasked64(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
 	// match: (LoadMasked64 ptr mask mem)
 	// cond: t.Size() == 16
 	// result: (VPMASK64load128 ptr mask mem)
@@ -40951,6 +41002,47 @@ func rewriteValueAMD64_OpLoadMasked64(v *Value) bool {
 		v.AddArg3(ptr, mask, mem)
 		return true
 	}
+	// match: (LoadMasked64 ptr mask mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK64load512 ptr (VPMOVVec64x8ToM mask) mem)
+	for {
+		t := v.Type
+		ptr := v_0
+		mask := v_1
+		mem := v_2
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK64load512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpLoadMasked8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMasked8 ptr mask mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK8load512 ptr (VPMOVVec8x64ToM mask) mem)
+	for {
+		t := v.Type
+		ptr := v_0
+		mask := v_1
+		mem := v_2
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK8load512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(ptr, v0, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpLocalAddr(v *Value) bool {
@@ -53915,11 +54007,38 @@ func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpStoreMasked16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (StoreMasked16 {t} ptr mask val mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK16store512 ptr (VPMOVVec16x32ToM mask) val mem)
+	for {
+		t := auxToType(v.Aux)
+		ptr := v_0
+		mask := v_1
+		val := v_2
+		mem := v_3
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK16store512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(ptr, v0, val, mem)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpStoreMasked32(v *Value) bool {
 	v_3 := v.Args[3]
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
 	// match: (StoreMasked32 {t} ptr mask val mem)
 	// cond: t.Size() == 16
 	// result: (VPMASK32store128 ptr mask val mem)
@@ -53952,6 +54071,24 @@ func rewriteValueAMD64_OpStoreMasked32(v *Value) bool {
 		v.AddArg4(ptr, mask, val, mem)
 		return true
 	}
+	// match: (StoreMasked32 {t} ptr mask val mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK32store512 ptr (VPMOVVec32x16ToM mask) val mem)
+	for {
+		t := auxToType(v.Aux)
+		ptr := v_0
+		mask := v_1
+		val := v_2
+		mem := v_3
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK32store512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(ptr, v0, val, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpStoreMasked64(v *Value) bool {
@@ -53959,6 +54096,7 @@ func rewriteValueAMD64_OpStoreMasked64(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
 	// match: (StoreMasked64 {t} ptr mask val mem)
 	// cond: t.Size() == 16
 	// result: (VPMASK64store128 ptr mask val mem)
@@ -53991,6 +54129,50 @@ func rewriteValueAMD64_OpStoreMasked64(v *Value) bool {
 		v.AddArg4(ptr, mask, val, mem)
 		return true
 	}
+	// match: (StoreMasked64 {t} ptr mask val mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK64store512 ptr (VPMOVVec64x8ToM mask) val mem)
+	for {
+		t := auxToType(v.Aux)
+		ptr := v_0
+		mask := v_1
+		val := v_2
+		mem := v_3
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK64store512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(ptr, v0, val, mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpStoreMasked8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (StoreMasked8 {t} ptr mask val mem)
+	// cond: t.Size() == 64
+	// result: (VPMASK8store512 ptr (VPMOVVec8x64ToM mask) val mem)
+	for {
+		t := auxToType(v.Aux)
+		ptr := v_0
+		mask := v_1
+		val := v_2
+		mem := v_3
+		if !(t.Size() == 64) {
+			break
+		}
+		v.reset(OpAMD64VPMASK8store512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(ptr, v0, val, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool {
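
Note (illustrative sketch, not part of the change): with these rules, a 512-bit masked access such as
(LoadMasked32 ptr mask mem) with t.Size() == 64 lowers to VPMASK32load512, after the vector mask is
converted to a K register by VPMOVVec32x16ToM. Assuming the Go assembler's operand order for
EVEX-masked moves (mask register written immediately before the destination) and purely illustrative
register choices, the instructions emitted by the new ssaGenValue cases would look roughly like:

	VMOVDQU32.Z (AX), K1, Z0 // masked 512-bit load; elements whose mask bit is clear are zeroed (the "Z" suffix)
	VMOVDQU32 Z0, K1, (AX)   // masked 512-bit store; elements whose mask bit is clear are left unwritten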