diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 8847580e254..9a4203f7c67 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -1715,7 +1715,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() - case ssa.OpAMD64KMOVQ, ssa.OpAMD64KMOVD, ssa.OpAMD64KMOVW, ssa.OpAMD64KMOVB: + case ssa.OpAMD64KMOVQk, ssa.OpAMD64KMOVDk, ssa.OpAMD64KMOVWk, ssa.OpAMD64KMOVBk, + ssa.OpAMD64KMOVQi, ssa.OpAMD64KMOVDi, ssa.OpAMD64KMOVWi, ssa.OpAMD64KMOVBi: // See also ssa.OpAMD64KMOVQload p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index dd9deef4afb..8da4a031b47 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1669,21 +1669,21 @@ // XXX SIMD // Mask loads -(LoadMask8x16 ptr mem) => (VPMOVMToVec8x16 (KMOVQload ptr mem)) -(LoadMask8x32 ptr mem) => (VPMOVMToVec8x32 (KMOVQload ptr mem)) -(LoadMask8x64 ptr mem) => (VPMOVMToVec8x64 (KMOVQload ptr mem)) +(LoadMask8x16 ptr mem) => (VPMOVMToVec8x16 (KMOVQload ptr mem)) +(LoadMask8x32 ptr mem) => (VPMOVMToVec8x32 (KMOVQload ptr mem)) +(LoadMask8x64 ptr mem) => (VPMOVMToVec8x64 (KMOVQload ptr mem)) -(LoadMask16x8 ptr mem) => (VPMOVMToVec16x8 (KMOVQload ptr mem)) -(LoadMask16x16 ptr mem) => (VPMOVMToVec16x16 (KMOVQload ptr mem)) -(LoadMask16x32 ptr mem) => (VPMOVMToVec16x32 (KMOVQload ptr mem)) +(LoadMask16x8 ptr mem) => (VPMOVMToVec16x8 (KMOVQload ptr mem)) +(LoadMask16x16 ptr mem) => (VPMOVMToVec16x16 (KMOVQload ptr mem)) +(LoadMask16x32 ptr mem) => (VPMOVMToVec16x32 (KMOVQload ptr mem)) -(LoadMask32x4 ptr mem) => (VPMOVMToVec32x4 (KMOVQload ptr mem)) -(LoadMask32x8 ptr mem) => (VPMOVMToVec32x8 (KMOVQload ptr mem)) -(LoadMask32x16 ptr mem) => (VPMOVMToVec32x16 (KMOVQload ptr mem)) +(LoadMask32x4 ptr mem) => (VPMOVMToVec32x4 (KMOVQload ptr mem)) +(LoadMask32x8 ptr mem) => (VPMOVMToVec32x8 (KMOVQload ptr mem)) +(LoadMask32x16 ptr mem) => (VPMOVMToVec32x16 (KMOVQload ptr mem)) -(LoadMask64x2 ptr mem) => (VPMOVMToVec64x2 (KMOVQload ptr mem)) -(LoadMask64x4 ptr mem) => (VPMOVMToVec64x4 (KMOVQload ptr mem)) -(LoadMask64x8 ptr mem) => (VPMOVMToVec64x8 (KMOVQload ptr mem)) +(LoadMask64x2 ptr mem) => (VPMOVMToVec64x2 (KMOVQload ptr mem)) +(LoadMask64x4 ptr mem) => (VPMOVMToVec64x4 (KMOVQload ptr mem)) +(LoadMask64x8 ptr mem) => (VPMOVMToVec64x8 (KMOVQload ptr mem)) (StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM val) mem) (StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM val) mem) @@ -1703,22 +1703,40 @@ // TODO is this correct? Should we just do it all from 64-bits? 
-// Mask conversions (from integers) -(Cvt16toMask8x16 x) => (VPMOVMToVec8x16 (KMOVW x)) -(Cvt32toMask8x32 x) => (VPMOVMToVec8x32 (KMOVD x)) -(Cvt64toMask8x64 x) => (VPMOVMToVec8x64 (KMOVQ x)) +// Mask conversions +// integers to masks +(Cvt16toMask8x16 x) => (VPMOVMToVec8x16 (KMOVWk x)) +(Cvt32toMask8x32 x) => (VPMOVMToVec8x32 (KMOVDk x)) +(Cvt64toMask8x64 x) => (VPMOVMToVec8x64 (KMOVQk x)) -(Cvt8toMask16x8 x) => (VPMOVMToVec16x8 (KMOVB x)) -(Cvt16toMask16x16 x) => (VPMOVMToVec16x16 (KMOVW x)) -(Cvt32toMask16x32 x) => (VPMOVMToVec16x32 (KMOVD x)) +(Cvt8toMask16x8 x) => (VPMOVMToVec16x8 (KMOVBk x)) +(Cvt16toMask16x16 x) => (VPMOVMToVec16x16 (KMOVWk x)) +(Cvt32toMask16x32 x) => (VPMOVMToVec16x32 (KMOVDk x)) -(Cvt8toMask32x4 x) => (VPMOVMToVec32x4 (KMOVB x)) -(Cvt8toMask32x8 x) => (VPMOVMToVec32x8 (KMOVB x)) -(Cvt16toMask32x16 x) => (VPMOVMToVec32x16 (KMOVW x)) +(Cvt8toMask32x4 x) => (VPMOVMToVec32x4 (KMOVBk x)) +(Cvt8toMask32x8 x) => (VPMOVMToVec32x8 (KMOVBk x)) +(Cvt16toMask32x16 x) => (VPMOVMToVec32x16 (KMOVWk x)) -(Cvt8toMask64x2 x) => (VPMOVMToVec64x2 (KMOVB x)) -(Cvt8toMask64x4 x) => (VPMOVMToVec64x4 (KMOVB x)) -(Cvt8toMask64x8 x) => (VPMOVMToVec64x8 (KMOVB x)) +(Cvt8toMask64x2 x) => (VPMOVMToVec64x2 (KMOVBk x)) +(Cvt8toMask64x4 x) => (VPMOVMToVec64x4 (KMOVBk x)) +(Cvt8toMask64x8 x) => (VPMOVMToVec64x8 (KMOVBk x)) + +// masks to integers +(CvtMask8x16to16 x) => (KMOVWi (VPMOVVec8x16ToM x)) +(CvtMask8x32to32 x) => (KMOVDi (VPMOVVec8x32ToM x)) +(CvtMask8x64to64 x) => (KMOVQi (VPMOVVec8x64ToM x)) + +(CvtMask16x8to8 x) => (KMOVBi (VPMOVVec16x8ToM x)) +(CvtMask16x16to16 x) => (KMOVWi (VPMOVVec16x16ToM x)) +(CvtMask16x32to32 x) => (KMOVDi (VPMOVVec16x32ToM x)) + +(CvtMask32x4to8 x) => (KMOVBi (VPMOVVec32x4ToM x)) +(CvtMask32x8to8 x) => (KMOVBi (VPMOVVec32x8ToM x)) +(CvtMask32x16to16 x) => (KMOVWi (VPMOVVec32x16ToM x)) + +(CvtMask64x2to8 x) => (KMOVBi (VPMOVVec64x2ToM x)) +(CvtMask64x4to8 x) => (KMOVBi (VPMOVVec64x4ToM x)) +(CvtMask64x8to8 x) => (KMOVBi (VPMOVVec64x8ToM x)) // SIMD vector loads and stores (Load ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index bc30e6574fe..fdc80c9a805 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -242,6 +242,7 @@ func init() { kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly} kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}} gpk = regInfo{inputs: gponly, outputs: maskonly} + kgp = regInfo{inputs: maskonly, outputs: gponly} prefreg = regInfo{inputs: []regMask{gpspsbg}} ) @@ -1367,10 +1368,14 @@ func init() { {name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // Move GP directly to mask register - {name: "KMOVQ", argLength: 1, reg: gpk, asm: "KMOVQ"}, - {name: "KMOVD", argLength: 1, reg: gpk, asm: "KMOVD"}, - {name: "KMOVW", argLength: 1, reg: gpk, asm: "KMOVW"}, - {name: "KMOVB", argLength: 1, reg: gpk, asm: "KMOVB"}, + {name: "KMOVQk", argLength: 1, reg: gpk, asm: "KMOVQ"}, + {name: "KMOVDk", argLength: 1, reg: gpk, asm: "KMOVD"}, + {name: "KMOVWk", argLength: 1, reg: gpk, asm: "KMOVW"}, + {name: "KMOVBk", argLength: 1, reg: gpk, asm: "KMOVB"}, + {name: "KMOVQi", argLength: 1, reg: kgp, asm: "KMOVQ"}, + {name: "KMOVDi", argLength: 1, reg: kgp, asm: "KMOVD"}, + {name: "KMOVWi", argLength: 1, reg: kgp, asm: "KMOVW"}, + {name: "KMOVBi", argLength: 1, reg: kgp, asm: "KMOVB"}, } var AMD64blocks = 
[]blockData{ diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index 34514abc92f..26f3e758bdf 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -717,6 +717,20 @@ var genericOps = []opData{ {name: "Cvt8toMask64x2", argLength: 1}, // arg0 = integer mask value {name: "Cvt8toMask64x4", argLength: 1}, // arg0 = integer mask value {name: "Cvt8toMask64x8", argLength: 1}, // arg0 = integer mask value + + // Convert masks to integers + {name: "CvtMask8x16to16", argLength: 1}, // arg0 = mask + {name: "CvtMask8x32to32", argLength: 1}, // arg0 = mask + {name: "CvtMask8x64to64", argLength: 1}, // arg0 = mask + {name: "CvtMask16x8to8", argLength: 1}, // arg0 = mask + {name: "CvtMask16x16to16", argLength: 1}, // arg0 = mask + {name: "CvtMask16x32to32", argLength: 1}, // arg0 = mask + {name: "CvtMask32x4to8", argLength: 1}, // arg0 = mask + {name: "CvtMask32x8to8", argLength: 1}, // arg0 = mask + {name: "CvtMask32x16to16", argLength: 1}, // arg0 = mask + {name: "CvtMask64x2to8", argLength: 1}, // arg0 = mask + {name: "CvtMask64x4to8", argLength: 1}, // arg0 = mask + {name: "CvtMask64x8to8", argLength: 1}, // arg0 = mask } // kind controls successors implicit exit diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 2fafe10ea51..7c135ea692c 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1218,10 +1218,14 @@ const ( OpAMD64VZEROALL OpAMD64KMOVQload OpAMD64KMOVQstore - OpAMD64KMOVQ - OpAMD64KMOVD - OpAMD64KMOVW - OpAMD64KMOVB + OpAMD64KMOVQk + OpAMD64KMOVDk + OpAMD64KMOVWk + OpAMD64KMOVBk + OpAMD64KMOVQi + OpAMD64KMOVDi + OpAMD64KMOVWi + OpAMD64KMOVBi OpAMD64VADDPD128 OpAMD64VADDPD256 OpAMD64VADDPD512 @@ -4582,6 +4586,18 @@ const ( OpCvt8toMask64x2 OpCvt8toMask64x4 OpCvt8toMask64x8 + OpCvtMask8x16to16 + OpCvtMask8x32to32 + OpCvtMask8x64to64 + OpCvtMask16x8to8 + OpCvtMask16x16to16 + OpCvtMask16x32to32 + OpCvtMask32x4to8 + OpCvtMask32x8to8 + OpCvtMask32x16to16 + OpCvtMask64x2to8 + OpCvtMask64x4to8 + OpCvtMask64x8to8 OpAbsoluteInt8x16 OpAbsoluteInt8x32 OpAbsoluteInt8x64 @@ -19400,7 +19416,7 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "KMOVQ", + name: "KMOVQk", argLen: 1, asm: x86.AKMOVQ, reg: regInfo{ @@ -19413,7 +19429,7 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "KMOVD", + name: "KMOVDk", argLen: 1, asm: x86.AKMOVD, reg: regInfo{ @@ -19426,7 +19442,7 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "KMOVW", + name: "KMOVWk", argLen: 1, asm: x86.AKMOVW, reg: regInfo{ @@ -19439,7 +19455,7 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "KMOVB", + name: "KMOVBk", argLen: 1, asm: x86.AKMOVB, reg: regInfo{ @@ -19451,6 +19467,58 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "KMOVQi", + argLen: 1, + asm: x86.AKMOVQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "KMOVDi", + argLen: 1, + asm: x86.AKMOVD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "KMOVWi", + argLen: 1, + asm: x86.AKMOVW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 
R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "KMOVBi", + argLen: 1, + asm: x86.AKMOVB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, { name: "VADDPD128", argLen: 2, @@ -63129,6 +63197,66 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "CvtMask8x16to16", + argLen: 1, + generic: true, + }, + { + name: "CvtMask8x32to32", + argLen: 1, + generic: true, + }, + { + name: "CvtMask8x64to64", + argLen: 1, + generic: true, + }, + { + name: "CvtMask16x8to8", + argLen: 1, + generic: true, + }, + { + name: "CvtMask16x16to16", + argLen: 1, + generic: true, + }, + { + name: "CvtMask16x32to32", + argLen: 1, + generic: true, + }, + { + name: "CvtMask32x4to8", + argLen: 1, + generic: true, + }, + { + name: "CvtMask32x8to8", + argLen: 1, + generic: true, + }, + { + name: "CvtMask32x16to16", + argLen: 1, + generic: true, + }, + { + name: "CvtMask64x2to8", + argLen: 1, + generic: true, + }, + { + name: "CvtMask64x4to8", + argLen: 1, + generic: true, + }, + { + name: "CvtMask64x8to8", + argLen: 1, + generic: true, + }, { name: "AbsoluteInt8x16", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 6b63b702459..eacb30768f8 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1541,6 +1541,30 @@ func rewriteValueAMD64(v *Value) bool { case OpCvtBoolToUint8: v.Op = OpCopy return true + case OpCvtMask16x16to16: + return rewriteValueAMD64_OpCvtMask16x16to16(v) + case OpCvtMask16x32to32: + return rewriteValueAMD64_OpCvtMask16x32to32(v) + case OpCvtMask16x8to8: + return rewriteValueAMD64_OpCvtMask16x8to8(v) + case OpCvtMask32x16to16: + return rewriteValueAMD64_OpCvtMask32x16to16(v) + case OpCvtMask32x4to8: + return rewriteValueAMD64_OpCvtMask32x4to8(v) + case OpCvtMask32x8to8: + return rewriteValueAMD64_OpCvtMask32x8to8(v) + case OpCvtMask64x2to8: + return rewriteValueAMD64_OpCvtMask64x2to8(v) + case OpCvtMask64x4to8: + return rewriteValueAMD64_OpCvtMask64x4to8(v) + case OpCvtMask64x8to8: + return rewriteValueAMD64_OpCvtMask64x8to8(v) + case OpCvtMask8x16to16: + return rewriteValueAMD64_OpCvtMask8x16to16(v) + case OpCvtMask8x32to32: + return rewriteValueAMD64_OpCvtMask8x32to32(v) + case OpCvtMask8x64to64: + return rewriteValueAMD64_OpCvtMask8x64to64(v) case OpDiv128u: v.Op = OpAMD64DIVQU2 return true @@ -33047,12 +33071,13 @@ func rewriteValueAMD64_OpCvt16toMask16x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt16toMask16x16 x) - // result: (VPMOVMToVec16x16 (KMOVW x)) + // result: (VPMOVMToVec16x16 (KMOVWk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec16x16) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVW, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVWk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33062,12 +33087,13 @@ func rewriteValueAMD64_OpCvt16toMask32x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt16toMask32x16 x) - // result: (VPMOVMToVec32x16 (KMOVW x)) + // result: (VPMOVMToVec32x16 (KMOVWk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec32x16) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVW, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVWk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33077,12 +33103,13 @@ func rewriteValueAMD64_OpCvt16toMask8x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // 
match: (Cvt16toMask8x16 x) - // result: (VPMOVMToVec8x16 (KMOVW x)) + // result: (VPMOVMToVec8x16 (KMOVWk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec8x16) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVW, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVWk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33092,12 +33119,13 @@ func rewriteValueAMD64_OpCvt32toMask16x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt32toMask16x32 x) - // result: (VPMOVMToVec16x32 (KMOVD x)) + // result: (VPMOVMToVec16x32 (KMOVDk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec16x32) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVD, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVDk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33107,12 +33135,13 @@ func rewriteValueAMD64_OpCvt32toMask8x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt32toMask8x32 x) - // result: (VPMOVMToVec8x32 (KMOVD x)) + // result: (VPMOVMToVec8x32 (KMOVDk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec8x32) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVD, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVDk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33122,12 +33151,13 @@ func rewriteValueAMD64_OpCvt64toMask8x64(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt64toMask8x64 x) - // result: (VPMOVMToVec8x64 (KMOVQ x)) + // result: (VPMOVMToVec8x64 (KMOVQk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec8x64) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQ, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33137,12 +33167,13 @@ func rewriteValueAMD64_OpCvt8toMask16x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt8toMask16x8 x) - // result: (VPMOVMToVec16x8 (KMOVB x)) + // result: (VPMOVMToVec16x8 (KMOVBk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec16x8) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVBk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33152,12 +33183,13 @@ func rewriteValueAMD64_OpCvt8toMask32x4(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt8toMask32x4 x) - // result: (VPMOVMToVec32x4 (KMOVB x)) + // result: (VPMOVMToVec32x4 (KMOVBk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec32x4) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVBk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33167,12 +33199,13 @@ func rewriteValueAMD64_OpCvt8toMask32x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt8toMask32x8 x) - // result: (VPMOVMToVec32x8 (KMOVB x)) + // result: (VPMOVMToVec32x8 (KMOVBk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec32x8) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVBk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33182,12 +33215,13 @@ func rewriteValueAMD64_OpCvt8toMask64x2(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt8toMask64x2 x) - // result: (VPMOVMToVec64x2 (KMOVB x)) + // result: (VPMOVMToVec64x2 (KMOVBk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec64x2) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVBk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33197,12 +33231,13 @@ func 
rewriteValueAMD64_OpCvt8toMask64x4(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt8toMask64x4 x) - // result: (VPMOVMToVec64x4 (KMOVB x)) + // result: (VPMOVMToVec64x4 (KMOVBk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec64x4) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVBk, t) v0.AddArg(x) v.AddArg(v0) return true @@ -33212,12 +33247,205 @@ func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (Cvt8toMask64x8 x) - // result: (VPMOVMToVec64x8 (KMOVB x)) + // result: (VPMOVMToVec64x8 (KMOVBk x)) for { + t := v.Type x := v_0 v.reset(OpAMD64VPMOVMToVec64x8) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVB, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVBk, t) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask16x16to16 x) + // result: (KMOVWi (VPMOVVec16x16ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVWi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask16x32to32 x) + // result: (KMOVDi (VPMOVVec16x32ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVDi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask16x8to8 x) + // result: (KMOVBi (VPMOVVec16x8ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVBi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask32x16to16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask32x16to16 x) + // result: (KMOVWi (VPMOVVec32x16ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVWi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask32x4to8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask32x4to8 x) + // result: (KMOVBi (VPMOVVec32x4ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVBi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask32x8to8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask32x8to8 x) + // result: (KMOVBi (VPMOVVec32x8ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVBi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask64x2to8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask64x2to8 x) + // result: (KMOVBi (VPMOVVec64x2ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVBi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask64x4to8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask64x4to8 x) + // result: (KMOVBi (VPMOVVec64x4ToM x)) + for 
{ + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVBi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask64x8to8(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask64x8to8 x) + // result: (KMOVBi (VPMOVVec64x8ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVBi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask8x16to16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask8x16to16 x) + // result: (KMOVWi (VPMOVVec8x16ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVWi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask8x32to32(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask8x32to32 x) + // result: (KMOVDi (VPMOVVec8x32ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVDi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpCvtMask8x64to64(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (CvtMask8x64to64 x) + // result: (KMOVQi (VPMOVVec8x64ToM x)) + for { + t := v.Type + x := v_0 + v.reset(OpAMD64KMOVQi) + v.Type = t + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) return true @@ -41827,13 +42055,14 @@ func rewriteValueAMD64_OpLoadMask16x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask16x16 ptr mem) - // result: (VPMOVMToVec16x16 (KMOVQload ptr mem)) + // result: (VPMOVMToVec16x16 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec16x16) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41844,13 +42073,14 @@ func rewriteValueAMD64_OpLoadMask16x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask16x32 ptr mem) - // result: (VPMOVMToVec16x32 (KMOVQload ptr mem)) + // result: (VPMOVMToVec16x32 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec16x32) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41861,13 +42091,14 @@ func rewriteValueAMD64_OpLoadMask16x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask16x8 ptr mem) - // result: (VPMOVMToVec16x8 (KMOVQload ptr mem)) + // result: (VPMOVMToVec16x8 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec16x8) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41878,13 +42109,14 @@ func rewriteValueAMD64_OpLoadMask32x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask32x16 ptr mem) - // result: (VPMOVMToVec32x16 (KMOVQload ptr mem)) + // result: (VPMOVMToVec32x16 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec32x16) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, 
OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41895,13 +42127,14 @@ func rewriteValueAMD64_OpLoadMask32x4(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask32x4 ptr mem) - // result: (VPMOVMToVec32x4 (KMOVQload ptr mem)) + // result: (VPMOVMToVec32x4 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec32x4) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41912,13 +42145,14 @@ func rewriteValueAMD64_OpLoadMask32x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask32x8 ptr mem) - // result: (VPMOVMToVec32x8 (KMOVQload ptr mem)) + // result: (VPMOVMToVec32x8 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec32x8) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41929,13 +42163,14 @@ func rewriteValueAMD64_OpLoadMask64x2(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask64x2 ptr mem) - // result: (VPMOVMToVec64x2 (KMOVQload ptr mem)) + // result: (VPMOVMToVec64x2 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec64x2) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41946,13 +42181,14 @@ func rewriteValueAMD64_OpLoadMask64x4(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask64x4 ptr mem) - // result: (VPMOVMToVec64x4 (KMOVQload ptr mem)) + // result: (VPMOVMToVec64x4 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec64x4) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41963,13 +42199,14 @@ func rewriteValueAMD64_OpLoadMask64x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask64x8 ptr mem) - // result: (VPMOVMToVec64x8 (KMOVQload ptr mem)) + // result: (VPMOVMToVec64x8 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec64x8) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41980,13 +42217,14 @@ func rewriteValueAMD64_OpLoadMask8x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask8x16 ptr mem) - // result: (VPMOVMToVec8x16 (KMOVQload ptr mem)) + // result: (VPMOVMToVec8x16 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec8x16) v.Type = types.TypeVec128 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true @@ -41997,13 +42235,14 @@ func rewriteValueAMD64_OpLoadMask8x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask8x32 ptr mem) - // result: (VPMOVMToVec8x32 (KMOVQload ptr mem)) + // result: (VPMOVMToVec8x32 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec8x32) v.Type = types.TypeVec256 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) 
v.AddArg(v0) return true @@ -42014,13 +42253,14 @@ func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (LoadMask8x64 ptr mem) - // result: (VPMOVMToVec8x64 (KMOVQload ptr mem)) + // result: (VPMOVMToVec8x64 (KMOVQload ptr mem)) for { + t := v.Type ptr := v_0 mem := v_1 v.reset(OpAMD64VPMOVMToVec8x64) v.Type = types.TypeVec512 - v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, types.TypeMask) + v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t) v0.AddArg2(ptr, mem) v.AddArg(v0) return true diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index eae754da4e8..45ccb9c9998 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1782,13 +1782,20 @@ var loadMaskOpcodes = map[int]map[int]ssa.Op{ 64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8}, } -var cvtMaskOpcodes = map[int]map[int]ssa.Op{ +var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{ 8: {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64}, 16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32}, 32: {4: ssa.OpCvt8toMask32x4, 8: ssa.OpCvt8toMask32x8, 16: ssa.OpCvt16toMask32x16}, 64: {2: ssa.OpCvt8toMask64x2, 4: ssa.OpCvt8toMask64x4, 8: ssa.OpCvt8toMask64x8}, } +var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{ + 8: {16: ssa.OpCvtMask8x16to16, 32: ssa.OpCvtMask8x32to32, 64: ssa.OpCvtMask8x64to64}, + 16: {8: ssa.OpCvtMask16x8to8, 16: ssa.OpCvtMask16x16to16, 32: ssa.OpCvtMask16x32to32}, + 32: {4: ssa.OpCvtMask32x4to8, 8: ssa.OpCvtMask32x8to8, 16: ssa.OpCvtMask32x16to16}, + 64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8}, +} + func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { op := loadMaskOpcodes[elemBits][lanes] @@ -1816,9 +1823,9 @@ func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*s } } -func simdCvtMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { +func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - op := cvtMaskOpcodes[elemBits][lanes] + op := cvtVToMaskOpcodes[elemBits][lanes] if op == 0 { panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes)) } @@ -1826,6 +1833,16 @@ func simdCvtMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa } } +func simdCvtMaskToV(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + op := cvtMaskToVOpcodes[elemBits][lanes] + if op == 0 { + panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes)) + } + return s.newValue1(op, n.Type(), args[0]) + } +} + func simdMaskedLoad(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue3(op, n.Type(), args[0], args[1], s.mem()) diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 0f65b4500a1..c7f97e03a0d 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -2314,82 +2314,94 @@ func simdIntrinsics(addF func(pkg, fn string, b 
intrinsicBuilder, archFamilies . addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64) addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64) - addF(simdPackage, "Mask8x16FromBits", simdCvtMask(8, 16), sys.AMD64) + addF(simdPackage, "Mask8x16FromBits", simdCvtVToMask(8, 16), sys.AMD64) + addF(simdPackage, "Mask8x16.ToBits", simdCvtMaskToV(8, 16), sys.AMD64) addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64) addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64) - addF(simdPackage, "Mask8x32FromBits", simdCvtMask(8, 32), sys.AMD64) + addF(simdPackage, "Mask8x32FromBits", simdCvtVToMask(8, 32), sys.AMD64) + addF(simdPackage, "Mask8x32.ToBits", simdCvtMaskToV(8, 32), sys.AMD64) addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64) addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64) - addF(simdPackage, "Mask8x64FromBits", simdCvtMask(8, 64), sys.AMD64) + addF(simdPackage, "Mask8x64FromBits", simdCvtVToMask(8, 64), sys.AMD64) + addF(simdPackage, "Mask8x64.ToBits", simdCvtMaskToV(8, 64), sys.AMD64) addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64) addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64) - addF(simdPackage, "Mask16x8FromBits", simdCvtMask(16, 8), sys.AMD64) + addF(simdPackage, "Mask16x8FromBits", simdCvtVToMask(16, 8), sys.AMD64) + addF(simdPackage, "Mask16x8.ToBits", simdCvtMaskToV(16, 8), sys.AMD64) addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64) addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64) - addF(simdPackage, "Mask16x16FromBits", 
simdCvtMask(16, 16), sys.AMD64) + addF(simdPackage, "Mask16x16FromBits", simdCvtVToMask(16, 16), sys.AMD64) + addF(simdPackage, "Mask16x16.ToBits", simdCvtMaskToV(16, 16), sys.AMD64) addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64) addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64) - addF(simdPackage, "Mask16x32FromBits", simdCvtMask(16, 32), sys.AMD64) + addF(simdPackage, "Mask16x32FromBits", simdCvtVToMask(16, 32), sys.AMD64) + addF(simdPackage, "Mask16x32.ToBits", simdCvtMaskToV(16, 32), sys.AMD64) addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64) addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64) - addF(simdPackage, "Mask32x4FromBits", simdCvtMask(32, 4), sys.AMD64) + addF(simdPackage, "Mask32x4FromBits", simdCvtVToMask(32, 4), sys.AMD64) + addF(simdPackage, "Mask32x4.ToBits", simdCvtMaskToV(32, 4), sys.AMD64) addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64) addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64) - addF(simdPackage, "Mask32x8FromBits", simdCvtMask(32, 8), sys.AMD64) + addF(simdPackage, "Mask32x8FromBits", simdCvtVToMask(32, 8), sys.AMD64) + addF(simdPackage, "Mask32x8.ToBits", simdCvtMaskToV(32, 8), sys.AMD64) addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64) addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64) - addF(simdPackage, "Mask32x16FromBits", simdCvtMask(32, 16), sys.AMD64) + addF(simdPackage, "Mask32x16FromBits", simdCvtVToMask(32, 16), sys.AMD64) + addF(simdPackage, "Mask32x16.ToBits", simdCvtMaskToV(32, 16), sys.AMD64) addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { 
return args[0] }, sys.AMD64) addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64) addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64) - addF(simdPackage, "Mask64x2FromBits", simdCvtMask(64, 2), sys.AMD64) + addF(simdPackage, "Mask64x2FromBits", simdCvtVToMask(64, 2), sys.AMD64) + addF(simdPackage, "Mask64x2.ToBits", simdCvtMaskToV(64, 2), sys.AMD64) addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64) addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64) - addF(simdPackage, "Mask64x4FromBits", simdCvtMask(64, 4), sys.AMD64) + addF(simdPackage, "Mask64x4FromBits", simdCvtVToMask(64, 4), sys.AMD64) + addF(simdPackage, "Mask64x4.ToBits", simdCvtMaskToV(64, 4), sys.AMD64) addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64) addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64) - addF(simdPackage, "Mask64x8FromBits", simdCvtMask(64, 8), sys.AMD64) + addF(simdPackage, "Mask64x8FromBits", simdCvtVToMask(64, 8), sys.AMD64) + addF(simdPackage, "Mask64x8.ToBits", simdCvtMaskToV(64, 8), sys.AMD64) } diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go index 9e9b45b5b8e..7776a8afdaa 100644 --- a/src/simd/simd_test.go +++ b/src/simd/simd_test.go @@ -391,3 +391,13 @@ func TestBitMaskFromBits(t *testing.T) { } } } + +func TestBitMaskToBits(t *testing.T) { + if !simd.HasAVX512() { + t.Skip("Test requires HasAVX512, not available on this hardware") + return + } + if v := simd.LoadInt16x8Slice([]int16{-1, 0, -1, 0, 0, 0, 0, 0}).AsMask16x8().ToBits(); v != 0b101 { + t.Errorf("Want 0b101, got %b", v) + } +} diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go index ac8cf3c210a..f70a6a214b5 100644 --- a/src/simd/types_amd64.go +++ b/src/simd/types_amd64.go @@ -320,9 +320,15 @@ func (x Mask8x16) StoreToBits(y *uint64) // Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 16 bits of y are used. // -// Asm: KMOVB, CPU Feature: AVX512" +// Asm: KMOVB, CPU Feature: AVX512 func Mask8x16FromBits(y uint16) Mask8x16 +// ToBits constructs a bitmap from a Mask8x16, where 1 means set for the indexed element, 0 means unset. +// Only the lower 16 bits of y are used. 
+// +// Asm: KMOVB, CPU Features: AVX512 +func (x Mask8x16) ToBits() uint16 + // Mask16x8 is a 128-bit SIMD vector of 8 int16 type Mask16x8 struct { int16x8 v128 @@ -348,9 +354,15 @@ func (x Mask16x8) StoreToBits(y *uint64) // Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 8 bits of y are used. // -// Asm: KMOVW, CPU Feature: AVX512" +// Asm: KMOVW, CPU Feature: AVX512 func Mask16x8FromBits(y uint8) Mask16x8 +// ToBits constructs a bitmap from a Mask16x8, where 1 means set for the indexed element, 0 means unset. +// Only the lower 8 bits of y are used. +// +// Asm: KMOVW, CPU Features: AVX512 +func (x Mask16x8) ToBits() uint8 + // Mask32x4 is a 128-bit SIMD vector of 4 int32 type Mask32x4 struct { int32x4 v128 @@ -376,9 +388,15 @@ func (x Mask32x4) StoreToBits(y *uint64) // Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 4 bits of y are used. // -// Asm: KMOVD, CPU Feature: AVX512" +// Asm: KMOVD, CPU Feature: AVX512 func Mask32x4FromBits(y uint8) Mask32x4 +// ToBits constructs a bitmap from a Mask32x4, where 1 means set for the indexed element, 0 means unset. +// Only the lower 4 bits of y are used. +// +// Asm: KMOVD, CPU Features: AVX512 +func (x Mask32x4) ToBits() uint8 + // Mask64x2 is a 128-bit SIMD vector of 2 int64 type Mask64x2 struct { int64x2 v128 @@ -404,9 +422,15 @@ func (x Mask64x2) StoreToBits(y *uint64) // Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 2 bits of y are used. // -// Asm: KMOVQ, CPU Feature: AVX512" +// Asm: KMOVQ, CPU Feature: AVX512 func Mask64x2FromBits(y uint8) Mask64x2 +// ToBits constructs a bitmap from a Mask64x2, where 1 means set for the indexed element, 0 means unset. +// Only the lower 2 bits of y are used. +// +// Asm: KMOVQ, CPU Features: AVX512 +func (x Mask64x2) ToBits() uint8 + // v256 is a tag type that tells the compiler that this is really 256-bit SIMD type v256 struct { _256 struct{} @@ -723,9 +747,15 @@ func (x Mask8x32) StoreToBits(y *uint64) // Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 32 bits of y are used. // -// Asm: KMOVB, CPU Feature: AVX512" +// Asm: KMOVB, CPU Feature: AVX512 func Mask8x32FromBits(y uint32) Mask8x32 +// ToBits constructs a bitmap from a Mask8x32, where 1 means set for the indexed element, 0 means unset. +// Only the lower 32 bits of y are used. +// +// Asm: KMOVB, CPU Features: AVX512 +func (x Mask8x32) ToBits() uint32 + // Mask16x16 is a 256-bit SIMD vector of 16 int16 type Mask16x16 struct { int16x16 v256 @@ -751,9 +781,15 @@ func (x Mask16x16) StoreToBits(y *uint64) // Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 16 bits of y are used. // -// Asm: KMOVW, CPU Feature: AVX512" +// Asm: KMOVW, CPU Feature: AVX512 func Mask16x16FromBits(y uint16) Mask16x16 +// ToBits constructs a bitmap from a Mask16x16, where 1 means set for the indexed element, 0 means unset. +// Only the lower 16 bits of y are used. 
+// +// Asm: KMOVW, CPU Features: AVX512 +func (x Mask16x16) ToBits() uint16 + // Mask32x8 is a 256-bit SIMD vector of 8 int32 type Mask32x8 struct { int32x8 v256 @@ -779,9 +815,15 @@ func (x Mask32x8) StoreToBits(y *uint64) // Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 8 bits of y are used. // -// Asm: KMOVD, CPU Feature: AVX512" +// Asm: KMOVD, CPU Feature: AVX512 func Mask32x8FromBits(y uint8) Mask32x8 +// ToBits constructs a bitmap from a Mask32x8, where 1 means set for the indexed element, 0 means unset. +// Only the lower 8 bits of y are used. +// +// Asm: KMOVD, CPU Features: AVX512 +func (x Mask32x8) ToBits() uint8 + // Mask64x4 is a 256-bit SIMD vector of 4 int64 type Mask64x4 struct { int64x4 v256 @@ -807,9 +849,15 @@ func (x Mask64x4) StoreToBits(y *uint64) // Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 4 bits of y are used. // -// Asm: KMOVQ, CPU Feature: AVX512" +// Asm: KMOVQ, CPU Feature: AVX512 func Mask64x4FromBits(y uint8) Mask64x4 +// ToBits constructs a bitmap from a Mask64x4, where 1 means set for the indexed element, 0 means unset. +// Only the lower 4 bits of y are used. +// +// Asm: KMOVQ, CPU Features: AVX512 +func (x Mask64x4) ToBits() uint8 + // v512 is a tag type that tells the compiler that this is really 512-bit SIMD type v512 struct { _512 struct{} @@ -1190,9 +1238,15 @@ func (x Mask8x64) StoreToBits(y *uint64) // Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 64 bits of y are used. // -// Asm: KMOVB, CPU Feature: AVX512" +// Asm: KMOVB, CPU Feature: AVX512 func Mask8x64FromBits(y uint64) Mask8x64 +// ToBits constructs a bitmap from a Mask8x64, where 1 means set for the indexed element, 0 means unset. +// Only the lower 64 bits of y are used. +// +// Asm: KMOVB, CPU Features: AVX512 +func (x Mask8x64) ToBits() uint64 + // Mask16x32 is a 512-bit SIMD vector of 32 int16 type Mask16x32 struct { int16x32 v512 @@ -1218,9 +1272,15 @@ func (x Mask16x32) StoreToBits(y *uint64) // Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 32 bits of y are used. // -// Asm: KMOVW, CPU Feature: AVX512" +// Asm: KMOVW, CPU Feature: AVX512 func Mask16x32FromBits(y uint32) Mask16x32 +// ToBits constructs a bitmap from a Mask16x32, where 1 means set for the indexed element, 0 means unset. +// Only the lower 32 bits of y are used. +// +// Asm: KMOVW, CPU Features: AVX512 +func (x Mask16x32) ToBits() uint32 + // Mask32x16 is a 512-bit SIMD vector of 16 int32 type Mask32x16 struct { int32x16 v512 @@ -1246,9 +1306,15 @@ func (x Mask32x16) StoreToBits(y *uint64) // Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 16 bits of y are used. // -// Asm: KMOVD, CPU Feature: AVX512" +// Asm: KMOVD, CPU Feature: AVX512 func Mask32x16FromBits(y uint16) Mask32x16 +// ToBits constructs a bitmap from a Mask32x16, where 1 means set for the indexed element, 0 means unset. +// Only the lower 16 bits of y are used. 
+// +// Asm: KMOVD, CPU Features: AVX512 +func (x Mask32x16) ToBits() uint16 + // Mask64x8 is a 512-bit SIMD vector of 8 int64 type Mask64x8 struct { int64x8 v512 @@ -1274,5 +1340,11 @@ func (x Mask64x8) StoreToBits(y *uint64) // Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset. // Only the lower 8 bits of y are used. // -// Asm: KMOVQ, CPU Feature: AVX512" +// Asm: KMOVQ, CPU Feature: AVX512 func Mask64x8FromBits(y uint8) Mask64x8 + +// ToBits constructs a bitmap from a Mask64x8, where 1 means set for the indexed element, 0 means unset. +// Only the lower 8 bits of y are used. +// +// Asm: KMOVQ, CPU Features: AVX512 +func (x Mask64x8) ToBits() uint8
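
---

Usage note (not part of the patch): a minimal sketch of how the new Mask*.ToBits methods added by this CL pair with the existing Mask*FromBits constructors. It mirrors TestBitMaskToBits above and only uses names that appear in this diff (HasAVX512, LoadInt16x8Slice, AsMask16x8, ToBits, Mask16x8FromBits); the `package main` framing, the `import "simd"` path, and the fmt calls are assumptions about the experimental GOEXPERIMENT=simd environment, and the program needs AVX-512 hardware to do anything useful.

```go
package main

import (
	"fmt"
	"simd" // assumed import path for the experimental package, as in src/simd/simd_test.go
)

func main() {
	if !simd.HasAVX512() {
		fmt.Println("AVX-512 not available; mask/bitmap conversions are unsupported here")
		return
	}

	// Lanes holding -1 (all bits set) become 1s in the bitmap; zero lanes become 0s,
	// so this mask should convert to 0b101, as in TestBitMaskToBits.
	m := simd.LoadInt16x8Slice([]int16{-1, 0, -1, 0, 0, 0, 0, 0}).AsMask16x8()

	// New in this CL: mask -> integer bitmap (compiles to KMOVB from a K register to a GP register).
	bits := m.ToBits()
	fmt.Printf("bits = %b\n", bits)

	// Round-trip through the pre-existing integer -> mask constructor.
	m2 := simd.Mask16x8FromBits(bits)
	fmt.Printf("round-trip preserved bits: %v\n", m2.ToBits() == bits)
}
```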