diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 9c31b77e703..0fafd69f54b 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -1461,13 +1461,13 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.AddRestSourceReg(simdReg(v.Args[1]))
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = simdReg(v)
-	case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512:
+	case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512, ssa.OpAMD64KMOVQload:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = v.Args[0].Reg()
 		ssagen.AddAux(&p.From, v)
 		p.To.Type = obj.TYPE_REG
-		p.To.Reg = simdReg(v)
+		p.To.Reg = simdOrMaskReg(v)
 	case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 2972eae87d5..bb7513795d9 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1682,6 +1682,22 @@
 (Select0 a:(ADD(Q|L)constflags [c] x)) && a.Uses == 1 => (ADD(Q|L)const [c] x)
 
 // XXX SIMD
+(LoadMask8x16 <t> ptr mem) => (VPMOVMToVec8x16 <types.TypeVec128> (KMOVQload <t> ptr mem))
+(LoadMask8x32 <t> ptr mem) => (VPMOVMToVec8x32 <types.TypeVec256> (KMOVQload <t> ptr mem))
+(LoadMask8x64 <t> ptr mem) => (VPMOVMToVec8x64 <types.TypeVec512> (KMOVQload <t> ptr mem))
+
+(LoadMask16x8 <t> ptr mem) => (VPMOVMToVec16x8 <types.TypeVec128> (KMOVQload <t> ptr mem))
+(LoadMask16x16 <t> ptr mem) => (VPMOVMToVec16x16 <types.TypeVec256> (KMOVQload <t> ptr mem))
+(LoadMask16x32 <t> ptr mem) => (VPMOVMToVec16x32 <types.TypeVec512> (KMOVQload <t> ptr mem))
+
+(LoadMask32x4 <t> ptr mem) => (VPMOVMToVec32x4 <types.TypeVec128> (KMOVQload <t> ptr mem))
+(LoadMask32x8 <t> ptr mem) => (VPMOVMToVec32x8 <types.TypeVec256> (KMOVQload <t> ptr mem))
+(LoadMask32x16 <t> ptr mem) => (VPMOVMToVec32x16 <types.TypeVec512> (KMOVQload <t> ptr mem))
+
+(LoadMask64x2 <t> ptr mem) => (VPMOVMToVec64x2 <types.TypeVec128> (KMOVQload <t> ptr mem))
+(LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
+(LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
+
 (Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
 (Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
 
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index 543233f4d83..ec335f67f87 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -234,6 +234,8 @@ func init() {
 		wfpw    = regInfo{inputs: []regMask{w, fp}, outputs: wonly}
 		wfpkw   = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly}
 
+		kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly}
+
 		prefreg = regInfo{inputs: []regMask{gpspsbg}}
 	)
 
@@ -1314,6 +1316,8 @@ func init() {
 
 		{name: "VZEROUPPER", argLength: 0, asm: "VZEROUPPER"},
 		{name: "VZEROALL", argLength: 0, asm: "VZEROALL"},
+
+		{name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
 	}
 
 	var AMD64blocks = []blockData{
diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go
index 2d44cc85f82..6257396a6f5 100644
--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go
@@ -666,6 +666,18 @@ var genericOps = []opData{
 	// XXX SIMD
 	{name: "Add32x4", argLength: 2}, // arg0 + arg1
 	{name: "ZeroSIMD", argLength: 0},
+	{name: "LoadMask8x16", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask8x32", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask8x64", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask16x8", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask16x16", argLength: 2}, // arg0 = ptr, arg1 = mem
+	{name: "LoadMask16x32", argLength: 2}, // arg0 = ptr, arg1 = mem
+	{name: "LoadMask32x4", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask32x8", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask32x16", argLength: 2}, // arg0 = ptr, arg1 = mem
+	{name: "LoadMask64x2", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask64x4", argLength: 2},  // arg0 = ptr, arg1 = mem
+	{name: "LoadMask64x8", argLength: 2},  // arg0 = ptr, arg1 = mem
 }
 
 // kind controls successors implicit exit
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 29058f0b193..d69e714082b 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1198,6 +1198,7 @@ const (
 	OpAMD64Zero512
 	OpAMD64VZEROUPPER
 	OpAMD64VZEROALL
+	OpAMD64KMOVQload
 	OpAMD64VADDPS512
 	OpAMD64VADDPSMasked512
 	OpAMD64VRCP14PS512
@@ -4403,6 +4404,18 @@ const (
 	OpPrefetchCacheStreamed
 	OpAdd32x4
 	OpZeroSIMD
+	OpLoadMask8x16
+	OpLoadMask8x32
+	OpLoadMask8x64
+	OpLoadMask16x8
+	OpLoadMask16x16
+	OpLoadMask16x32
+	OpLoadMask32x4
+	OpLoadMask32x8
+	OpLoadMask32x16
+	OpLoadMask64x2
+	OpLoadMask64x4
+	OpLoadMask64x8
 	OpAddFloat32x16
 	OpAddMaskedFloat32x16
 	OpApproximateReciprocalFloat32x16
@@ -18801,6 +18814,22 @@ var opcodeTable = [...]opInfo{
 		asm: x86.AVZEROALL,
 		reg: regInfo{},
 	},
+	{
+		name:           "KMOVQload",
+		auxType:        auxSymOff,
+		argLen:         2,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AKMOVQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+			},
+		},
+	},
 	{
 		name:   "VADDPS512",
 		argLen: 2,
@@ -60727,6 +60756,66 @@ var opcodeTable = [...]opInfo{
 		argLen: 0,
 		generic: true,
 	},
+	{
+		name:    "LoadMask8x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask8x32",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask8x64",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask16x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask16x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask16x32",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask32x4",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask32x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask32x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask64x2",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask64x4",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "LoadMask64x8",
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:   "AddFloat32x16",
 		argLen: 2,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 5c7cafd6f23..0ff19a680e4 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2438,6 +2438,30 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpLessUint8x64(v)
 	case OpLoad:
 		return rewriteValueAMD64_OpLoad(v)
+	case OpLoadMask16x16:
+		return rewriteValueAMD64_OpLoadMask16x16(v)
+	case OpLoadMask16x32:
+		return rewriteValueAMD64_OpLoadMask16x32(v)
+	case OpLoadMask16x8:
+		return rewriteValueAMD64_OpLoadMask16x8(v)
+	case OpLoadMask32x16:
+		return rewriteValueAMD64_OpLoadMask32x16(v)
+	case OpLoadMask32x4:
+		return rewriteValueAMD64_OpLoadMask32x4(v)
+	case OpLoadMask32x8:
+		return rewriteValueAMD64_OpLoadMask32x8(v)
+	case OpLoadMask64x2:
+		return rewriteValueAMD64_OpLoadMask64x2(v)
+	case OpLoadMask64x4:
+		return rewriteValueAMD64_OpLoadMask64x4(v)
+	case OpLoadMask64x8:
+		return rewriteValueAMD64_OpLoadMask64x8(v)
+	case OpLoadMask8x16:
+		return rewriteValueAMD64_OpLoadMask8x16(v)
+	case OpLoadMask8x32:
+		return rewriteValueAMD64_OpLoadMask8x32(v)
+	case OpLoadMask8x64:
+		return rewriteValueAMD64_OpLoadMask8x64(v)
 	case OpLocalAddr:
 		return rewriteValueAMD64_OpLocalAddr(v)
 	case OpLsh16x16:
@@ -40303,6 +40327,222 @@ func rewriteValueAMD64_OpLoad(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpLoadMask16x16(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask16x16 <t> ptr mem)
+	// result: (VPMOVMToVec16x16 <types.TypeVec256> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec16x16)
+		v.Type = types.TypeVec256
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask16x32(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask16x32 <t> ptr mem)
+	// result: (VPMOVMToVec16x32 <types.TypeVec512> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec16x32)
+		v.Type = types.TypeVec512
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask16x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask16x8 <t> ptr mem)
+	// result: (VPMOVMToVec16x8 <types.TypeVec128> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec16x8)
+		v.Type = types.TypeVec128
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask32x16(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask32x16 <t> ptr mem)
+	// result: (VPMOVMToVec32x16 <types.TypeVec512> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec32x16)
+		v.Type = types.TypeVec512
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask32x4(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask32x4 <t> ptr mem)
+	// result: (VPMOVMToVec32x4 <types.TypeVec128> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec32x4)
+		v.Type = types.TypeVec128
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask32x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask32x8 <t> ptr mem)
+	// result: (VPMOVMToVec32x8 <types.TypeVec256> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec32x8)
+		v.Type = types.TypeVec256
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask64x2(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask64x2 <t> ptr mem)
+	// result: (VPMOVMToVec64x2 <types.TypeVec128> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec64x2)
+		v.Type = types.TypeVec128
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask64x4(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask64x4 <t> ptr mem)
+	// result: (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec64x4)
+		v.Type = types.TypeVec256
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask64x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask64x8 <t> ptr mem)
+	// result: (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec64x8)
+		v.Type = types.TypeVec512
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask8x16(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask8x16 <t> ptr mem)
+	// result: (VPMOVMToVec8x16 <types.TypeVec128> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec8x16)
+		v.Type = types.TypeVec128
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask8x32(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask8x32 <t> ptr mem)
+	// result: (VPMOVMToVec8x32 <types.TypeVec256> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec8x32)
+		v.Type = types.TypeVec256
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (LoadMask8x64 <t> ptr mem)
+	// result: (VPMOVMToVec8x64 <types.TypeVec512> (KMOVQload <t> ptr mem))
+	for {
+		t := v.Type
+		ptr := v_0
+		mem := v_1
+		v.reset(OpAMD64VPMOVMToVec8x64)
+		v.Type = types.TypeVec512
+		v0 := b.NewValue0(v.Pos, OpAMD64KMOVQload, t)
+		v0.AddArg2(ptr, mem)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueAMD64_OpLocalAddr(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go
index 5415143ec31..e012b536b55 100644
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -1775,6 +1775,22 @@ func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	}
 }
 
+func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		opCodes := map[int]map[int]ssa.Op{
+			8:  {16: ssa.OpLoadMask8x16, 32: ssa.OpLoadMask8x32, 64: ssa.OpLoadMask8x64},
+			16: {8: ssa.OpLoadMask16x8, 16: ssa.OpLoadMask16x16, 32: ssa.OpLoadMask16x32},
+			32: {4: ssa.OpLoadMask32x4, 8: ssa.OpLoadMask32x8, 16: ssa.OpLoadMask32x16},
+			64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8},
+		}
+		op := opCodes[elemBits][lanes]
+		if op == 0 {
+			panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
+		}
+		return s.newValue2(op, types.TypeMask, args[0], s.mem())
+	}
+}
+
 // findIntrinsic returns a function which builds the SSA equivalent of the
 // function identified by the symbol sym. If sym is not an intrinsic call, returns nil.
 func findIntrinsic(sym *types.Sym) intrinsicBuilder {
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 3d929499085..8040a187bda 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -2132,76 +2132,64 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint64x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint64x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask8x16", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask8x16.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask8x32", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask8x32.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask8x64", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask8x64.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask16x8", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask16x8.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask16x16", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask16x16.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask16x32", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask16x32.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask32x4", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask32x4.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask32x8", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask32x8.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask32x16", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask32x16.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask64x2", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask64x2.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask64x4", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask64x4.Store", simdStore(), sys.AMD64)
-	addF(simdPackage, "LoadMask64x8", simdLoad(), sys.AMD64)
-	addF(simdPackage, "Mask64x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "Mask8x16.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int8x16.AsMask8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64)
 	addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64)
 	addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64)
 	addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64)
 	addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64)
 	addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64)
 	addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64)
 	addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64)
 	addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64)
 	addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64)
 	addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64)
 	addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64)
 }
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go
index 14e5fe31794..276ae9ed5d6 100644
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -460,3 +460,20 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si
 		}
 	}
 }
+
+func TestBitMask(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+	var bits uint64 = 0b10
+	results := [2]int64{}
+	want := [2]int64{0, 6}
+	m := simd.LoadMask64x2FromBits(&bits)
+	simd.LoadInt64x2Slice([]int64{1, 2}).AddMasked(simd.LoadInt64x2Slice([]int64{3, 4}), m).Store(&results)
+	for i := range 2 {
+		if results[i] != want[i] {
+			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
+		}
+	}
+}
diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go
index 6cc79275767..ccc8427bb3e 100644
--- a/src/simd/types_amd64.go
+++ b/src/simd/types_amd64.go
@@ -205,24 +205,48 @@ type Mask8x16 struct {
 	vals [16]int8
 }
 
+// LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 16 bits of y are used.
+//
+//go:noescape
+func LoadMask8x16FromBits(y *uint64) Mask8x16
+
 // Mask16x8 is a 128-bit SIMD vector of 8 int16
 type Mask16x8 struct {
 	int16x8 v128
 	vals    [8]int16
 }
 
+// LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 8 bits of y are used.
+//
+//go:noescape
+func LoadMask16x8FromBits(y *uint64) Mask16x8
+
 // Mask32x4 is a 128-bit SIMD vector of 4 int32
 type Mask32x4 struct {
 	int32x4 v128
 	vals    [4]int32
 }
 
+// LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 4 bits of y are used.
+//
+//go:noescape
+func LoadMask32x4FromBits(y *uint64) Mask32x4
+
 // Mask64x2 is a 128-bit SIMD vector of 2 int64
 type Mask64x2 struct {
 	int64x2 v128
 	vals    [2]int64
 }
 
+// LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 2 bits of y are used.
+//
+//go:noescape
+func LoadMask64x2FromBits(y *uint64) Mask64x2
+
 // v256 is a tag type that tells the compiler that this is really 256-bit SIMD
 type v256 struct {
 	_256 struct{}
@@ -424,24 +448,48 @@ type Mask8x32 struct {
 	vals [32]int8
 }
 
+// LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 32 bits of y are used.
+//
+//go:noescape
+func LoadMask8x32FromBits(y *uint64) Mask8x32
+
 // Mask16x16 is a 256-bit SIMD vector of 16 int16
 type Mask16x16 struct {
 	int16x16 v256
 	vals     [16]int16
 }
 
+// LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 16 bits of y are used.
+//
+//go:noescape
+func LoadMask16x16FromBits(y *uint64) Mask16x16
+
 // Mask32x8 is a 256-bit SIMD vector of 8 int32
 type Mask32x8 struct {
 	int32x8 v256
 	vals    [8]int32
 }
 
+// LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 8 bits of y are used.
+//
+//go:noescape
+func LoadMask32x8FromBits(y *uint64) Mask32x8
+
 // Mask64x4 is a 256-bit SIMD vector of 4 int64
 type Mask64x4 struct {
 	int64x4 v256
 	vals    [4]int64
 }
 
+// LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 4 bits of y are used.
+//
+//go:noescape
+func LoadMask64x4FromBits(y *uint64) Mask64x4
+
 // v512 is a tag type that tells the compiler that this is really 512-bit SIMD
 type v512 struct {
 	_512 struct{}
@@ -643,20 +691,44 @@ type Mask8x64 struct {
 	vals [64]int8
 }
 
+// LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 64 bits of y are used.
+//
+//go:noescape
+func LoadMask8x64FromBits(y *uint64) Mask8x64
+
 // Mask16x32 is a 512-bit SIMD vector of 32 int16
 type Mask16x32 struct {
 	int16x32 v512
 	vals     [32]int16
 }
 
+// LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 32 bits of y are used.
+//
+//go:noescape
+func LoadMask16x32FromBits(y *uint64) Mask16x32
+
 // Mask32x16 is a 512-bit SIMD vector of 16 int32
 type Mask32x16 struct {
 	int32x16 v512
 	vals     [16]int32
 }
 
+// LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 16 bits of y are used.
+//
+//go:noescape
+func LoadMask32x16FromBits(y *uint64) Mask32x16
+
 // Mask64x8 is a 512-bit SIMD vector of 8 int64
 type Mask64x8 struct {
 	int64x8 v512
 	vals    [8]int64
 }
+
+// LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element and 0 means unset.
+// Only the lower 8 bits of y are used.
+//
+//go:noescape
+func LoadMask64x8FromBits(y *uint64) Mask64x8
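
Usage note (not part of the patch): a minimal sketch of how the new FromBits constructors combine with the existing masked operations, mirroring the shapes exercised in TestBitMask above. It assumes GOEXPERIMENT=simd, an AVX-512 machine, and the "simd" import path used by src/simd/simd_test.go.

package main

import (
	"fmt"
	"simd" // import path as used by the simd package tests
)

func main() {
	if !simd.HasAVX512() {
		fmt.Println("AVX-512 not available on this hardware")
		return
	}
	// Bit i of the word selects lane i: here only lane 1 participates.
	var bits uint64 = 0b10
	m := simd.LoadMask64x2FromBits(&bits)

	a := simd.LoadInt64x2Slice([]int64{1, 2})
	b := simd.LoadInt64x2Slice([]int64{3, 4})

	var out [2]int64
	a.AddMasked(b, m).Store(&out) // lane 0 is masked off, lane 1 = 2+4
	fmt.Println(out)              // expected [0 6], matching TestBitMask
}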