diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 0159d8ec07a..25fa7b695a2 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -1755,14 +1755,16 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = simdReg(v)
-	case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512, ssa.OpAMD64KMOVQload:
+	case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512,
+		ssa.OpAMD64KMOVBload, ssa.OpAMD64KMOVWload, ssa.OpAMD64KMOVDload, ssa.OpAMD64KMOVQload:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = v.Args[0].Reg()
 		ssagen.AddAux(&p.From, v)
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = simdOrMaskReg(v)
-	case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512, ssa.OpAMD64KMOVQstore:
+	case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512,
+		ssa.OpAMD64KMOVBstore, ssa.OpAMD64KMOVWstore, ssa.OpAMD64KMOVDstore, ssa.OpAMD64KMOVQstore:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = simdOrMaskReg(v.Args[1])
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 2b448719603..30c31eb865e 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1676,6 +1676,17 @@
 (CvtMask64x4to8 x) => (KMOVBi (VPMOVVec64x4ToM x))
 (CvtMask64x8to8 x) => (KMOVBi (VPMOVVec64x8ToM x))
 
+// optimizations
+(MOVBstore [off] {sym} ptr (KMOVBi mask) mem) => (KMOVBstore [off] {sym} ptr mask mem)
+(MOVWstore [off] {sym} ptr (KMOVWi mask) mem) => (KMOVWstore [off] {sym} ptr mask mem)
+(MOVLstore [off] {sym} ptr (KMOVDi mask) mem) => (KMOVDstore [off] {sym} ptr mask mem)
+(MOVQstore [off] {sym} ptr (KMOVQi mask) mem) => (KMOVQstore [off] {sym} ptr mask mem)
+
+(KMOVBk l:(MOVBload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVBload [off] {sym} ptr mem)
+(KMOVWk l:(MOVWload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVWload [off] {sym} ptr mem)
+(KMOVDk l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVDload [off] {sym} ptr mem)
+(KMOVQk l:(MOVQload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVQload [off] {sym} ptr mem)
+
 // SIMD vector loads and stores
 (Load ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
 (Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index 027b9832ac2..c92f1b8531e 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -1415,7 +1415,20 @@ func init() {
 		{name: "VZEROUPPER", argLength: 1, reg: regInfo{clobbers: v}, asm: "VZEROUPPER"}, // arg=mem, returns mem
 		{name: "VZEROALL", argLength: 1, reg: regInfo{clobbers: v}, asm: "VZEROALL"},     // arg=mem, returns mem
 
+		// KMOVxload: loads masks
+		// Load (Q=8,D=4,W=2,B=1) bytes from (arg0+auxint+aux), arg1=mem.
+		// "+auxint+aux" == add auxint and the offset of the symbol in aux (if any) to the effective address
+		{name: "KMOVBload", argLength: 2, reg: kload, asm: "KMOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+		{name: "KMOVWload", argLength: 2, reg: kload, asm: "KMOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+		{name: "KMOVDload", argLength: 2, reg: kload, asm: "KMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
 		{name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+		// KMOVxstore: stores masks
+		// Store (Q=8,D=4,W=2,B=1) low bytes of arg1.
+		// Does *(arg0+auxint+aux) = arg1, arg2=mem.
+		{name: "KMOVBstore", argLength: 3, reg: kstore, asm: "KMOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+		{name: "KMOVWstore", argLength: 3, reg: kstore, asm: "KMOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+		{name: "KMOVDstore", argLength: 3, reg: kstore, asm: "KMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
 		{name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
 
 		// Move GP directly to mask register
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 08b6bffd0ef..30831e828a8 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1228,7 +1228,13 @@ const (
 	OpAMD64VMOVSDconst
 	OpAMD64VZEROUPPER
 	OpAMD64VZEROALL
+	OpAMD64KMOVBload
+	OpAMD64KMOVWload
+	OpAMD64KMOVDload
 	OpAMD64KMOVQload
+	OpAMD64KMOVBstore
+	OpAMD64KMOVWstore
+	OpAMD64KMOVDstore
 	OpAMD64KMOVQstore
 	OpAMD64KMOVQk
 	OpAMD64KMOVDk
@@ -19698,6 +19704,54 @@ var opcodeTable = [...]opInfo{
 			clobbers: 2147418112, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
 		},
 	},
+	{
+		name:           "KMOVBload",
+		auxType:        auxSymOff,
+		argLen:         2,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AKMOVB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+			},
+		},
+	},
+	{
+		name:           "KMOVWload",
+		auxType:        auxSymOff,
+		argLen:         2,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AKMOVW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+			},
+		},
+	},
+	{
+		name:           "KMOVDload",
+		auxType:        auxSymOff,
+		argLen:         2,
+		faultOnNilArg0: true,
+		symEffect:      SymRead,
+		asm:            x86.AKMOVD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+			},
+		},
+	},
 	{
 		name:           "KMOVQload",
 		auxType:        auxSymOff,
@@ -19714,6 +19768,48 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:           "KMOVBstore",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            x86.AKMOVB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+		},
+	},
+	{
+		name:           "KMOVWstore",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            x86.AKMOVW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+		},
+	},
+	{
+		name:           "KMOVDstore",
+		auxType:        auxSymOff,
+		argLen:         3,
+		faultOnNilArg0: true,
+		symEffect:      SymWrite,
+		asm:            x86.AKMOVD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+			},
+		},
+	},
 	{
 		name:           "KMOVQstore",
 		auxType:        auxSymOff,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 5220a0a73c2..908fd71b783 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -225,6 +225,14 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpAMD64HMULQ(v)
 	case OpAMD64HMULQU:
 		return rewriteValueAMD64_OpAMD64HMULQU(v)
+	case OpAMD64KMOVBk:
+		return rewriteValueAMD64_OpAMD64KMOVBk(v)
+	case OpAMD64KMOVDk:
+		return rewriteValueAMD64_OpAMD64KMOVDk(v)
+	case OpAMD64KMOVQk:
+		return rewriteValueAMD64_OpAMD64KMOVQk(v)
+	case OpAMD64KMOVWk:
+		return rewriteValueAMD64_OpAMD64KMOVWk(v)
 	case OpAMD64LEAL:
 		return rewriteValueAMD64_OpAMD64LEAL(v)
 	case OpAMD64LEAL1:
@@ -13351,6 +13359,106 @@ func rewriteValueAMD64_OpAMD64HMULQU(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpAMD64KMOVBk(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (KMOVBk l:(MOVBload [off] {sym} ptr mem))
+	// cond: canMergeLoad(v, l) && clobber(l)
+	// result: (KMOVBload [off] {sym} ptr mem)
+	for {
+		l := v_0
+		if l.Op != OpAMD64MOVBload {
+			break
+		}
+		off := auxIntToInt32(l.AuxInt)
+		sym := auxToSym(l.Aux)
+		mem := l.Args[1]
+		ptr := l.Args[0]
+		if !(canMergeLoad(v, l) && clobber(l)) {
+			break
+		}
+		v.reset(OpAMD64KMOVBload)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64KMOVDk(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (KMOVDk l:(MOVLload [off] {sym} ptr mem))
+	// cond: canMergeLoad(v, l) && clobber(l)
+	// result: (KMOVDload [off] {sym} ptr mem)
+	for {
+		l := v_0
+		if l.Op != OpAMD64MOVLload {
+			break
+		}
+		off := auxIntToInt32(l.AuxInt)
+		sym := auxToSym(l.Aux)
+		mem := l.Args[1]
+		ptr := l.Args[0]
+		if !(canMergeLoad(v, l) && clobber(l)) {
+			break
+		}
+		v.reset(OpAMD64KMOVDload)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64KMOVQk(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (KMOVQk l:(MOVQload [off] {sym} ptr mem))
+	// cond: canMergeLoad(v, l) && clobber(l)
+	// result: (KMOVQload [off] {sym} ptr mem)
+	for {
+		l := v_0
+		if l.Op != OpAMD64MOVQload {
+			break
+		}
+		off := auxIntToInt32(l.AuxInt)
+		sym := auxToSym(l.Aux)
+		mem := l.Args[1]
+		ptr := l.Args[0]
+		if !(canMergeLoad(v, l) && clobber(l)) {
+			break
+		}
+		v.reset(OpAMD64KMOVQload)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64KMOVWk(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (KMOVWk l:(MOVWload [off] {sym} ptr mem))
+	// cond: canMergeLoad(v, l) && clobber(l)
+	// result: (KMOVWload [off] {sym} ptr mem)
+	for {
+		l := v_0
+		if l.Op != OpAMD64MOVWload {
+			break
+		}
+		off := auxIntToInt32(l.AuxInt)
+		sym := auxToSym(l.Aux)
+		mem := l.Args[1]
+		ptr := l.Args[0]
+		if !(canMergeLoad(v, l) && clobber(l)) {
+			break
+		}
+		v.reset(OpAMD64KMOVWload)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64LEAL(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (LEAL [c] {s} (ADDLconst [d] x))
@@ -15447,6 +15555,23 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value) bool {
 		v.AddArg3(base, val, mem)
 		return true
 	}
+	// match: (MOVBstore [off] {sym} ptr (KMOVBi mask) mem)
+	// result: (KMOVBstore [off] {sym} ptr mask mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		if v_1.Op != OpAMD64KMOVBi {
+			break
+		}
+		mask := v_1.Args[0]
+		mem := v_2
+		v.reset(OpAMD64KMOVBstore)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg3(ptr, mask, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64MOVBstoreconst(v *Value) bool {
@@ -16477,6 +16602,23 @@ func rewriteValueAMD64_OpAMD64MOVLstore(v *Value) bool {
 		v.AddArg3(p, w, mem)
 		return true
 	}
+	// match: (MOVLstore [off] {sym} ptr (KMOVDi mask) mem)
+	// result: (KMOVDstore [off] {sym} ptr mask mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		if v_1.Op != OpAMD64KMOVDi {
+			break
+		}
+		mask := v_1.Args[0]
+		mem := v_2
+		v.reset(OpAMD64KMOVDstore)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg3(ptr, mask, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
@@ -17460,6 +17602,23 @@ func rewriteValueAMD64_OpAMD64MOVQstore(v *Value) bool {
 		v.AddArg3(p, w, mem)
 		return true
 	}
+	// match: (MOVQstore [off] {sym} ptr (KMOVQi mask) mem)
+	// result: (KMOVQstore [off] {sym} ptr mask mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		if v_1.Op != OpAMD64KMOVQi {
+			break
+		}
+		mask := v_1.Args[0]
+		mem := v_2
+		v.reset(OpAMD64KMOVQstore)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg3(ptr, mask, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64MOVQstoreconst(v *Value) bool {
@@ -18386,6 +18545,23 @@ func rewriteValueAMD64_OpAMD64MOVWstore(v *Value) bool {
 		v.AddArg3(p, w, mem)
 		return true
 	}
+	// match: (MOVWstore [off] {sym} ptr (KMOVWi mask) mem)
+	// result: (KMOVWstore [off] {sym} ptr mask mem)
+	for {
+		off := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		ptr := v_0
+		if v_1.Op != OpAMD64KMOVWi {
+			break
+		}
+		mask := v_1.Args[0]
+		mem := v_2
+		v.reset(OpAMD64KMOVWstore)
+		v.AuxInt = int32ToAuxInt(off)
+		v.Aux = symToAux(sym)
+		v.AddArg3(ptr, mask, mem)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64MOVWstoreconst(v *Value) bool {
diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go
index 2c866ad68b3..422378eebe4 100644
--- a/src/simd/internal/simd_test/simd_test.go
+++ b/src/simd/internal/simd_test/simd_test.go
@@ -348,6 +348,24 @@ func TestBitMaskFromBits(t *testing.T) {
 	}
 }
 
+var maskForTestBitMaskFromBitsLoad = uint8(0b10)
+
+func TestBitMaskFromBitsLoad(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+	results := [2]int64{}
+	want := [2]int64{0, 6}
+	m := simd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
+	simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
+	for i := range 2 {
+		if results[i] != want[i] {
+			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
+		}
+	}
+}
+
 func TestBitMaskToBits(t *testing.T) {
 	if !simd.HasAVX512() {
 		t.Skip("Test requires HasAVX512, not available on this hardware")
@@ -358,6 +376,19 @@ func TestBitMaskToBits(t *testing.T) {
 	}
 }
 
+var maskForTestBitMaskFromBitsStore uint8
+
+func TestBitMaskToBitsStore(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+	maskForTestBitMaskFromBitsStore = simd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
+	if maskForTestBitMaskFromBitsStore != 0b101 {
+		t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
+	}
+}
+
 func TestMergeFloat(t *testing.T) {
 	k := make([]int64, 4, 4)
 	s := make([]float64, 4, 4)
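
A rough sketch (not part of the patch) of the kind of user code the new rules are aimed at, mirroring the tests above. The package name, function name, and variable are hypothetical; the simd calls are assumed to behave as in the tests. The intent of the KMOVBstore/KMOVBload rules is that mask bits stored to and reloaded from memory use KMOV instructions with memory operands directly, rather than going through a general-purpose register and a separate MOV.

package simd_example // hypothetical example package

import "simd"

// storedBits lives in memory, so the mask bits below must be stored and reloaded.
var storedBits uint8

func roundTripMaskBits(v []int16) simd.Mask64x2 {
	// ToBits feeding this store is intended to lower to a KMOVB store,
	// and the load feeding Mask64x2FromBits to a KMOVB load.
	storedBits = simd.LoadInt16x8Slice(v).ToMask().ToBits()
	return simd.Mask64x2FromBits(storedBits)
}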