diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go
index c917d14298d..3959f8a7c11 100644
--- a/src/cmd/compile/internal/loong64/ssa.go
+++ b/src/cmd/compile/internal/loong64/ssa.go
@@ -659,42 +659,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.To.Sym = ir.Syms.Duffcopy
 		p.To.Offset = v.AuxInt
 	case ssa.OpLOONG64LoweredMove:
-		// MOVx	(Rarg1), Rtmp
-		// MOVx	Rtmp, (Rarg0)
-		// ADDV	$sz, Rarg1
-		// ADDV	$sz, Rarg0
-		// BGEU	Rarg2, Rarg0, -4(PC)
-		mov, sz := largestMove(v.AuxInt)
-		p := s.Prog(mov)
-		p.From.Type = obj.TYPE_MEM
-		p.From.Reg = v.Args[1].Reg()
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		tmpReg := int16(loong64.REG_R20)
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Move too small %d", n)
+		}
+
+		var off int64
+		for n >= 8 {
+			// MOVV off(srcReg), tmpReg
+			// MOVV tmpReg, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off)
+			off += 8
+			n -= 8
+		}
+
+		if n != 0 {
+			// MOVV off+n-8(srcReg), tmpReg
+			// MOVV tmpReg, off+n-8(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off+n-8)
+		}
+	case ssa.OpLOONG64LoweredMoveLoop:
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		countReg := int16(loong64.REG_R20)
+		tmpReg := int16(loong64.REG_R21)
+		var off int64
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie
+			//     4 instructions to implement the loop
+			//     8 instructions in the loop body
+			//   vs
+			//     16 instructions in the straightline code
+			//   Might as well use straightline code.
+			v.Fatalf("MoveLoop size too small %d", n)
+		}
+
+		// Put iteration count in a register.
+		//   MOVV $n/loopSize, countReg
+		p := s.Prog(loong64.AMOVV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
 		p.To.Type = obj.TYPE_REG
-		p.To.Reg = loong64.REGTMP
+		p.To.Reg = countReg
+		cntInit := p
 
-		p2 := s.Prog(mov)
-		p2.From.Type = obj.TYPE_REG
-		p2.From.Reg = loong64.REGTMP
-		p2.To.Type = obj.TYPE_MEM
-		p2.To.Reg = v.Args[0].Reg()
+		// Move loopSize bytes starting at srcReg to dstReg.
+		for range loopSize / 8 {
+			// MOVV off(srcReg), tmpReg
+			// MOVV tmpReg, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off)
+			off += 8
+		}
 
-		p3 := s.Prog(loong64.AADDVU)
-		p3.From.Type = obj.TYPE_CONST
-		p3.From.Offset = sz
-		p3.To.Type = obj.TYPE_REG
-		p3.To.Reg = v.Args[1].Reg()
+		// Increment srcReg and dstReg by loopSize.
+		//   ADDV $loopSize, srcReg
+		p = s.Prog(loong64.AADDV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = srcReg
+		//   ADDV $loopSize, dstReg
+		p = s.Prog(loong64.AADDV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = dstReg
 
-		p4 := s.Prog(loong64.AADDVU)
-		p4.From.Type = obj.TYPE_CONST
-		p4.From.Offset = sz
-		p4.To.Type = obj.TYPE_REG
-		p4.To.Reg = v.Args[0].Reg()
+		// Decrement loop count.
+		//   SUBV $1, countReg
+		p = s.Prog(loong64.ASUBV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
 
-		p5 := s.Prog(loong64.ABGEU)
-		p5.From.Type = obj.TYPE_REG
-		p5.From.Reg = v.Args[2].Reg()
-		p5.Reg = v.Args[1].Reg()
-		p5.To.Type = obj.TYPE_BRANCH
-		p5.To.SetTarget(p)
+		// Jump to loop header if we're not done yet.
+		//   BNE countReg, loop header
+		p = s.Prog(loong64.ABNE)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = countReg
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		off = 0
+		// Copy any fractional portion.
+		for n >= 8 {
+			// MOVV off(srcReg), tmpReg
+			// MOVV tmpReg, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off)
+			off += 8
+			n -= 8
+		}
+
+		if n != 0 {
+			// MOVV off+n-8(srcReg), tmpReg
+			// MOVV tmpReg, off+n-8(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off+n-8)
+		}
 	case ssa.OpLOONG64CALLstatic, ssa.OpLOONG64CALLclosure, ssa.OpLOONG64CALLinter:
 		s.Call(v)
@@ -1225,6 +1302,24 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
 	return p
 }
 
+// move8 copies 8 bytes at src+off to dst+off.
+func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
+	// MOVV	off(src), tmp
+	ld := s.Prog(loong64.AMOVV)
+	ld.From.Type = obj.TYPE_MEM
+	ld.From.Reg = src
+	ld.From.Offset = off
+	ld.To.Type = obj.TYPE_REG
+	ld.To.Reg = tmp
+	// MOVV	tmp, off(dst)
+	st := s.Prog(loong64.AMOVV)
+	st.From.Type = obj.TYPE_REG
+	st.From.Reg = tmp
+	st.To.Type = obj.TYPE_MEM
+	st.To.Reg = dst
+	st.To.Offset = off
+}
+
 // zero8 zeroes 8 bytes at reg+off.
 func zero8(s *ssagen.State, reg int16, off int64) {
 	// MOVV	ZR, off(reg)
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
index cb55c16c3e7..3fa4f363f65 100644
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
@@ -419,34 +419,8 @@
 	(MOVVstore [8] dst (MOVVload [8] src mem)
 		(MOVVstore dst (MOVVload src mem) mem))
 
-// strip off fractional word move
-(Move [s] dst src mem) && s%8 != 0 && s > 16 =>
-	(Move [s%8]
-		(OffPtr <dst.Type> dst [s-s%8])
-		(OffPtr <src.Type> src [s-s%8])
-		(Move [s-s%8] dst src mem))
-
-// medium move uses a duff device
-(Move [s] dst src mem)
-	&& s%8 == 0 && s > 16 && s <= 8*128
-	&& logLargeCopy(v, s) =>
-	(DUFFCOPY [16 * (128 - s/8)] dst src mem)
-// 16 and 128 are magic constants. 16 is the number of bytes to encode:
-//	MOVV	(R20), R30
-//	ADDV	$8, R20
-//	MOVV	R30, (R21)
-//	ADDV	$8, R21
-// and 128 is the number of such blocks. See runtime/duff_loong64.s:duffcopy.
-
-// large move uses a loop
-(Move [s] dst src mem)
-	&& s%8 == 0 && s > 1024 && logLargeCopy(v, s) =>
-	(LoweredMove
-		dst
-		src
-		(ADDVconst <src.Type> src [s-8])
-		mem)
-
+(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
 
 // float <=> int register moves, with no conversion.
 // These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}.
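Editorial note: for intuition, the following is a rough pure-Go analogue of the straight-line strategy that the new OpLOONG64LoweredMove case above emits for sizes between 16 and 191 bytes. It is illustrative only and not part of the patch; the helper name moveStraightline is made up. Each loop iteration stands in for one move8 call (one MOVV load/store pair), and the final overlapping copy mirrors the off+n-8 trick used when the size is not a multiple of 8.

package main

import "fmt"

// moveStraightline mimics the code shape emitted for OpLOONG64LoweredMove:
// copy n bytes (n >= 16) in 8-byte chunks, then cover any remainder with one
// extra 8-byte copy that overlaps the previous chunk.
func moveStraightline(dst, src []byte, n int64) {
	var off int64
	for n >= 8 {
		// MOVV off(srcReg), tmpReg ; MOVV tmpReg, off(dstReg)
		copy(dst[off:off+8], src[off:off+8])
		off += 8
		n -= 8
	}
	if n != 0 {
		// MOVV off+n-8(srcReg), tmpReg ; MOVV tmpReg, off+n-8(dstReg)
		copy(dst[off+n-8:off+n], src[off+n-8:off+n])
	}
}

func main() {
	src := make([]byte, 21)
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, 21)
	moveStraightline(dst, src, 21) // 2 full chunks plus 1 overlapping tail copy
	fmt.Println(dst[16:])          // [16 17 18 19 20]
}

The overlapping tail copy avoids emitting separate 4/2/1-byte moves for the remainder; it is safe here because the lowered op only fires for copies of at least 16 bytes.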
diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
index 359cb42056a..cc6ae8fb8e6 100644
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
@@ -429,27 +429,40 @@ func init() {
 			needIntTemp: true,
 		},
 
-		// large or unaligned move
-		// arg0 = address of dst memory (in R21, changed as side effect)
-		// arg1 = address of src memory (in R20, changed as side effect)
-		// arg2 = address of the last element of src
-		// arg3 = mem
-		// auxint = alignment
+		// medium copying
+		// arg0 = address of dst memory
+		// arg1 = address of src memory
+		// arg2 = mem
+		// auxint = number of bytes to copy
 		// returns mem
-		//	MOVx	(R20), Rtmp
-		//	MOVx	Rtmp, (R21)
-		//	ADDV	$sz, R20
-		//	ADDV	$sz, R21
-		//	BGEU	Rarg2, R20, -4(PC)
 		{
 			name:      "LoweredMove",
 			aux:       "Int64",
-			argLength: 4,
+			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R21"), buildReg("R20"), gp},
-				clobbers: buildReg("R20 R21"),
+				inputs:   []regMask{gp &^ buildReg("R20"), gp &^ buildReg("R20")},
+				clobbers: buildReg("R20"),
+			},
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+		},
+
+		// large copying
+		// arg0 = address of dst memory
+		// arg1 = address of src memory
+		// arg2 = mem
+		// auxint = number of bytes to copy
+		// returns mem
+		{
+			name:      "LoweredMoveLoop",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:       []regMask{gp &^ buildReg("R20 R21"), gp &^ buildReg("R20 R21")},
+				clobbers:     buildReg("R20 R21"),
+				clobbersArg0: true,
+				clobbersArg1: true,
 			},
-			typ:            "Mem",
 			faultOnNilArg0: true,
 			faultOnNilArg1: true,
 		},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index fca7d810175..f42d64228fa 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1929,6 +1929,7 @@ const (
 	OpLOONG64DUFFCOPY
 	OpLOONG64LoweredZeroLoop
 	OpLOONG64LoweredMove
+	OpLOONG64LoweredMoveLoop
 	OpLOONG64LoweredAtomicLoad8
 	OpLOONG64LoweredAtomicLoad32
 	OpLOONG64LoweredAtomicLoad64
@@ -25986,16 +25987,31 @@ var opcodeTable = [...]opInfo{
 	{
 		name:           "LoweredMove",
 		auxType:        auxInt64,
-		argLen:         4,
+		argLen:         3,
 		faultOnNilArg0: true,
 		faultOnNilArg1: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 1048576},    // R21
-				{1, 524288},     // R20
-				{2, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+				{0, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31
+				{1, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31
 			},
-			clobbers: 1572864, // R20 R21
+			clobbers: 524288, // R20
+		},
+	},
+	{
+		name:           "LoweredMoveLoop",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31
+				{1, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31
+			},
+			clobbers:     1572864, // R20 R21
+			clobbersArg0: true,
+			clobbersArg1: true,
 		},
 	},
 	{
diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go
index ae3358e5e51..5890fe050a2 100644
--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go
+++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go
@@ -9133,62 +9133,35 @@ func rewriteValueLOONG64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s%8 != 0 && s > 16
-	// result: (Move [s%8] (OffPtr <dst.Type> dst [s-s%8]) (OffPtr <src.Type> src [s-s%8]) (Move [s-s%8] dst src mem))
+	// cond: s > 16 && s < 192 && logLargeCopy(v, s)
+	// result: (LoweredMove [s] dst src mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s%8 != 0 && s > 16) {
+		if !(s > 16 && s < 192 && logLargeCopy(v, s)) {
 			break
 		}
-		v.reset(OpMove)
-		v.AuxInt = int64ToAuxInt(s % 8)
-		v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
-		v0.AuxInt = int64ToAuxInt(s - s%8)
-		v0.AddArg(dst)
-		v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
-		v1.AuxInt = int64ToAuxInt(s - s%8)
-		v1.AddArg(src)
-		v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
-		v2.AuxInt = int64ToAuxInt(s - s%8)
-		v2.AddArg3(dst, src, mem)
-		v.AddArg3(v0, v1, v2)
-		return true
-	}
-	// match: (Move [s] dst src mem)
-	// cond: s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s)
-	// result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
-	for {
-		s := auxIntToInt64(v.AuxInt)
-		dst := v_0
-		src := v_1
-		mem := v_2
-		if !(s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s)) {
-			break
-		}
-		v.reset(OpLOONG64DUFFCOPY)
-		v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
+		v.reset(OpLOONG64LoweredMove)
+		v.AuxInt = int64ToAuxInt(s)
 		v.AddArg3(dst, src, mem)
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s%8 == 0 && s > 1024 && logLargeCopy(v, s)
-	// result: (LoweredMove dst src (ADDVconst <src.Type> src [s-8]) mem)
+	// cond: s >= 192 && logLargeCopy(v, s)
+	// result: (LoweredMoveLoop [s] dst src mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s%8 == 0 && s > 1024 && logLargeCopy(v, s)) {
+		if !(s >= 192 && logLargeCopy(v, s)) {
 			break
 		}
-		v.reset(OpLOONG64LoweredMove)
-		v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, src.Type)
-		v0.AuxInt = int64ToAuxInt(s - 8)
-		v0.AddArg(src)
-		v.AddArg4(dst, src, v0, mem)
+		v.reset(OpLOONG64LoweredMoveLoop)
+		v.AuxInt = int64ToAuxInt(s)
+		v.AddArg3(dst, src, mem)
 		return true
 	}
 	return false
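Editorial note: taken together, the rule changes partition copies by size. Up to 16 bytes keep using the existing load/store rules, 16 < s < 192 becomes a straight-line LoweredMove, and s >= 192 becomes a LoweredMoveLoop that copies 64 bytes per iteration and finishes the remainder inline, matching the Fatalf guards in ssa.go. The sketch below is illustrative only; planMove, tail, and their output format are invented for this note and are not part of the patch.

package main

import "fmt"

// planMove describes which lowering a Move of s bytes selects under the new
// rules, and how the loop form splits the work, assuming the 64-byte loopSize
// used by the OpLOONG64LoweredMoveLoop code above.
func planMove(s int64) string {
	const loopSize = 64
	switch {
	case s <= 16:
		return "existing small-move rules (load/store pairs)"
	case s < 192:
		return fmt.Sprintf("LoweredMove: %d full 8-byte moves%s", s/8, tail(s%8))
	default:
		rem := s % loopSize
		return fmt.Sprintf("LoweredMoveLoop: %d iterations x 64 bytes, then %d full 8-byte moves%s",
			s/loopSize, rem/8, tail(rem%8))
	}
}

// tail describes the overlapping copy used for a remainder that is not a
// multiple of 8 bytes.
func tail(r int64) string {
	if r == 0 {
		return ""
	}
	return fmt.Sprintf(" plus one overlapping 8-byte move for the %d-byte tail", r)
}

func main() {
	for _, s := range []int64{16, 24, 100, 192, 200, 1000} {
		fmt.Printf("s=%4d: %s\n", s, planMove(s))
	}
}

For example, a 200-byte copy runs the 64-byte loop three times (192 bytes) and then emits a single trailing 8-byte move, while a 100-byte copy is fully unrolled into twelve 8-byte moves plus one overlapping move for the final 4 bytes.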