// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Lowering arithmetic
(Add64 x y) -> (ADD x y)
(AddPtr x y) -> (ADD x y)
(Add32 x y) -> (ADDW x y)
(Add16 x y) -> (ADDW x y)
(Add8 x y) -> (ADDW x y)
(Add32F x y) -> (FADDS x y)
(Add64F x y) -> (FADD x y)
(Sub64 x y) -> (SUB x y)
(SubPtr x y) -> (SUB x y)
(Sub32 x y) -> (SUBW x y)
(Sub16 x y) -> (SUBW x y)
(Sub8 x y) -> (SUBW x y)
(Sub32F x y) -> (FSUBS x y)
(Sub64F x y) -> (FSUB x y)
(Mul64 x y) -> (MULLD x y)
(Mul32 x y) -> (MULLW x y)
(Mul16 x y) -> (MULLW x y)
(Mul8 x y) -> (MULLW x y)
(Mul32F x y) -> (FMULS x y)
(Mul64F x y) -> (FMUL x y)
(Div32F x y) -> (FDIVS x y)
(Div64F x y) -> (FDIV x y)
(Div64 x y) -> (DIVD x y)
(Div64u x y) -> (DIVDU x y)
// DIVW/DIVWU has a 64-bit dividend and a 32-bit divisor,
// so a sign/zero extension of the dividend is required.
(Div32 x y) -> (DIVW (MOVWreg x) y)
(Div32u x y) -> (DIVWU (MOVWZreg x) y)
(Div16 x y) -> (DIVW (MOVHreg x) (MOVHreg y))
(Div16u x y) -> (DIVWU (MOVHZreg x) (MOVHZreg y))
(Div8 x y) -> (DIVW (MOVBreg x) (MOVBreg y))
(Div8u x y) -> (DIVWU (MOVBZreg x) (MOVBZreg y))
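// Illustrative Go model of the extension requirement (not a rewrite rule):
// the narrow Go division is recovered by widening the operands first and
// narrowing the quotient afterwards.
//
//	func div16(x, y int16) int16 {
//		return int16(int64(x) / int64(y)) // DIVW on sign-extended operands
//	}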
(Hmul64 x y) -> (MULHD x y)
(Hmul64u x y) -> (MULHDU x y)
(Hmul32 x y) -> (SRDconst [32] (MULLD (MOVWreg x) (MOVWreg y)))
(Hmul32u x y) -> (SRDconst [32] (MULLD (MOVWZreg x) (MOVWZreg y)))
(Mod64 x y) -> (MODD x y)
(Mod64u x y) -> (MODDU x y)
// MODW/MODWU has a 64-bit dividend and a 32-bit divisor,
// so a sign/zero extension of the dividend is required.
(Mod32 x y) -> (MODW (MOVWreg x) y)
(Mod32u x y) -> (MODWU (MOVWZreg x) y)
(Mod16 x y) -> (MODW (MOVHreg x) (MOVHreg y))
(Mod16u x y) -> (MODWU (MOVHZreg x) (MOVHZreg y))
(Mod8 x y) -> (MODW (MOVBreg x) (MOVBreg y))
(Mod8u x y) -> (MODWU (MOVBZreg x) (MOVBZreg y))
// (x + y) / 2 with x>=y -> (x - y) / 2 + y
(Avg64u <t> x y) -> (ADD (SRDconst <t> (SUB <t> x y) [1]) y)
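// Go model of the Avg64u rule (illustration only): computing (x+y)/2
// directly can overflow 64 bits, but the rule relies on x >= y (see the
// comment above), so x-y cannot overflow and (x-y)/2 + y gives the same result.
//
//	func avg64u(x, y uint64) uint64 { // assumes x >= y
//		return (x-y)>>1 + y
//	}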
(And64 x y) -> (AND x y)
(And32 x y) -> (ANDW x y)
(And16 x y) -> (ANDW x y)
(And8 x y) -> (ANDW x y)
(Or64 x y) -> (OR x y)
(Or32 x y) -> (ORW x y)
(Or16 x y) -> (ORW x y)
(Or8 x y) -> (ORW x y)
(Xor64 x y) -> (XOR x y)
(Xor32 x y) -> (XORW x y)
(Xor16 x y) -> (XORW x y)
(Xor8 x y) -> (XORW x y)
(Neg64 x) -> (NEG x)
(Neg32 x) -> (NEGW x)
(Neg16 x) -> (NEGW (MOVHreg x))
(Neg8 x) -> (NEGW (MOVBreg x))
(Neg32F x) -> (FNEGS x)
(Neg64F x) -> (FNEG x)
(Com64 x) -> (NOT x)
(Com32 x) -> (NOTW x)
(Com16 x) -> (NOTW x)
(Com8 x) -> (NOTW x)
(NOT x) && true -> (XOR (MOVDconst [-1]) x)
(NOTW x) && true -> (XORWconst [-1] x)
// Lowering boolean ops
(AndB x y) -> (ANDW x y)
(OrB x y) -> (ORW x y)
(Not x) -> (XORWconst [1] x)
// Lowering pointer arithmetic
(OffPtr [off] ptr:(SP)) -> (MOVDaddr [off] ptr)
(OffPtr [off] ptr) && is32Bit(off) -> (ADDconst [off] ptr)
(OffPtr [off] ptr) -> (ADD (MOVDconst [off]) ptr)
// Ctz(x) = 64 - findLeftmostOne((x-1)&^x)
(Ctz64 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (AND <t> (SUBconst <t> [1] x) (NOT <t> x))))
(Ctz32 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW <t> (SUBWconst <t> [1] x) (NOTW <t> x)))))
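// Go sketch of the identity above, assuming FLOGR counts leading zeros
// (FLOGR of 0 yields 64):
//
//	import "math/bits"
//
//	func ctz64(x uint64) int {
//		z := (x - 1) &^ x // mask covering exactly the trailing zeros of x
//		return 64 - bits.LeadingZeros64(z)
//	}
//
// e.g. x = 0b1000: z = 0b0111, LeadingZeros64(z) = 61, result 3.
// The BitLen64 rule below is the same identity applied to x directly.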
(BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x))
(Bswap64 x) -> (MOVDBR x)
(Bswap32 x) -> (MOVWBR x)
(Sqrt x) -> (FSQRT x)
// Atomic loads.
(AtomicLoad32 ptr mem) -> (MOVWZatomicload ptr mem)
(AtomicLoad64 ptr mem) -> (MOVDatomicload ptr mem)
(AtomicLoadPtr ptr mem) -> (MOVDatomicload ptr mem)
// Atomic stores.
(AtomicStore32 ptr val mem) -> (MOVWatomicstore ptr val mem)
(AtomicStore64 ptr val mem) -> (MOVDatomicstore ptr val mem)
(AtomicStorePtrNoWB ptr val mem) -> (MOVDatomicstore ptr val mem)
// Atomic adds.
(AtomicAdd32 ptr val mem) -> (AddTupleFirst32 (LAA ptr val mem) val)
(AtomicAdd64 ptr val mem) -> (AddTupleFirst64 (LAAG ptr val mem) val)
(Select0 <t> (AddTupleFirst32 tuple val)) -> (ADDW val (Select0 <t> tuple))
(Select1 (AddTupleFirst32 tuple _ )) -> (Select1 tuple)
(Select0 <t> (AddTupleFirst64 tuple val)) -> (ADD val (Select0 <t> tuple))
(Select1 (AddTupleFirst64 tuple _ )) -> (Select1 tuple)
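// LAA/LAAG return the value the memory location held *before* the add,
// while Go's atomic add returns the value *after* it, so the AddTupleFirst
// rules re-add val to the loaded result. Go-level illustration:
//
//	// new := atomic.AddUint32(p, val) is compiled roughly as:
//	// old := LAA(p, val) // hardware fetch-and-add, returns the old value
//	// new := old + val   // Select0 (AddTupleFirst32 tuple val)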
// Atomic exchanges.
(AtomicExchange32 ptr val mem) -> (LoweredAtomicExchange32 ptr val mem)
(AtomicExchange64 ptr val mem) -> (LoweredAtomicExchange64 ptr val mem)
// Atomic compare and swap.
(AtomicCompareAndSwap32 ptr old new_ mem) -> (LoweredAtomicCas32 ptr old new_ mem)
(AtomicCompareAndSwap64 ptr old new_ mem) -> (LoweredAtomicCas64 ptr old new_ mem)
// Lowering extension
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to16 x) -> (MOVBreg x)
(SignExt8to32 x) -> (MOVBreg x)
(SignExt8to64 x) -> (MOVBreg x)
(SignExt16to32 x) -> (MOVHreg x)
(SignExt16to64 x) -> (MOVHreg x)
(SignExt32to64 x) -> (MOVWreg x)
(ZeroExt8to16 x) -> (MOVBZreg x)
(ZeroExt8to32 x) -> (MOVBZreg x)
(ZeroExt8to64 x) -> (MOVBZreg x)
(ZeroExt16to32 x) -> (MOVHZreg x)
(ZeroExt16to64 x) -> (MOVHZreg x)
(ZeroExt32to64 x) -> (MOVWZreg x)
(Slicemask <t> x) -> (SRADconst (NEG <t> x) [63])
// Lowering truncation
// Because we ignore high parts of registers, truncates are just copies.
(Trunc16to8 x) -> x
(Trunc32to8 x) -> x
(Trunc32to16 x) -> x
(Trunc64to8 x) -> x
(Trunc64to16 x) -> x
(Trunc64to32 x) -> x
// Lowering float <-> int
(Cvt32to32F x) -> (CEFBRA x)
(Cvt32to64F x) -> (CDFBRA x)
(Cvt64to32F x) -> (CEGBRA x)
(Cvt64to64F x) -> (CDGBRA x)
(Cvt32Fto32 x) -> (CFEBRA x)
(Cvt32Fto64 x) -> (CGEBRA x)
(Cvt64Fto32 x) -> (CFDBRA x)
(Cvt64Fto64 x) -> (CGDBRA x)
(Cvt32Fto64F x) -> (LDEBR x)
(Cvt64Fto32F x) -> (LEDBR x)
(Round32F x) -> (LoweredRound32F x)
(Round64F x) -> (LoweredRound64F x)
// Lowering shifts
// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
(Lsh64x64 <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPUconst y [63])))
(Lsh64x32 <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPWUconst y [63])))
(Lsh64x16 <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVHZreg y) [63])))
(Lsh64x8 <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVBZreg y) [63])))
(Lsh32x64 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
(Lsh32x32 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
(Lsh32x16 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
(Lsh32x8 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
(Lsh16x64 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
(Lsh16x32 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
(Lsh16x16 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
(Lsh16x8 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
(Lsh8x64 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
(Lsh8x32 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
(Lsh8x16 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
(Lsh8x8 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
(Rsh64Ux64 <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPUconst y [63])))
(Rsh64Ux32 <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPWUconst y [63])))
(Rsh64Ux16 <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVHZreg y) [63])))
(Rsh64Ux8 <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVBZreg y) [63])))
(Rsh32Ux64 <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
(Rsh32Ux32 <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
(Rsh32Ux16 <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
(Rsh32Ux8 <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
(Rsh16Ux64 <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPUconst y [15])))
(Rsh16Ux32 <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPWUconst y [15])))
(Rsh16Ux16 <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [15])))
(Rsh16Ux8 <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [15])))
(Rsh8Ux64 <t> x y) -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPUconst y [7])))
(Rsh8Ux32 <t> x y) -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPWUconst y [7])))
(Rsh8Ux16 <t> x y) -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [7])))
(Rsh8Ux8 <t> x y) -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [7])))
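// Go model of the unsigned shift rules above (illustration only). Per the
// flag rules further down, SUBE*carrymask is -1 when the compare yields
// EQ/LT and 0 when it yields GT, so the mask is all ones iff y <= width-1:
//
//	func lsh64(x, y uint64) uint64 {
//		var mask uint64
//		if y <= 63 {
//			mask = ^uint64(0) // SUBEcarrymask (CMPUconst y [63])
//		}
//		return (x << (y & 63)) & mask // SLD uses the low 6 bits of y
//	}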
// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
// We implement this by setting the shift amount to -1 (all ones) when it is >= the width,
// so the machine shift produces pure sign fill.
(Rsh64x64 <t> x y) -> (SRAD <t> x (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [63])))))
(Rsh64x32 <t> x y) -> (SRAD <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [63])))))
(Rsh64x16 <t> x y) -> (SRAD <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [63])))))
(Rsh64x8 <t> x y) -> (SRAD <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [63])))))
(Rsh32x64 <t> x y) -> (SRAW <t> x (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [31])))))
(Rsh32x32 <t> x y) -> (SRAW <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [31])))))
(Rsh32x16 <t> x y) -> (SRAW <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [31])))))
(Rsh32x8 <t> x y) -> (SRAW <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [31])))))
(Rsh16x64 <t> x y) -> (SRAW <t> (MOVHreg x) (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [15])))))
(Rsh16x32 <t> x y) -> (SRAW <t> (MOVHreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [15])))))
(Rsh16x16 <t> x y) -> (SRAW <t> (MOVHreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [15])))))
(Rsh16x8 <t> x y) -> (SRAW <t> (MOVHreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [15])))))
(Rsh8x64 <t> x y) -> (SRAW <t> (MOVBreg x) (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [7])))))
(Rsh8x32 <t> x y) -> (SRAW <t> (MOVBreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [7])))))
(Rsh8x16 <t> x y) -> (SRAW <t> (MOVBreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [7])))))
(Rsh8x8 <t> x y) -> (SRAW <t> (MOVBreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [7])))))
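// Go model of the signed case (illustration only): instead of masking the
// result, the shift amount itself is forced to all ones when out of range,
// so SRAD/SRAW produce the required 0/-1 sign fill:
//
//	func rsh64x64(x int64, y uint64) int64 {
//		if y > 63 {
//			y = 63 // y | ^carrymask turns the amount into all ones
//		}
//		return x >> y
//	}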
// Lowering comparisons
(Less64 x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Less32 x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
(Less16 x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVHreg x) (MOVHreg y)))
(Less8 x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(Less64U x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
(Less32U x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
(Less16U x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVHZreg x) (MOVHZreg y)))
(Less8U x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVBZreg x) (MOVBZreg y)))
// Use a greater-than comparison with reversed operands to dodge the NaN case.
(Less64F x y) -> (MOVDGTnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMP y x))
(Less32F x y) -> (MOVDGTnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMPS y x))
(Leq64 x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Leq32 x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
(Leq16 x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVHreg x) (MOVHreg y)))
(Leq8 x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(Leq64U x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
(Leq32U x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
(Leq16U x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVHZreg x) (MOVHZreg y)))
(Leq8U x y) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVBZreg x) (MOVBZreg y)))
// Use a greater-or-equal comparison with reversed operands to dodge the NaN case.
(Leq64F x y) -> (MOVDGEnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMP y x))
(Leq32F x y) -> (MOVDGEnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMPS y x))
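// Only the "no invert" conditional moves for the ordered greater and
// greater-equal conditions exist here, so Less/Leq are computed as
// Greater/Geq with the operands swapped; unordered compares (NaNs) then
// select 0 (false). Hypothetical Go model of the condition-code mapping:
//
//	// 0 = equal, 1 = low, 2 = high, 3 = unordered (hypothetical helper)
//	func fcmp(a, b float64) int {
//		switch {
//		case a != a || b != b:
//			return 3
//		case a == b:
//			return 0
//		case a < b:
//			return 1
//		}
//		return 2
//	}
//
//	func less64F(x, y float64) bool { return fcmp(y, x) == 2 } // MOVDGTnoinv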
(Greater64 x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Greater32 x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
(Greater16 x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVHreg x) (MOVHreg y)))
(Greater8 x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(Greater64U x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
(Greater32U x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
(Greater16U x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVHZreg x) (MOVHZreg y)))
(Greater8U x y) -> (MOVDGT (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVBZreg x) (MOVBZreg y)))
(Greater64F x y) -> (MOVDGTnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
(Greater32F x y) -> (MOVDGTnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
(Geq64 x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Geq32 x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
(Geq16 x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVHreg x) (MOVHreg y)))
(Geq8 x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(Geq64U x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMPU x y))
(Geq32U x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMPWU x y))
(Geq16U x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVHZreg x) (MOVHZreg y)))
(Geq8U x y) -> (MOVDGE (MOVDconst [0]) (MOVDconst [1]) (CMPU (MOVBZreg x) (MOVBZreg y)))
(Geq64F x y) -> (MOVDGEnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
(Geq32F x y) -> (MOVDGEnoinv (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
(Eq64 x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Eq32 x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
(Eq16 x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVHreg x) (MOVHreg y)))
(Eq8 x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(EqB x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(EqPtr x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Eq64F x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
(Eq32F x y) -> (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
(Neq64 x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Neq32 x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (CMPW x y))
(Neq16 x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVHreg x) (MOVHreg y)))
(Neq8 x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(NeqB x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (CMP (MOVBreg x) (MOVBreg y)))
(NeqPtr x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
(Neq64F x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (FCMP x y))
(Neq32F x y) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (FCMPS x y))
// Lowering loads
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVDload ptr mem)
(Load <t> ptr mem) && is32BitInt(t) && isSigned(t) -> (MOVWload ptr mem)
(Load <t> ptr mem) && is32BitInt(t) && !isSigned(t) -> (MOVWZload ptr mem)
(Load <t> ptr mem) && is16BitInt(t) && isSigned(t) -> (MOVHload ptr mem)
(Load <t> ptr mem) && is16BitInt(t) && !isSigned(t) -> (MOVHZload ptr mem)
(Load <t> ptr mem) && is8BitInt(t) && isSigned(t) -> (MOVBload ptr mem)
(Load <t> ptr mem) && (t.IsBoolean() || (is8BitInt(t) && !isSigned(t))) -> (MOVBZload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) -> (FMOVSload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) -> (FMOVDload ptr mem)
// Lowering stores
// These more specific FP versions of the Store pattern must come first.
(Store {t} ptr val mem) && t.(Type).Size() == 8 && is64BitFloat(val.Type) -> (FMOVDstore ptr val mem)
(Store {t} ptr val mem) && t.(Type).Size() == 4 && is32BitFloat(val.Type) -> (FMOVSstore ptr val mem)
(Store {t} ptr val mem) && t.(Type).Size() == 8 -> (MOVDstore ptr val mem)
(Store {t} ptr val mem) && t.(Type).Size() == 4 -> (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.(Type).Size() == 2 -> (MOVHstore ptr val mem)
(Store {t} ptr val mem) && t.(Type).Size() == 1 -> (MOVBstore ptr val mem)
// Lowering moves
// Load and store for small copies.
(Move [0] _ _ mem) -> mem
(Move [1] dst src mem) -> (MOVBstore dst (MOVBZload src mem) mem)
(Move [2] dst src mem) -> (MOVHstore dst (MOVHZload src mem) mem)
(Move [4] dst src mem) -> (MOVWstore dst (MOVWZload src mem) mem)
(Move [8] dst src mem) -> (MOVDstore dst (MOVDload src mem) mem)
(Move [16] dst src mem) ->
(MOVDstore [8] dst (MOVDload [8] src mem)
(MOVDstore dst (MOVDload src mem) mem))
(Move [24] dst src mem) ->
(MOVDstore [16] dst (MOVDload [16] src mem)
(MOVDstore [8] dst (MOVDload [8] src mem)
(MOVDstore dst (MOVDload src mem) mem)))
(Move [3] dst src mem) ->
(MOVBstore [2] dst (MOVBZload [2] src mem)
(MOVHstore dst (MOVHZload src mem) mem))
(Move [5] dst src mem) ->
(MOVBstore [4] dst (MOVBZload [4] src mem)
(MOVWstore dst (MOVWZload src mem) mem))
(Move [6] dst src mem) ->
(MOVHstore [4] dst (MOVHZload [4] src mem)
(MOVWstore dst (MOVWZload src mem) mem))
(Move [7] dst src mem) ->
(MOVBstore [6] dst (MOVBZload [6] src mem)
(MOVHstore [4] dst (MOVHZload [4] src mem)
(MOVWstore dst (MOVWZload src mem) mem)))
// MVC for other moves. Use up to 4 instructions (sizes up to 1024 bytes).
(Move [s] dst src mem) && s > 0 && s <= 256 ->
(MVC [makeValAndOff(s, 0)] dst src mem)
(Move [s] dst src mem) && s > 256 && s <= 512 ->
(MVC [makeValAndOff(s-256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))
(Move [s] dst src mem) && s > 512 && s <= 768 ->
(MVC [makeValAndOff(s-512, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem)))
(Move [s] dst src mem) && s > 768 && s <= 1024 ->
(MVC [makeValAndOff(s-768, 768)] dst src (MVC [makeValAndOff(256, 512)] dst src (MVC [makeValAndOff(256, 256)] dst src (MVC [makeValAndOff(256, 0)] dst src mem))))
// Move more than 1024 bytes using a loop.
(Move [s] dst src mem) && s > 1024 ->
(LoweredMove [s%256] dst src (ADDconst <src.Type> src [(s/256)*256]) mem)
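// Sketch of the decomposition performed by the MVC rules above
// (illustration only): each MVC copies at most 256 bytes, the partial
// chunk goes last at the highest offset, and sizes over 1024 leave the
// 256-byte chunks to a loop with an s%256-byte tail:
//
//	for off := int64(0); off < s; off += 256 {
//		n := s - off
//		if n > 256 {
//			n = 256
//		}
//		// emit MVC [makeValAndOff(n, off)] dst src ...
//	}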
// Lowering Zero instructions
(Zero [0] _ mem) -> mem
(Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
(Zero [2] destptr mem) -> (MOVHstoreconst [0] destptr mem)
(Zero [4] destptr mem) -> (MOVWstoreconst [0] destptr mem)
(Zero [8] destptr mem) -> (MOVDstoreconst [0] destptr mem)
(Zero [3] destptr mem) ->
(MOVBstoreconst [makeValAndOff(0,2)] destptr
(MOVHstoreconst [0] destptr mem))
(Zero [5] destptr mem) ->
(MOVBstoreconst [makeValAndOff(0,4)] destptr
(MOVWstoreconst [0] destptr mem))
(Zero [6] destptr mem) ->
(MOVHstoreconst [makeValAndOff(0,4)] destptr
(MOVWstoreconst [0] destptr mem))
(Zero [7] destptr mem) ->
(MOVWstoreconst [makeValAndOff(0,3)] destptr
(MOVWstoreconst [0] destptr mem))
(Zero [s] destptr mem) && s > 0 && s <= 1024 ->
(CLEAR [makeValAndOff(s, 0)] destptr mem)
// Move more than 1024 bytes using a loop.
(Zero [s] destptr mem) && s > 1024 ->
(LoweredZero [s%256] destptr (ADDconst <destptr.Type> destptr [(s/256)*256]) mem)
// Lowering constants
(Const8 [val]) -> (MOVDconst [val])
(Const16 [val]) -> (MOVDconst [val])
(Const32 [val]) -> (MOVDconst [val])
(Const64 [val]) -> (MOVDconst [val])
(Const32F [val]) -> (FMOVSconst [val])
(Const64F [val]) -> (FMOVDconst [val])
(ConstNil) -> (MOVDconst [0])
(ConstBool [b]) -> (MOVDconst [b])
// Lowering calls
(StaticCall [argwid] {target} mem) -> (CALLstatic [argwid] {target} mem)
(ClosureCall [argwid] entry closure mem) -> (CALLclosure [argwid] entry closure mem)
(InterCall [argwid] entry mem) -> (CALLinter [argwid] entry mem)
// Miscellaneous
(Convert <t> x mem) -> (MOVDconvert <t> x mem)
(IsNonNil p) -> (MOVDNE (MOVDconst [0]) (MOVDconst [1]) (CMPconst p [0]))
(IsInBounds idx len) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
(IsSliceInBounds idx len) -> (MOVDLE (MOVDconst [0]) (MOVDconst [1]) (CMPU idx len))
(NilCheck ptr mem) -> (LoweredNilCheck ptr mem)
(GetG mem) -> (LoweredGetG mem)
(GetClosurePtr) -> (LoweredGetClosurePtr)
(Addr {sym} base) -> (MOVDaddr {sym} base)
(ITab (Load ptr mem)) -> (MOVDload ptr mem)
// block rewrites
(If (MOVDLT (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (LT cmp yes no)
(If (MOVDLE (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (LE cmp yes no)
(If (MOVDGT (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (GT cmp yes no)
(If (MOVDGE (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (GE cmp yes no)
(If (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (EQ cmp yes no)
(If (MOVDNE (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (NE cmp yes no)
// Special case for floating point - LF/LEF not generated.
(If (MOVDGTnoinv (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (GTF cmp yes no)
(If (MOVDGEnoinv (MOVDconst [0]) (MOVDconst [1]) cmp) yes no) -> (GEF cmp yes no)
(If cond yes no) -> (NE (CMPWconst [0] (MOVBZreg <types.Bool> cond)) yes no)
// ***************************
// Above: lowering rules
// Below: optimizations
// ***************************
// TODO: Should the optimizations be a separate pass?
// Fold unnecessary type conversions.
(MOVDreg <t> x) && t.Compare(x.Type) == CMPeq -> x
(MOVDnop <t> x) && t.Compare(x.Type) == CMPeq -> x
// Propagate constants through type conversions.
(MOVDreg (MOVDconst [c])) -> (MOVDconst [c])
(MOVDnop (MOVDconst [c])) -> (MOVDconst [c])
// If a register move has only 1 use, just use the same register without emitting an instruction.
// MOVDnop doesn't emit an instruction; it exists only to ensure the type.
(MOVDreg x) && x.Uses == 1 -> (MOVDnop x)
// Fold type changes into loads.
(MOVDreg <t> x:(MOVBZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBZload <t> [off] {sym} ptr mem)
(MOVDreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBload <t> [off] {sym} ptr mem)
(MOVDreg <t> x:(MOVHZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHZload <t> [off] {sym} ptr mem)
(MOVDreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHload <t> [off] {sym} ptr mem)
(MOVDreg <t> x:(MOVWZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWZload <t> [off] {sym} ptr mem)
(MOVDreg <t> x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWload <t> [off] {sym} ptr mem)
(MOVDreg <t> x:(MOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVDload <t> [off] {sym} ptr mem)
(MOVDnop <t> x:(MOVBZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBZload <t> [off] {sym} ptr mem)
(MOVDnop <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBload <t> [off] {sym} ptr mem)
(MOVDnop <t> x:(MOVHZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHZload <t> [off] {sym} ptr mem)
(MOVDnop <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHload <t> [off] {sym} ptr mem)
(MOVDnop <t> x:(MOVWZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWZload <t> [off] {sym} ptr mem)
(MOVDnop <t> x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWload <t> [off] {sym} ptr mem)
(MOVDnop <t> x:(MOVDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVDload <t> [off] {sym} ptr mem)
// TODO(mundaym): uncomment rules once signed indexed loads are added.
(MOVDreg <t> x:(MOVBZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBZloadidx <t> [off] {sym} ptr idx mem)
//(MOVDreg <t> x:(MOVBloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBloadidx <t> [off] {sym} ptr idx mem)
(MOVDreg <t> x:(MOVHZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHZloadidx <t> [off] {sym} ptr idx mem)
//(MOVDreg <t> x:(MOVHloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHloadidx <t> [off] {sym} ptr idx mem)
(MOVDreg <t> x:(MOVWZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWZloadidx <t> [off] {sym} ptr idx mem)
//(MOVDreg <t> x:(MOVWloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWloadidx <t> [off] {sym} ptr idx mem)
(MOVDreg <t> x:(MOVDloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVDloadidx <t> [off] {sym} ptr idx mem)
(MOVDnop <t> x:(MOVBZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBZloadidx <t> [off] {sym} ptr idx mem)
//(MOVDnop <t> x:(MOVBloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBloadidx <t> [off] {sym} ptr idx mem)
(MOVDnop <t> x:(MOVHZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHZloadidx <t> [off] {sym} ptr idx mem)
//(MOVDnop <t> x:(MOVHloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHloadidx <t> [off] {sym} ptr idx mem)
(MOVDnop <t> x:(MOVWZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWZloadidx <t> [off] {sym} ptr idx mem)
//(MOVDnop <t> x:(MOVWloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWloadidx <t> [off] {sym} ptr idx mem)
(MOVDnop <t> x:(MOVDloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVDloadidx <t> [off] {sym} ptr idx mem)
// Fold sign extensions into conditional moves of constants.
// Designed to remove the MOVBZreg inserted by the If lowering.
(MOVBZreg x:(MOVDLT (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
(MOVBZreg x:(MOVDLE (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
(MOVBZreg x:(MOVDGT (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
(MOVBZreg x:(MOVDGE (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
(MOVBZreg x:(MOVDEQ (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
(MOVBZreg x:(MOVDNE (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
(MOVBZreg x:(MOVDGTnoinv (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
(MOVBZreg x:(MOVDGEnoinv (MOVDconst [c]) (MOVDconst [d]) _)) && int64(uint8(c)) == c && int64(uint8(d)) == d -> (MOVDreg x)
// Fold boolean tests into blocks.
(NE (CMPWconst [0] (MOVDLT (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (LT cmp yes no)
(NE (CMPWconst [0] (MOVDLE (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (LE cmp yes no)
(NE (CMPWconst [0] (MOVDGT (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (GT cmp yes no)
(NE (CMPWconst [0] (MOVDGE (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (GE cmp yes no)
(NE (CMPWconst [0] (MOVDEQ (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (EQ cmp yes no)
(NE (CMPWconst [0] (MOVDNE (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (NE cmp yes no)
(NE (CMPWconst [0] (MOVDGTnoinv (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (GTF cmp yes no)
(NE (CMPWconst [0] (MOVDGEnoinv (MOVDconst [0]) (MOVDconst [1]) cmp)) yes no) -> (GEF cmp yes no)
// Fold constants into instructions.
(ADD x (MOVDconst [c])) && is32Bit(c) -> (ADDconst [c] x)
(ADDW x (MOVDconst [c])) -> (ADDWconst [c] x)
(SUB x (MOVDconst [c])) && is32Bit(c) -> (SUBconst x [c])
(SUB (MOVDconst [c]) x) && is32Bit(c) -> (NEG (SUBconst <v.Type> x [c]))
(SUBW x (MOVDconst [c])) -> (SUBWconst x [c])
(SUBW (MOVDconst [c]) x) -> (NEGW (SUBWconst <v.Type> x [c]))
(MULLD x (MOVDconst [c])) && is32Bit(c) -> (MULLDconst [c] x)
(MULLW x (MOVDconst [c])) -> (MULLWconst [c] x)
// NILF instructions leave the high 32 bits unchanged, which is
// equivalent to ANDing with a mask whose leftmost 32 bits are set.
// TODO(mundaym): modify the assembler to accept 64-bit values
// and use isU32Bit(^c).
(AND x (MOVDconst [c])) && is32Bit(c) && c < 0 -> (ANDconst [c] x)
(ANDW x (MOVDconst [c])) -> (ANDWconst [c] x)
(ANDWconst [c] (ANDWconst [d] x)) -> (ANDWconst [c & d] x)
(ANDconst [c] (ANDconst [d] x)) -> (ANDconst [c & d] x)
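// Worked example of the c < 0 guard above (illustration only): a 32-bit
// constant with its sign bit set extends to a 64-bit mask whose high 32
// bits are all ones, exactly the form NILF implements.
//
//	const c = int64(-256) // 0xFFFFFFFFFFFFFF00
//	// AND x, c  ==  NILF x, $0xFFFFFF00 (high 32 bits left unchanged)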
(OR x (MOVDconst [c])) && isU32Bit(c) -> (ORconst [c] x)
(ORW x (MOVDconst [c])) -> (ORWconst [c] x)
(XOR x (MOVDconst [c])) && isU32Bit(c) -> (XORconst [c] x)
(XORW x (MOVDconst [c])) -> (XORWconst [c] x)
(SLD x (MOVDconst [c])) -> (SLDconst [c&63] x)
(SLW x (MOVDconst [c])) -> (SLWconst [c&63] x)
(SRD x (MOVDconst [c])) -> (SRDconst [c&63] x)
(SRW x (MOVDconst [c])) -> (SRWconst [c&63] x)
(SRAD x (MOVDconst [c])) -> (SRADconst [c&63] x)
(SRAW x (MOVDconst [c])) -> (SRAWconst [c&63] x)
(SRAW x (ANDWconst [63] y)) -> (SRAW x y)
(SRAD x (ANDconst [63] y)) -> (SRAD x y)
(SLW x (ANDWconst [63] y)) -> (SLW x y)
(SLD x (ANDconst [63] y)) -> (SLD x y)
(SRW x (ANDWconst [63] y)) -> (SRW x y)
(SRD x (ANDconst [63] y)) -> (SRD x y)
// Rotate generation
(ADD (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (RLLGconst [c] x)
( OR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (RLLGconst [c] x)
(XOR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (RLLGconst [c] x)
(ADDW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c -> (RLLconst [c] x)
( ORW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c -> (RLLconst [c] x)
(XORW (SLWconst x [c]) (SRWconst x [d])) && d == 32-c -> (RLLconst [c] x)
(CMP x (MOVDconst [c])) && is32Bit(c) -> (CMPconst x [c])
(CMP (MOVDconst [c]) x) && is32Bit(c) -> (InvertFlags (CMPconst x [c]))
(CMPW x (MOVDconst [c])) -> (CMPWconst x [c])
(CMPW (MOVDconst [c]) x) -> (InvertFlags (CMPWconst x [c]))
(CMPU x (MOVDconst [c])) && is32Bit(c) -> (CMPUconst x [int64(uint32(c))])
(CMPU (MOVDconst [c]) x) && is32Bit(c) -> (InvertFlags (CMPUconst x [int64(uint32(c))]))
(CMPWU x (MOVDconst [c])) -> (CMPWUconst x [int64(uint32(c))])
(CMPWU (MOVDconst [c]) x) -> (InvertFlags (CMPWUconst x [int64(uint32(c))]))
// Using MOV{W,H,B}Zreg instead of AND is cheaper.
(AND x (MOVDconst [0xFF])) -> (MOVBZreg x)
(AND x (MOVDconst [0xFFFF])) -> (MOVHZreg x)
(AND x (MOVDconst [0xFFFFFFFF])) -> (MOVWZreg x)
(ANDWconst [0xFF] x) -> (MOVBZreg x)
(ANDWconst [0xFFFF] x) -> (MOVHZreg x)
// strength reduction
(MULLDconst [-1] x) -> (NEG x)
(MULLDconst [0] _) -> (MOVDconst [0])
(MULLDconst [1] x) -> x
(MULLDconst [c] x) && isPowerOfTwo(c) -> (SLDconst [log2(c)] x)
(MULLDconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUB (SLDconst <v.Type> [log2(c+1)] x) x)
(MULLDconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADD (SLDconst <v.Type> [log2(c-1)] x) x)
(MULLWconst [-1] x) -> (NEGW x)
(MULLWconst [0] _) -> (MOVDconst [0])
(MULLWconst [1] x) -> x
(MULLWconst [c] x) && isPowerOfTwo(c) -> (SLWconst [log2(c)] x)
(MULLWconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBW (SLWconst <v.Type> [log2(c+1)] x) x)
(MULLWconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (ADDW (SLWconst <v.Type> [log2(c-1)] x) x)
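// Examples of the strength reduction cases above (illustration only):
//
//	// x * 8  -> x << 3                (isPowerOfTwo(8))
//	// x * 31 -> (x << 5) - x          (isPowerOfTwo(31+1) && 31 >= 15)
//	// x * 33 -> (x << 5) + x          (isPowerOfTwo(33-1) && 33 >= 17)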
// Fold ADD into MOVDaddr. Odd offsets from SB shouldn't be folded (LARL can't handle them).
(ADDconst [c] (MOVDaddr [d] {s} x:(SB))) && ((c+d)&1 == 0) && is32Bit(c+d) -> (MOVDaddr [c+d] {s} x)
(ADDconst [c] (MOVDaddr [d] {s} x)) && x.Op != OpSB && is20Bit(c+d) -> (MOVDaddr [c+d] {s} x)
(ADD x (MOVDaddr [c] {s} y)) && x.Op != OpSB && y.Op != OpSB -> (MOVDaddridx [c] {s} x y)
// fold ADDconst into MOVDaddrx
(ADDconst [c] (MOVDaddridx [d] {s} x y)) && is20Bit(c+d) -> (MOVDaddridx [c+d] {s} x y)
(MOVDaddridx [c] {s} (ADDconst [d] x) y) && is20Bit(c+d) && x.Op != OpSB -> (MOVDaddridx [c+d] {s} x y)
(MOVDaddridx [c] {s} x (ADDconst [d] y)) && is20Bit(c+d) && y.Op != OpSB -> (MOVDaddridx [c+d] {s} x y)
// reverse ordering of compare instruction
(MOVDLT x y (InvertFlags cmp)) -> (MOVDGT x y cmp)
(MOVDGT x y (InvertFlags cmp)) -> (MOVDLT x y cmp)
(MOVDLE x y (InvertFlags cmp)) -> (MOVDGE x y cmp)
(MOVDGE x y (InvertFlags cmp)) -> (MOVDLE x y cmp)
(MOVDEQ x y (InvertFlags cmp)) -> (MOVDEQ x y cmp)
(MOVDNE x y (InvertFlags cmp)) -> (MOVDNE x y cmp)
// don't extend after proper load
(MOVBreg x:(MOVBload _ _)) -> (MOVDreg x)
(MOVBZreg x:(MOVBZload _ _)) -> (MOVDreg x)
(MOVHreg x:(MOVBload _ _)) -> (MOVDreg x)
(MOVHreg x:(MOVBZload _ _)) -> (MOVDreg x)
(MOVHreg x:(MOVHload _ _)) -> (MOVDreg x)
(MOVHZreg x:(MOVBZload _ _)) -> (MOVDreg x)
(MOVHZreg x:(MOVHZload _ _)) -> (MOVDreg x)
(MOVWreg x:(MOVBload _ _)) -> (MOVDreg x)
(MOVWreg x:(MOVBZload _ _)) -> (MOVDreg x)
(MOVWreg x:(MOVHload _ _)) -> (MOVDreg x)
(MOVWreg x:(MOVHZload _ _)) -> (MOVDreg x)
(MOVWreg x:(MOVWload _ _)) -> (MOVDreg x)
(MOVWZreg x:(MOVBZload _ _)) -> (MOVDreg x)
(MOVWZreg x:(MOVHZload _ _)) -> (MOVDreg x)
(MOVWZreg x:(MOVWZload _ _)) -> (MOVDreg x)
// don't extend if argument is already extended
(MOVBreg x:(Arg <t>)) && is8BitInt(t) && isSigned(t) -> (MOVDreg x)
(MOVBZreg x:(Arg <t>)) && is8BitInt(t) && !isSigned(t) -> (MOVDreg x)
(MOVHreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && isSigned(t) -> (MOVDreg x)
(MOVHZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t)) && !isSigned(t) -> (MOVDreg x)
(MOVWreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && isSigned(t) -> (MOVDreg x)
(MOVWZreg x:(Arg <t>)) && (is8BitInt(t) || is16BitInt(t) || is32BitInt(t)) && !isSigned(t) -> (MOVDreg x)
// fold double extensions
(MOVBreg x:(MOVBreg _)) -> (MOVDreg x)
(MOVBZreg x:(MOVBZreg _)) -> (MOVDreg x)
(MOVHreg x:(MOVBreg _)) -> (MOVDreg x)
(MOVHreg x:(MOVBZreg _)) -> (MOVDreg x)
(MOVHreg x:(MOVHreg _)) -> (MOVDreg x)
(MOVHZreg x:(MOVBZreg _)) -> (MOVDreg x)
(MOVHZreg x:(MOVHZreg _)) -> (MOVDreg x)
(MOVWreg x:(MOVBreg _)) -> (MOVDreg x)
(MOVWreg x:(MOVBZreg _)) -> (MOVDreg x)
(MOVWreg x:(MOVHreg _)) -> (MOVDreg x)
(MOVWreg x:(MOVHZreg _)) -> (MOVDreg x)
(MOVWreg x:(MOVWreg _)) -> (MOVDreg x)
(MOVWZreg x:(MOVBZreg _)) -> (MOVDreg x)
(MOVWZreg x:(MOVHZreg _)) -> (MOVDreg x)
(MOVWZreg x:(MOVWZreg _)) -> (MOVDreg x)
// fold extensions into constants
(MOVBreg (MOVDconst [c])) -> (MOVDconst [int64(int8(c))])
(MOVBZreg (MOVDconst [c])) -> (MOVDconst [int64(uint8(c))])
(MOVHreg (MOVDconst [c])) -> (MOVDconst [int64(int16(c))])
(MOVHZreg (MOVDconst [c])) -> (MOVDconst [int64(uint16(c))])
(MOVWreg (MOVDconst [c])) -> (MOVDconst [int64(int32(c))])
(MOVWZreg (MOVDconst [c])) -> (MOVDconst [int64(uint32(c))])
// sign extended loads
// Note: The combined instruction must end up in the same block
// as the original load. If not, we end up making a value with
// memory type live in two different blocks, which can lead to
// multiple memory values alive simultaneously.
// Make sure we don't combine these ops if the load has another use.
// This prevents a single load from being split into multiple loads
// which then might return different values. See test/atomicload.go.
(MOVBreg x:(MOVBZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
(MOVBZreg x:(MOVBZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBZload <v.Type> [off] {sym} ptr mem)
(MOVHreg x:(MOVHZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHload <v.Type> [off] {sym} ptr mem)
(MOVHZreg x:(MOVHZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHZload <v.Type> [off] {sym} ptr mem)
(MOVWreg x:(MOVWZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
(MOVWZreg x:(MOVWZload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWZload <v.Type> [off] {sym} ptr mem)
(MOVBZreg x:(MOVBZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBZloadidx <v.Type> [off] {sym} ptr idx mem)
(MOVHZreg x:(MOVHZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVHZloadidx <v.Type> [off] {sym} ptr idx mem)
(MOVWZreg x:(MOVWZloadidx [off] {sym} ptr idx mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWZloadidx <v.Type> [off] {sym} ptr idx mem)
// replace load from same location as preceding store with copy
(MOVBZload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBZreg x)
(MOVHZload [off] {sym} ptr (MOVHstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVHZreg x)
(MOVWZload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWZreg x)
(MOVDload [off] {sym} ptr (MOVDstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVDreg x)
// Don't extend before storing
(MOVWstore [off] {sym} ptr (MOVWreg x) mem) -> (MOVWstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVHreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVBreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVWstore [off] {sym} ptr (MOVWZreg x) mem) -> (MOVWstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVHZreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVBZreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
// Fold constants into memory operations.
// Note that this is not always a good idea because if not all the uses of
// the ADDconst get eliminated, we still have to compute the ADDconst and we now
// have potentially two live values (ptr and (ADDconst [off] ptr)) instead of one.
// Nevertheless, let's do it!
(MOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVDload [off1+off2] {sym} ptr mem)
(MOVWload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVWload [off1+off2] {sym} ptr mem)
(MOVHload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVHload [off1+off2] {sym} ptr mem)
(MOVBload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVBload [off1+off2] {sym} ptr mem)
(MOVWZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVWZload [off1+off2] {sym} ptr mem)
(MOVHZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVHZload [off1+off2] {sym} ptr mem)
(MOVBZload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (MOVBZload [off1+off2] {sym} ptr mem)
(FMOVSload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (FMOVSload [off1+off2] {sym} ptr mem)
(FMOVDload [off1] {sym} (ADDconst [off2] ptr) mem) && is20Bit(off1+off2) -> (FMOVDload [off1+off2] {sym} ptr mem)
(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVDstore [off1+off2] {sym} ptr val mem)
(MOVWstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVWstore [off1+off2] {sym} ptr val mem)
(MOVHstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVHstore [off1+off2] {sym} ptr val mem)
(MOVBstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (MOVBstore [off1+off2] {sym} ptr val mem)
(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (FMOVSstore [off1+off2] {sym} ptr val mem)
(FMOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is20Bit(off1+off2) -> (FMOVDstore [off1+off2] {sym} ptr val mem)
// Fold constants into stores.
(MOVDstore [off] {sym} ptr (MOVDconst [c]) mem) && validValAndOff(c,off) && int64(int16(c)) == c && ptr.Op != OpSB ->
(MOVDstoreconst [makeValAndOff(c,off)] {sym} ptr mem)
(MOVWstore [off] {sym} ptr (MOVDconst [c]) mem) && validOff(off) && int64(int16(c)) == c && ptr.Op != OpSB ->
(MOVWstoreconst [makeValAndOff(int64(int32(c)),off)] {sym} ptr mem)
(MOVHstore [off] {sym} ptr (MOVDconst [c]) mem) && validOff(off) && ptr.Op != OpSB ->
(MOVHstoreconst [makeValAndOff(int64(int16(c)),off)] {sym} ptr mem)
(MOVBstore [off] {sym} ptr (MOVDconst [c]) mem) && validOff(off) && ptr.Op != OpSB ->
(MOVBstoreconst [makeValAndOff(int64(int8(c)),off)] {sym} ptr mem)
// Fold address offsets into constant stores.
(MOVDstoreconst [sc] {s} (ADDconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
(MOVDstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
(MOVWstoreconst [sc] {s} (ADDconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
(MOVWstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
(MOVHstoreconst [sc] {s} (ADDconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
(MOVHstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
(MOVBstoreconst [sc] {s} (ADDconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
(MOVBstoreconst [ValAndOff(sc).add(off)] {s} ptr mem)
// We need to fold MOVDaddr into the MOVx ops so that the live variable analysis knows
// what variables are being read/written by the ops.
(MOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVWZload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVWZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVHZload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVHZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVBZload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVBZload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(FMOVSload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVSload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVHload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVWload [off1] {sym1} (MOVDaddr [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVWstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVHstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVBstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(FMOVDstore [off1] {sym1} (MOVDaddr [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVDstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
(MOVDstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
(MOVWstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
(MOVWstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
(MOVHstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
(MOVHstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
(MOVBstoreconst [sc] {sym1} (MOVDaddr [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
(MOVBstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
// generating indexed loads and stores
(MOVBZload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVBZloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
(MOVHZload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVHZloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
(MOVWZload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVWZloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
(MOVDload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVDloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
(FMOVSload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVSloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
(FMOVDload [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVDloadidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx mem)
(MOVBstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVBstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
(MOVHstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVHstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
(MOVWstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVWstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
(MOVDstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVDstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
(FMOVSstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVSstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
(FMOVDstore [off1] {sym1} (MOVDaddridx [off2] {sym2} ptr idx) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(FMOVDstoreidx [off1+off2] {mergeSym(sym1,sym2)} ptr idx val mem)
(MOVBZload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVBZloadidx [off] {sym} ptr idx mem)
(MOVHZload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVHZloadidx [off] {sym} ptr idx mem)
(MOVWZload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVWZloadidx [off] {sym} ptr idx mem)
(MOVDload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (MOVDloadidx [off] {sym} ptr idx mem)
(FMOVSload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (FMOVSloadidx [off] {sym} ptr idx mem)
(FMOVDload [off] {sym} (ADD ptr idx) mem) && ptr.Op != OpSB -> (FMOVDloadidx [off] {sym} ptr idx mem)
(MOVBstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVBstoreidx [off] {sym} ptr idx val mem)
(MOVHstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVHstoreidx [off] {sym} ptr idx val mem)
(MOVWstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVWstoreidx [off] {sym} ptr idx val mem)
(MOVDstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (MOVDstoreidx [off] {sym} ptr idx val mem)
(FMOVSstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (FMOVSstoreidx [off] {sym} ptr idx val mem)
(FMOVDstore [off] {sym} (ADD ptr idx) val mem) && ptr.Op != OpSB -> (FMOVDstoreidx [off] {sym} ptr idx val mem)
// combine ADD into indexed loads and stores
(MOVBZloadidx [c] {sym} (ADDconst [d] ptr) idx mem) -> (MOVBZloadidx [c+d] {sym} ptr idx mem)
(MOVHZloadidx [c] {sym} (ADDconst [d] ptr) idx mem) -> (MOVHZloadidx [c+d] {sym} ptr idx mem)
(MOVWZloadidx [c] {sym} (ADDconst [d] ptr) idx mem) -> (MOVWZloadidx [c+d] {sym} ptr idx mem)
(MOVDloadidx [c] {sym} (ADDconst [d] ptr) idx mem) -> (MOVDloadidx [c+d] {sym} ptr idx mem)
(FMOVSloadidx [c] {sym} (ADDconst [d] ptr) idx mem) -> (FMOVSloadidx [c+d] {sym} ptr idx mem)
(FMOVDloadidx [c] {sym} (ADDconst [d] ptr) idx mem) -> (FMOVDloadidx [c+d] {sym} ptr idx mem)
(MOVBstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) -> (MOVBstoreidx [c+d] {sym} ptr idx val mem)
(MOVHstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) -> (MOVHstoreidx [c+d] {sym} ptr idx val mem)
(MOVWstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) -> (MOVWstoreidx [c+d] {sym} ptr idx val mem)
(MOVDstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) -> (MOVDstoreidx [c+d] {sym} ptr idx val mem)
(FMOVSstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) -> (FMOVSstoreidx [c+d] {sym} ptr idx val mem)
(FMOVDstoreidx [c] {sym} (ADDconst [d] ptr) idx val mem) -> (FMOVDstoreidx [c+d] {sym} ptr idx val mem)
(MOVBZloadidx [c] {sym} ptr (ADDconst [d] idx) mem) -> (MOVBZloadidx [c+d] {sym} ptr idx mem)
(MOVHZloadidx [c] {sym} ptr (ADDconst [d] idx) mem) -> (MOVHZloadidx [c+d] {sym} ptr idx mem)
(MOVWZloadidx [c] {sym} ptr (ADDconst [d] idx) mem) -> (MOVWZloadidx [c+d] {sym} ptr idx mem)
(MOVDloadidx [c] {sym} ptr (ADDconst [d] idx) mem) -> (MOVDloadidx [c+d] {sym} ptr idx mem)
(FMOVSloadidx [c] {sym} ptr (ADDconst [d] idx) mem) -> (FMOVSloadidx [c+d] {sym} ptr idx mem)
(FMOVDloadidx [c] {sym} ptr (ADDconst [d] idx) mem) -> (FMOVDloadidx [c+d] {sym} ptr idx mem)
(MOVBstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) -> (MOVBstoreidx [c+d] {sym} ptr idx val mem)
(MOVHstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) -> (MOVHstoreidx [c+d] {sym} ptr idx val mem)
(MOVWstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) -> (MOVWstoreidx [c+d] {sym} ptr idx val mem)
(MOVDstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) -> (MOVDstoreidx [c+d] {sym} ptr idx val mem)
(FMOVSstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) -> (FMOVSstoreidx [c+d] {sym} ptr idx val mem)
(FMOVDstoreidx [c] {sym} ptr (ADDconst [d] idx) val mem) -> (FMOVDstoreidx [c+d] {sym} ptr idx val mem)
// MOVDaddr into MOVDaddridx
(MOVDaddridx [off1] {sym1} (MOVDaddr [off2] {sym2} x) y) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
(MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
(MOVDaddridx [off1] {sym1} x (MOVDaddr [off2] {sym2} y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && y.Op != OpSB ->
(MOVDaddridx [off1+off2] {mergeSym(sym1,sym2)} x y)
// Absorb InvertFlags into branches.
(LT (InvertFlags cmp) yes no) -> (GT cmp yes no)
(GT (InvertFlags cmp) yes no) -> (LT cmp yes no)
(LE (InvertFlags cmp) yes no) -> (GE cmp yes no)
(GE (InvertFlags cmp) yes no) -> (LE cmp yes no)
(EQ (InvertFlags cmp) yes no) -> (EQ cmp yes no)
(NE (InvertFlags cmp) yes no) -> (NE cmp yes no)
// Constant comparisons.
(CMPconst (MOVDconst [x]) [y]) && x==y -> (FlagEQ)
(CMPconst (MOVDconst [x]) [y]) && x<y -> (FlagLT)
(CMPconst (MOVDconst [x]) [y]) && x>y -> (FlagGT)
(CMPUconst (MOVDconst [x]) [y]) && uint64(x)==uint64(y) -> (FlagEQ)
(CMPUconst (MOVDconst [x]) [y]) && uint64(x)<uint64(y) -> (FlagLT)
(CMPUconst (MOVDconst [x]) [y]) && uint64(x)>uint64(y) -> (FlagGT)
(CMPWconst (MOVDconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
(CMPWconst (MOVDconst [x]) [y]) && int32(x)<int32(y) -> (FlagLT)
(CMPWconst (MOVDconst [x]) [y]) && int32(x)>int32(y) -> (FlagGT)
(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)==uint32(y) -> (FlagEQ)
(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) -> (FlagLT)
(CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) -> (FlagGT)
// Other known comparisons.
(CMPconst (MOVBZreg _) [c]) && 0xFF < c -> (FlagLT)
(CMPconst (MOVHZreg _) [c]) && 0xFFFF < c -> (FlagLT)
(CMPconst (MOVWZreg _) [c]) && 0xFFFFFFFF < c -> (FlagLT)
(CMPWconst (SRWconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) -> (FlagLT)
(CMPconst (SRDconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 64 && (1<<uint64(64-c)) <= uint64(n) -> (FlagLT)
(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n -> (FlagLT)
(CMPWconst (ANDWconst _ [m]) [n]) && 0 <= int32(m) && int32(m) < int32(n) -> (FlagLT)
// Absorb flag constants into SUBE*carrymask ops.
(SUBEcarrymask (FlagEQ)) -> (MOVDconst [-1])
(SUBEcarrymask (FlagLT)) -> (MOVDconst [-1])
(SUBEcarrymask (FlagGT)) -> (MOVDconst [0])
(SUBEWcarrymask (FlagEQ)) -> (MOVDconst [-1])
(SUBEWcarrymask (FlagLT)) -> (MOVDconst [-1])
(SUBEWcarrymask (FlagGT)) -> (MOVDconst [0])
// Absorb flag constants into branches.
(EQ (FlagEQ) yes no) -> (First nil yes no)
(EQ (FlagLT) yes no) -> (First nil no yes)
(EQ (FlagGT) yes no) -> (First nil no yes)
(NE (FlagEQ) yes no) -> (First nil no yes)
(NE (FlagLT) yes no) -> (First nil yes no)
(NE (FlagGT) yes no) -> (First nil yes no)
(LT (FlagEQ) yes no) -> (First nil no yes)
(LT (FlagLT) yes no) -> (First nil yes no)
(LT (FlagGT) yes no) -> (First nil no yes)
(LE (FlagEQ) yes no) -> (First nil yes no)
(LE (FlagLT) yes no) -> (First nil yes no)
(LE (FlagGT) yes no) -> (First nil no yes)
(GT (FlagEQ) yes no) -> (First nil no yes)
(GT (FlagLT) yes no) -> (First nil no yes)
(GT (FlagGT) yes no) -> (First nil yes no)
(GE (FlagEQ) yes no) -> (First nil yes no)
(GE (FlagLT) yes no) -> (First nil no yes)
(GE (FlagGT) yes no) -> (First nil yes no)
// Absorb flag constants into conditional moves.
(MOVDEQ _ x (FlagEQ)) -> x
(MOVDEQ y _ (FlagLT)) -> y
(MOVDEQ y _ (FlagGT)) -> y
(MOVDNE y _ (FlagEQ)) -> y
(MOVDNE _ x (FlagLT)) -> x
(MOVDNE _ x (FlagGT)) -> x
(MOVDLT y _ (FlagEQ)) -> y
(MOVDLT _ x (FlagLT)) -> x
(MOVDLT y _ (FlagGT)) -> y
(MOVDLE _ x (FlagEQ)) -> x
(MOVDLE _ x (FlagLT)) -> x
(MOVDLE y _ (FlagGT)) -> y
(MOVDGT y _ (FlagEQ)) -> y
(MOVDGT y _ (FlagLT)) -> y
(MOVDGT _ x (FlagGT)) -> x
(MOVDGE _ x (FlagEQ)) -> x
(MOVDGE y _ (FlagLT)) -> y
(MOVDGE _ x (FlagGT)) -> x
// Remove redundant *const ops
(ADDconst [0] x) -> x
(ADDWconst [c] x) && int32(c)==0 -> x
(SUBconst [0] x) -> x
(SUBWconst [c] x) && int32(c) == 0 -> x
(ANDconst [0] _) -> (MOVDconst [0])
(ANDWconst [c] _) && int32(c)==0 -> (MOVDconst [0])
(ANDconst [-1] x) -> x
(ANDWconst [c] x) && int32(c)==-1 -> x
(ORconst [0] x) -> x
(ORWconst [c] x) && int32(c)==0 -> x
(ORconst [-1] _) -> (MOVDconst [-1])
(ORWconst [c] _) && int32(c)==-1 -> (MOVDconst [-1])
(XORconst [0] x) -> x
(XORWconst [c] x) && int32(c)==0 -> x
// Convert constant subtracts to constant adds.
(SUBconst [c] x) && c != -(1<<31) -> (ADDconst [-c] x)
(SUBWconst [c] x) -> (ADDWconst [int64(int32(-c))] x)
// generic constant folding
// TODO: more of this
(ADDconst [c] (MOVDconst [d])) -> (MOVDconst [c+d])
(ADDWconst [c] (MOVDconst [d])) -> (MOVDconst [int64(int32(c+d))])
(ADDconst [c] (ADDconst [d] x)) && is32Bit(c+d) -> (ADDconst [c+d] x)
(ADDWconst [c] (ADDWconst [d] x)) -> (ADDWconst [int64(int32(c+d))] x)
(SUBconst (MOVDconst [d]) [c]) -> (MOVDconst [d-c])
(SUBconst (SUBconst x [d]) [c]) && is32Bit(-c-d) -> (ADDconst [-c-d] x)
(SRADconst [c] (MOVDconst [d])) -> (MOVDconst [d>>uint64(c)])
(SRAWconst [c] (MOVDconst [d])) -> (MOVDconst [int64(int32(d))>>uint64(c)])
(NEG (MOVDconst [c])) -> (MOVDconst [-c])
(NEGW (MOVDconst [c])) -> (MOVDconst [int64(int32(-c))])
(MULLDconst [c] (MOVDconst [d])) -> (MOVDconst [c*d])
(MULLWconst [c] (MOVDconst [d])) -> (MOVDconst [int64(int32(c*d))])
(AND (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c&d])
(ANDconst [c] (MOVDconst [d])) -> (MOVDconst [c&d])
(ANDWconst [c] (MOVDconst [d])) -> (MOVDconst [c&d])
(OR (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c|d])
(ORconst [c] (MOVDconst [d])) -> (MOVDconst [c|d])
(ORWconst [c] (MOVDconst [d])) -> (MOVDconst [c|d])
(XOR (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c^d])
(XORconst [c] (MOVDconst [d])) -> (MOVDconst [c^d])
(XORWconst [c] (MOVDconst [d])) -> (MOVDconst [c^d])
(LoweredRound32F x:(FMOVSconst)) -> x
(LoweredRound64F x:(FMOVDconst)) -> x
// generic simplifications
// TODO: more of this
(ADD x (NEG y)) -> (SUB x y)
(ADDW x (NEGW y)) -> (SUBW x y)
(SUB x x) -> (MOVDconst [0])
(SUBW x x) -> (MOVDconst [0])
(AND x x) -> x
(ANDW x x) -> x
(OR x x) -> x
(ORW x x) -> x
(XOR x x) -> (MOVDconst [0])
(XORW x x) -> (MOVDconst [0])
(NEG (ADDconst [c] (NEG x))) && c != -(1<<31) -> (ADDconst [-c] x)
// fused multiply-add
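// For example, the Go expression x + y*z (float64) matches the first
// rule below and compiles to a single FMADD instruction; the FADDS/FMULS
// pair does the same for float32 with FMADDS.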
(FADD (FMUL y z) x) -> (FMADD x y z)
(FADDS (FMULS y z) x) -> (FMADDS x y z)
(FSUB (FMUL y z) x) -> (FMSUB x y z)
(FSUBS (FMULS y z) x) -> (FMSUBS x y z)
// Fold loads into arithmetic and logical operations.
// Exclude global data (SB) because these instructions cannot handle relative addresses.
// TODO(mundaym): use LARL in the assembler to handle SB?
// TODO(mundaym): indexed versions of these?
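// For example, in "sum += *p" the (MOVDload p) feeding the ADD can be
// folded into an ADDload, turning a separate load plus register add into
// a single add-from-memory instruction.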
(ADD <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ADDload <t> [off] {sym} x ptr mem)
(ADD <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ADDload <t> [off] {sym} x ptr mem)
(ADDW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ADDWload <t> [off] {sym} x ptr mem)
(ADDW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ADDWload <t> [off] {sym} x ptr mem)
(ADDW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ADDWload <t> [off] {sym} x ptr mem)
(ADDW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ADDWload <t> [off] {sym} x ptr mem)
(MULLD <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (MULLDload <t> [off] {sym} x ptr mem)
(MULLD <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (MULLDload <t> [off] {sym} x ptr mem)
(MULLW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (MULLWload <t> [off] {sym} x ptr mem)
(MULLW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (MULLWload <t> [off] {sym} x ptr mem)
(MULLW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (MULLWload <t> [off] {sym} x ptr mem)
(MULLW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (MULLWload <t> [off] {sym} x ptr mem)
(SUB <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (SUBload <t> [off] {sym} x ptr mem)
(SUBW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (SUBWload <t> [off] {sym} x ptr mem)
(SUBW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (SUBWload <t> [off] {sym} x ptr mem)
(AND <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ANDload <t> [off] {sym} x ptr mem)
(AND <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ANDload <t> [off] {sym} x ptr mem)
(ANDW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ANDWload <t> [off] {sym} x ptr mem)
(ANDW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ANDWload <t> [off] {sym} x ptr mem)
(ANDW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ANDWload <t> [off] {sym} x ptr mem)
(ANDW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ANDWload <t> [off] {sym} x ptr mem)
(OR <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ORload <t> [off] {sym} x ptr mem)
(OR <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ORload <t> [off] {sym} x ptr mem)
(ORW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ORWload <t> [off] {sym} x ptr mem)
(ORW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ORWload <t> [off] {sym} x ptr mem)
(ORW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ORWload <t> [off] {sym} x ptr mem)
(ORW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (ORWload <t> [off] {sym} x ptr mem)
(XOR <t> x g:(MOVDload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (XORload <t> [off] {sym} x ptr mem)
(XOR <t> g:(MOVDload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (XORload <t> [off] {sym} x ptr mem)
(XORW <t> x g:(MOVWload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (XORWload <t> [off] {sym} x ptr mem)
(XORW <t> g:(MOVWload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (XORWload <t> [off] {sym} x ptr mem)
(XORW <t> x g:(MOVWZload [off] {sym} ptr mem)) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (XORWload <t> [off] {sym} x ptr mem)
(XORW <t> g:(MOVWZload [off] {sym} ptr mem) x) && ptr.Op != OpSB && is20Bit(off) && canMergeLoad(v, g, x) && clobber(g)
-> (XORWload <t> [off] {sym} x ptr mem)
// Combine constant stores into larger (unaligned) stores.
// This doesn't work for global data (based on SB)
// because STGRL doesn't support unaligned addresses.
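// For example, a constant byte store of 2 at offset 0 followed by a
// constant byte store of 1 at offset 1 merges into a single halfword
// store of 0x0201 at offset 0 (s390x is big-endian, so the byte at the
// lower address becomes the high byte of the combined value).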
(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
&& p.Op != OpSB
&& x.Uses == 1
&& ValAndOff(a).Off() + 1 == ValAndOff(c).Off()
&& clobber(x)
-> (MOVHstoreconst [makeValAndOff(ValAndOff(c).Val()&0xff | ValAndOff(a).Val()<<8, ValAndOff(a).Off())] {s} p mem)
(MOVHstoreconst [c] {s} p x:(MOVHstoreconst [a] {s} p mem))
&& p.Op != OpSB
&& x.Uses == 1
&& ValAndOff(a).Off() + 2 == ValAndOff(c).Off()
&& clobber(x)
-> (MOVWstoreconst [makeValAndOff(ValAndOff(c).Val()&0xffff | ValAndOff(a).Val()<<16, ValAndOff(a).Off())] {s} p mem)
(MOVWstoreconst [c] {s} p x:(MOVWstoreconst [a] {s} p mem))
&& p.Op != OpSB
&& x.Uses == 1
&& ValAndOff(a).Off() + 4 == ValAndOff(c).Off()
&& clobber(x)
-> (MOVDstore [ValAndOff(a).Off()] {s} p (MOVDconst [ValAndOff(c).Val()&0xffffffff | ValAndOff(a).Val()<<32]) mem)
// Combine stores into larger (unaligned) stores.
// It doesn't work on global data (based on SB) because stores with relative addressing
// require that the memory operand be aligned.
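// For example, the four byte stores emitted for a big-endian encoding
// such as binary.BigEndian.PutUint32(p, w) combine pairwise into
// halfword stores and finally into a single (unaligned) MOVWstore of w.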
(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRDconst [8] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHstore [i-1] {s} p w mem)
(MOVBstore [i] {s} p w0:(SRDconst [j] w) x:(MOVBstore [i-1] {s} p (SRDconst [j+8] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHstore [i-1] {s} p w0 mem)
(MOVBstore [i] {s} p w x:(MOVBstore [i-1] {s} p (SRWconst [8] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHstore [i-1] {s} p w mem)
(MOVBstore [i] {s} p w0:(SRWconst [j] w) x:(MOVBstore [i-1] {s} p (SRWconst [j+8] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHstore [i-1] {s} p w0 mem)
(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRDconst [16] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVWstore [i-2] {s} p w mem)
(MOVHstore [i] {s} p w0:(SRDconst [j] w) x:(MOVHstore [i-2] {s} p (SRDconst [j+16] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVWstore [i-2] {s} p w0 mem)
(MOVHstore [i] {s} p w x:(MOVHstore [i-2] {s} p (SRWconst [16] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVWstore [i-2] {s} p w mem)
(MOVHstore [i] {s} p w0:(SRWconst [j] w) x:(MOVHstore [i-2] {s} p (SRWconst [j+16] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVWstore [i-2] {s} p w0 mem)
(MOVWstore [i] {s} p (SRDconst [32] w) x:(MOVWstore [i-4] {s} p w mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVDstore [i-4] {s} p w mem)
(MOVWstore [i] {s} p w0:(SRDconst [j] w) x:(MOVWstore [i-4] {s} p (SRDconst [j+32] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVDstore [i-4] {s} p w0 mem)
(MOVBstoreidx [i] {s} p idx w x:(MOVBstoreidx [i-1] {s} p idx (SRDconst [8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHstoreidx [i-1] {s} p idx w mem)
(MOVBstoreidx [i] {s} p idx w0:(SRDconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx (SRDconst [j+8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHstoreidx [i-1] {s} p idx w0 mem)
(MOVBstoreidx [i] {s} p idx w x:(MOVBstoreidx [i-1] {s} p idx (SRWconst [8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHstoreidx [i-1] {s} p idx w mem)
(MOVBstoreidx [i] {s} p idx w0:(SRWconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx (SRWconst [j+8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHstoreidx [i-1] {s} p idx w0 mem)
(MOVHstoreidx [i] {s} p idx w x:(MOVHstoreidx [i-2] {s} p idx (SRDconst [16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstoreidx [i-2] {s} p idx w mem)
(MOVHstoreidx [i] {s} p idx w0:(SRDconst [j] w) x:(MOVHstoreidx [i-2] {s} p idx (SRDconst [j+16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstoreidx [i-2] {s} p idx w0 mem)
(MOVHstoreidx [i] {s} p idx w x:(MOVHstoreidx [i-2] {s} p idx (SRWconst [16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstoreidx [i-2] {s} p idx w mem)
(MOVHstoreidx [i] {s} p idx w0:(SRWconst [j] w) x:(MOVHstoreidx [i-2] {s} p idx (SRWconst [j+16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWstoreidx [i-2] {s} p idx w0 mem)
(MOVWstoreidx [i] {s} p idx w x:(MOVWstoreidx [i-4] {s} p idx (SRDconst [32] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVDstoreidx [i-4] {s} p idx w mem)
(MOVWstoreidx [i] {s} p idx w0:(SRDconst [j] w) x:(MOVWstoreidx [i-4] {s} p idx (SRDconst [j+32] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVDstoreidx [i-4] {s} p idx w0 mem)
// Combine stores into larger (unaligned) stores with the bytes reversed (little endian).
// Store-with-bytes-reversed instructions do not support relative memory addresses,
// so these stores can't operate on global data (SB).
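// For example, the byte stores emitted for a little-endian encoding
// such as binary.LittleEndian.PutUint32(p, w) place the low byte at the
// lowest address, so they combine into a single byte-reversed
// MOVWBRstore of w.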
(MOVBstore [i] {s} p (SRDconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstore [i-1] {s} p w mem)
(MOVBstore [i] {s} p (SRDconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRDconst [j-8] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstore [i-1] {s} p w0 mem)
(MOVBstore [i] {s} p (SRWconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstore [i-1] {s} p w mem)
(MOVBstore [i] {s} p (SRWconst [j] w) x:(MOVBstore [i-1] {s} p w0:(SRWconst [j-8] w) mem))
&& p.Op != OpSB
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstore [i-1] {s} p w0 mem)
(MOVHBRstore [i] {s} p (SRDconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstore [i-2] {s} p w mem)
(MOVHBRstore [i] {s} p (SRDconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRDconst [j-16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstore [i-2] {s} p w0 mem)
(MOVHBRstore [i] {s} p (SRWconst [16] w) x:(MOVHBRstore [i-2] {s} p w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstore [i-2] {s} p w mem)
(MOVHBRstore [i] {s} p (SRWconst [j] w) x:(MOVHBRstore [i-2] {s} p w0:(SRWconst [j-16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstore [i-2] {s} p w0 mem)
(MOVWBRstore [i] {s} p (SRDconst [32] w) x:(MOVWBRstore [i-4] {s} p w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVDBRstore [i-4] {s} p w mem)
(MOVWBRstore [i] {s} p (SRDconst [j] w) x:(MOVWBRstore [i-4] {s} p w0:(SRDconst [j-32] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVDBRstore [i-4] {s} p w0 mem)
(MOVBstoreidx [i] {s} p idx (SRDconst [8] w) x:(MOVBstoreidx [i-1] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstoreidx [i-1] {s} p idx w mem)
(MOVBstoreidx [i] {s} p idx (SRDconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx w0:(SRDconst [j-8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstoreidx [i-1] {s} p idx w0 mem)
(MOVBstoreidx [i] {s} p idx (SRWconst [8] w) x:(MOVBstoreidx [i-1] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstoreidx [i-1] {s} p idx w mem)
(MOVBstoreidx [i] {s} p idx (SRWconst [j] w) x:(MOVBstoreidx [i-1] {s} p idx w0:(SRWconst [j-8] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVHBRstoreidx [i-1] {s} p idx w0 mem)
(MOVHBRstoreidx [i] {s} p idx (SRDconst [16] w) x:(MOVHBRstoreidx [i-2] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstoreidx [i-2] {s} p idx w mem)
(MOVHBRstoreidx [i] {s} p idx (SRDconst [j] w) x:(MOVHBRstoreidx [i-2] {s} p idx w0:(SRDconst [j-16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstoreidx [i-2] {s} p idx w0 mem)
(MOVHBRstoreidx [i] {s} p idx (SRWconst [16] w) x:(MOVHBRstoreidx [i-2] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstoreidx [i-2] {s} p idx w mem)
(MOVHBRstoreidx [i] {s} p idx (SRWconst [j] w) x:(MOVHBRstoreidx [i-2] {s} p idx w0:(SRWconst [j-16] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVWBRstoreidx [i-2] {s} p idx w0 mem)
(MOVWBRstoreidx [i] {s} p idx (SRDconst [32] w) x:(MOVWBRstoreidx [i-4] {s} p idx w mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVDBRstoreidx [i-4] {s} p idx w mem)
(MOVWBRstoreidx [i] {s} p idx (SRDconst [j] w) x:(MOVWBRstoreidx [i-4] {s} p idx w0:(SRDconst [j-32] w) mem))
&& x.Uses == 1
&& clobber(x)
-> (MOVDBRstoreidx [i-4] {s} p idx w0 mem)
// Combine byte loads into larger (unaligned) loads.
// Big-endian loads
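// For example, a big-endian decode such as
//	uint16(p[1]) | uint16(p[0])<<8
// (binary.BigEndian.Uint16) loads two adjacent bytes, shifts the
// lower-addressed byte left by 8 and ORs the results; the first rule
// below rewrites that into a single MOVHZload.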
(ORW x1:(MOVBZload [i1] {s} p mem)
sh:(SLWconst [8] x0:(MOVBZload [i0] {s} p mem)))
&& i1 == i0+1
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
(OR x1:(MOVBZload [i1] {s} p mem)
sh:(SLDconst [8] x0:(MOVBZload [i0] {s} p mem)))
&& i1 == i0+1
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZload [i0] {s} p mem)
(ORW x1:(MOVHZload [i1] {s} p mem)
sh:(SLWconst [16] x0:(MOVHZload [i0] {s} p mem)))
&& i1 == i0+2
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
(OR x1:(MOVHZload [i1] {s} p mem)
sh:(SLDconst [16] x0:(MOVHZload [i0] {s} p mem)))
&& i1 == i0+2
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWZload [i0] {s} p mem)
(OR x1:(MOVWZload [i1] {s} p mem)
sh:(SLDconst [32] x0:(MOVWZload [i0] {s} p mem)))
&& i1 == i0+4
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVDload [i0] {s} p mem)
(ORW
s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
or:(ORW
s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
y))
&& i1 == i0+1
&& j1 == j0-8
&& j1 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (ORW <v.Type> (SLWconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
(OR
s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
or:(OR
s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
y))
&& i1 == i0+1
&& j1 == j0-8
&& j1 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVHZload [i0] {s} p mem)) y)
(OR
s0:(SLDconst [j0] x0:(MOVHZload [i0] {s} p mem))
or:(OR
s1:(SLDconst [j1] x1:(MOVHZload [i1] {s} p mem))
y))
&& i1 == i0+2
&& j1 == j0-16
&& j1 % 32 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVWZload [i0] {s} p mem)) y)
// Big-endian indexed loads
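// These mirror the byte-combining rules above, but for the indexed
// addressing forms: loads like (MOVBZloadidx [i] p idx) merge into
// MOVHZloadidx, MOVWZloadidx and MOVDloadidx in the same way.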
(ORW x1:(MOVBZloadidx [i1] {s} p idx mem)
sh:(SLWconst [8] x0:(MOVBZloadidx [i0] {s} p idx mem)))
&& i1 == i0+1
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZloadidx [i0] {s} p idx mem)
(OR x1:(MOVBZloadidx [i1] {s} p idx mem)
sh:(SLDconst [8] x0:(MOVBZloadidx [i0] {s} p idx mem)))
&& i1 == i0+1
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZloadidx [i0] {s} p idx mem)
(ORW x1:(MOVHZloadidx [i1] {s} p idx mem)
sh:(SLWconst [16] x0:(MOVHZloadidx [i0] {s} p idx mem)))
&& i1 == i0+2
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWZloadidx [i0] {s} p idx mem)
(OR x1:(MOVHZloadidx [i1] {s} p idx mem)
sh:(SLDconst [16] x0:(MOVHZloadidx [i0] {s} p idx mem)))
&& i1 == i0+2
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWZloadidx [i0] {s} p idx mem)
(OR x1:(MOVWZloadidx [i1] {s} p idx mem)
sh:(SLDconst [32] x0:(MOVWZloadidx [i0] {s} p idx mem)))
&& i1 == i0+4
&& p.Op != OpSB
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVDloadidx [i0] {s} p idx mem)
(ORW
s0:(SLWconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
or:(ORW
s1:(SLWconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
y))
&& i1 == i0+1
&& j1 == j0-8
&& j1 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (ORW <v.Type> (SLWconst <v.Type> [j1] (MOVHZloadidx [i0] {s} p idx mem)) y)
(OR
s0:(SLDconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
or:(OR
s1:(SLDconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
y))
&& i1 == i0+1
&& j1 == j0-8
&& j1 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVHZloadidx [i0] {s} p idx mem)) y)
(OR
s0:(SLDconst [j0] x0:(MOVHZloadidx [i0] {s} p idx mem))
or:(OR
s1:(SLDconst [j1] x1:(MOVHZloadidx [i1] {s} p idx mem))
y))
&& i1 == i0+2
&& j1 == j0-16
&& j1 % 32 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j1] (MOVWZloadidx [i0] {s} p idx mem)) y)
// Little-endian loads
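// These rules fuse the shift-and-OR trees produced by byte-at-a-time
// little-endian loads into a single byte-reversed load (MOVHBRload and
// friends), since s390x is big-endian. A sketch of Go source that
// lowers to the first pattern below, assuming b is a []byte with
// sufficient length:
//
//	uint16(b[0]) | uint16(b[1])<<8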
(ORW x0:(MOVBZload [i0] {s} p mem)
sh:(SLWconst [8] x1:(MOVBZload [i1] {s} p mem)))
&& i1 == i0+1
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
(OR x0:(MOVBZload [i0] {s} p mem)
sh:(SLDconst [8] x1:(MOVBZload [i1] {s} p mem)))
&& i1 == i0+1
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRload [i0] {s} p mem))
(ORW r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
sh:(SLWconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
&& i1 == i0+2
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWBRload [i0] {s} p mem)
(OR r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem))
sh:(SLDconst [16] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem))))
&& i1 == i0+2
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWZreg (MOVWBRload [i0] {s} p mem))
(OR r0:(MOVWZreg x0:(MOVWBRload [i0] {s} p mem))
sh:(SLDconst [32] r1:(MOVWZreg x1:(MOVWBRload [i1] {s} p mem))))
&& i1 == i0+4
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVDBRload [i0] {s} p mem)
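// The remaining rules in this group merge pairs of loads that sit,
// shifted, inside a wider OR tree: the conditions j1 == j0+8 and
// j0 % 16 == 0 (and their halfword analogues) keep each merged pair an
// aligned chunk of the final value, so larger loads are built up two
// at a time.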
(ORW
s1:(SLWconst [j1] x1:(MOVBZload [i1] {s} p mem))
or:(ORW
s0:(SLWconst [j0] x0:(MOVBZload [i0] {s} p mem))
y))
&& i1 == i0+1
&& j1 == j0+8
&& j0 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (ORW <v.Type> (SLWconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
(OR
s1:(SLDconst [j1] x1:(MOVBZload [i1] {s} p mem))
or:(OR
s0:(SLDconst [j0] x0:(MOVBZload [i0] {s} p mem))
y))
&& i1 == i0+1
&& j1 == j0+8
&& j0 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVHZreg (MOVHBRload [i0] {s} p mem))) y)
(OR
s1:(SLDconst [j1] r1:(MOVHZreg x1:(MOVHBRload [i1] {s} p mem)))
or:(OR
s0:(SLDconst [j0] r0:(MOVHZreg x0:(MOVHBRload [i0] {s} p mem)))
y))
&& i1 == i0+2
&& j1 == j0+16
&& j0 % 32 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVWZreg (MOVWBRload [i0] {s} p mem))) y)
// Little-endian indexed loads
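// As above, but for indexed (p+idx) addressing. A sketch of a source
// pattern, assuming b is a []byte and i is a variable index:
//
//	uint16(b[i]) | uint16(b[i+1])<<8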
(ORW x0:(MOVBZloadidx [i0] {s} p idx mem)
sh:(SLWconst [8] x1:(MOVBZloadidx [i1] {s} p idx mem)))
&& i1 == i0+1
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))
(OR x0:(MOVBZloadidx [i0] {s} p idx mem)
sh:(SLDconst [8] x1:(MOVBZloadidx [i1] {s} p idx mem)))
&& i1 == i0+1
&& x0.Uses == 1
&& x1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))
(ORW r0:(MOVHZreg x0:(MOVHBRloadidx [i0] {s} p idx mem))
sh:(SLWconst [16] r1:(MOVHZreg x1:(MOVHBRloadidx [i1] {s} p idx mem))))
&& i1 == i0+2
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWBRloadidx [i0] {s} p idx mem)
(OR r0:(MOVHZreg x0:(MOVHBRloadidx [i0] {s} p idx mem))
sh:(SLDconst [16] r1:(MOVHZreg x1:(MOVHBRloadidx [i1] {s} p idx mem))))
&& i1 == i0+2
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVWZreg (MOVWBRloadidx [i0] {s} p idx mem))
(OR r0:(MOVWZreg x0:(MOVWBRloadidx [i0] {s} p idx mem))
sh:(SLDconst [32] r1:(MOVWZreg x1:(MOVWBRloadidx [i1] {s} p idx mem))))
&& i1 == i0+4
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& sh.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(sh)
-> @mergePoint(b,x0,x1) (MOVDBRloadidx [i0] {s} p idx mem)
(ORW
s1:(SLWconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
or:(ORW
s0:(SLWconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
y))
&& i1 == i0+1
&& j1 == j0+8
&& j0 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (ORW <v.Type> (SLWconst <v.Type> [j0] (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))) y)
(OR
s1:(SLDconst [j1] x1:(MOVBZloadidx [i1] {s} p idx mem))
or:(OR
s0:(SLDconst [j0] x0:(MOVBZloadidx [i0] {s} p idx mem))
y))
&& i1 == i0+1
&& j1 == j0+8
&& j0 % 16 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVHZreg (MOVHBRloadidx [i0] {s} p idx mem))) y)
(OR
s1:(SLDconst [j1] r1:(MOVHZreg x1:(MOVHBRloadidx [i1] {s} p idx mem)))
or:(OR
s0:(SLDconst [j0] r0:(MOVHZreg x0:(MOVHBRloadidx [i0] {s} p idx mem)))
y))
&& i1 == i0+2
&& j1 == j0+16
&& j0 % 32 == 0
&& x0.Uses == 1
&& x1.Uses == 1
&& r0.Uses == 1
&& r1.Uses == 1
&& s0.Uses == 1
&& s1.Uses == 1
&& or.Uses == 1
&& mergePoint(b,x0,x1) != nil
&& clobber(x0)
&& clobber(x1)
&& clobber(r0)
&& clobber(r1)
&& clobber(s0)
&& clobber(s1)
&& clobber(or)
-> @mergePoint(b,x0,x1) (OR <v.Type> (SLDconst <v.Type> [j0] (MOVWZreg (MOVWBRloadidx [i0] {s} p idx mem))) y)
// Combine stores into store multiples.
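// STM2/STM3/STM4 (and the 64-bit STMG variants) correspond to the
// STORE MULTIPLE instructions, which write several registers to
// adjacent memory locations. For example, the first rule below folds
// a store of w1 at [i] on top of a store of w0 at [i-4] into a single
// STM2 [i-4] storing w0 then w1.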
// 32-bit
(MOVWstore [i] {s} p w1 x:(MOVWstore [i-4] {s} p w0 mem))
&& p.Op != OpSB
&& x.Uses == 1
&& is20Bit(i-4)
&& clobber(x)
-> (STM2 [i-4] {s} p w0 w1 mem)
(MOVWstore [i] {s} p w2 x:(STM2 [i-8] {s} p w0 w1 mem))
&& x.Uses == 1
&& is20Bit(i-8)
&& clobber(x)
-> (STM3 [i-8] {s} p w0 w1 w2 mem)
(MOVWstore [i] {s} p w3 x:(STM3 [i-12] {s} p w0 w1 w2 mem))
&& x.Uses == 1
&& is20Bit(i-12)
&& clobber(x)
-> (STM4 [i-12] {s} p w0 w1 w2 w3 mem)
(STM2 [i] {s} p w2 w3 x:(STM2 [i-8] {s} p w0 w1 mem))
&& x.Uses == 1
&& is20Bit(i-8)
&& clobber(x)
-> (STM4 [i-8] {s} p w0 w1 w2 w3 mem)
// 64-bit
(MOVDstore [i] {s} p w1 x:(MOVDstore [i-8] {s} p w0 mem))
&& p.Op != OpSB
&& x.Uses == 1
&& is20Bit(i-8)
&& clobber(x)
-> (STMG2 [i-8] {s} p w0 w1 mem)
(MOVDstore [i] {s} p w2 x:(STMG2 [i-16] {s} p w0 w1 mem))
&& x.Uses == 1
&& is20Bit(i-16)
&& clobber(x)
-> (STMG3 [i-16] {s} p w0 w1 w2 mem)
(MOVDstore [i] {s} p w3 x:(STMG3 [i-24] {s} p w0 w1 w2 mem))
&& x.Uses == 1
&& is20Bit(i-24)
&& clobber(x)
-> (STMG4 [i-24] {s} p w0 w1 w2 w3 mem)
(STMG2 [i] {s} p w2 w3 x:(STMG2 [i-16] {s} p w0 w1 mem))
&& x.Uses == 1
&& is20Bit(i-16)
&& clobber(x)
-> (STMG4 [i-16] {s} p w0 w1 w2 w3 mem)
// Convert 32-bit store multiples into 64-bit stores.
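// On this big-endian target, storing the high word (SRDconst [32] x)
// at [i] and the low word x at [i+4] writes the same bytes as a single
// 64-bit store of x at [i].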
(STM2 [i] {s} p (SRDconst [32] x) x mem) -> (MOVDstore [i] {s} p x mem)