// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ignore
// +build ignore

package main

import "strings"

// Notes:
// - Integer types live in the low portion of registers. Upper portions are junk.
// - Boolean types use the low-order byte of a register. 0=false, 1=true.
// Upper bytes are junk.
// - *const instructions may use a constant larger than the instruction can encode.
// In this case the assembler expands to multiple instructions and uses tmp
// register (R27).
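//   For example (illustrative, not the exact expansion): an ADD with a constant
//   that does not fit in the immediate field may be assembled roughly as
//     MOVD $const, R27
//     ADD  R27, Rn, Rd
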
// Suffixes encode the bit width of various instructions.
// D (double word) = 64 bit
// W (word) = 32 bit
// H (half word) = 16 bit
// HU = 16 bit unsigned
// B (byte) = 8 bit
// BU = 8 bit unsigned
// S (single) = 32 bit float
// D (double) = 64 bit float
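// For example, MOVWUload below loads a 32-bit word and zero-extends it, while
// MOVHload loads a 16-bit half word and sign-extends it.
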
// Note: registers not used in regalloc are not included in this list,
// so that regmask stays within int64
// Be careful when hand coding regmasks.

var regNamesARM64 = []string{
	"R0",
	"R1",
	"R2",
	"R3",
	"R4",
	"R5",
	"R6",
	"R7",
	"R8",
	"R9",
	"R10",
	"R11",
	"R12",
	"R13",
	"R14",
	"R15",
	"R16",
	"R17",
	"R18", // platform register, not used
	"R19",
	"R20",
	"R21",
	"R22",
	"R23",
	"R24",
	"R25",
	"R26",
	// R27 = REGTMP not used in regalloc
	"g",   // aka R28
	"R29", // frame pointer, not used
	"R30", // aka REGLINK
	"SP",  // aka R31
	"F0",
	"F1",
	"F2",
	"F3",
	"F4",
	"F5",
	"F6",
	"F7",
	"F8",
	"F9",
	"F10",
	"F11",
	"F12",
	"F13",
	"F14",
	"F15",
	"F16",
	"F17",
	"F18",
	"F19",
	"F20",
	"F21",
	"F22",
	"F23",
	"F24",
	"F25",
	"F26",
	"F27",
"F28" ,
"F29" ,
"F30" ,
"F31" ,
2016-07-21 12:42:49 -04:00
2019-10-21 14:07:50 -04:00
// If you add registers, update asyncPreempt in runtime.
	// pseudo-registers
	"SB",
}

func init() {
	// Make map from reg names to reg integers.
	if len(regNamesARM64) > 64 {
		panic("too many registers")
	}
	num := map[string]int{}
	for i, name := range regNamesARM64 {
		num[name] = i
	}
	buildReg := func(s string) regMask {
		m := regMask(0)
		for _, r := range strings.Split(s, " ") {
			if n, ok := num[r]; ok {
				m |= regMask(1) << uint(n)
				continue
			}
			panic("register " + r + " not found")
		}
		return m
	}
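	// Illustrative example: given the register order above, buildReg("R0 R1")
	// returns regMask(0b11), i.e. bit 0 set for R0 and bit 1 set for R1.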

	// Common individual register masks
	var (
		gp      = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30")
		gpg     = gp | buildReg("g")
		gpsp    = gp | buildReg("SP")
		gpspg   = gpg | buildReg("SP")
		gpspsbg = gpspg | buildReg("SB")
		fp         = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
		callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
		r0         = buildReg("R0")
		r1         = buildReg("R1")
		r2         = buildReg("R2")
		r3         = buildReg("R3")
	)

	// Common regInfo
	var (
		gp01           = regInfo{inputs: nil, outputs: []regMask{gp}}
		gp0flags1      = regInfo{inputs: []regMask{0}, outputs: []regMask{gp}}
		gp11           = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
		gp11sp         = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
		gp1flags       = regInfo{inputs: []regMask{gpg}}
		gp1flags1      = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
		gp11flags      = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp, 0}}
		gp21           = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
		gp21nog        = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
		gp21flags      = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
		gp2flags       = regInfo{inputs: []regMask{gpg, gpg}}
		gp2flags1      = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
		gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}}
		gp2load        = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
		gp22           = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}}
		gp31           = regInfo{inputs: []regMask{gpg, gpg, gpg}, outputs: []regMask{gp}}
		gpload         = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
		gpstore        = regInfo{inputs: []regMask{gpspsbg, gpg}}
		gpstore0       = regInfo{inputs: []regMask{gpspsbg}}
		gpstore2       = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
		gpxchg         = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
		gpcas          = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
		fp01           = regInfo{inputs: nil, outputs: []regMask{fp}}
		fp11           = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
		fpgp           = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
		gpfp           = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
		fp21           = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
		fp31           = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
		fp2flags       = regInfo{inputs: []regMask{fp, fp}}
		fp1flags       = regInfo{inputs: []regMask{fp}}
		fpload         = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
		fp2load        = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}}
		fpstore        = regInfo{inputs: []regMask{gpspsbg, fp}}
		fpstore2       = regInfo{inputs: []regMask{gpspsbg, gpg, fp}}
		readflags      = regInfo{inputs: nil, outputs: []regMask{gp}}
		prefreg        = regInfo{inputs: []regMask{gpspsbg}}
	)

	ops := []opData{
		// binary ops
		{name: "ADCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCS", commutative: true}, // arg0+arg1+carry, set flags.
		{name: "ADCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "ADC"}, // ZR+ZR+carry
		{name: "ADD", argLength: 2, reg: gp21, asm: "ADD", commutative: true}, // arg0 + arg1
		{name: "ADDconst", argLength: 1, reg: gp11sp, asm: "ADD", aux: "Int64"}, // arg0 + auxInt
		{name: "ADDSconstflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDS", aux: "Int64"}, // arg0+auxint, set flags.
		{name: "ADDSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDS", commutative: true}, // arg0+arg1, set flags.
		{name: "SUB", argLength: 2, reg: gp21, asm: "SUB"}, // arg0 - arg1
		{name: "SUBconst", argLength: 1, reg: gp11, asm: "SUB", aux: "Int64"}, // arg0 - auxInt
		{name: "SBCSflags", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "SBCS"}, // arg0-(arg1+borrowing), set flags.
		{name: "SUBSflags", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "SUBS"}, // arg0 - arg1, set flags.
		{name: "MUL", argLength: 2, reg: gp21, asm: "MUL", commutative: true}, // arg0 * arg1
		{name: "MULW", argLength: 2, reg: gp21, asm: "MULW", commutative: true}, // arg0 * arg1, 32-bit
		{name: "MNEG", argLength: 2, reg: gp21, asm: "MNEG", commutative: true}, // -arg0 * arg1
		{name: "MNEGW", argLength: 2, reg: gp21, asm: "MNEGW", commutative: true}, // -arg0 * arg1, 32-bit
		{name: "MULH", argLength: 2, reg: gp21, asm: "SMULH", commutative: true}, // (arg0 * arg1) >> 64, signed
		{name: "UMULH", argLength: 2, reg: gp21, asm: "UMULH", commutative: true}, // (arg0 * arg1) >> 64, unsigned
		{name: "MULL", argLength: 2, reg: gp21, asm: "SMULL", commutative: true}, // arg0 * arg1, signed, 32-bit mult results in 64-bit
		{name: "UMULL", argLength: 2, reg: gp21, asm: "UMULL", commutative: true}, // arg0 * arg1, unsigned, 32-bit mult results in 64-bit
		{name: "DIV", argLength: 2, reg: gp21, asm: "SDIV"}, // arg0 / arg1, signed
		{name: "UDIV", argLength: 2, reg: gp21, asm: "UDIV"}, // arg0 / arg1, unsigned
		{name: "DIVW", argLength: 2, reg: gp21, asm: "SDIVW"}, // arg0 / arg1, signed, 32 bit
		{name: "UDIVW", argLength: 2, reg: gp21, asm: "UDIVW"}, // arg0 / arg1, unsigned, 32 bit
		{name: "MOD", argLength: 2, reg: gp21, asm: "REM"}, // arg0 % arg1, signed
		{name: "UMOD", argLength: 2, reg: gp21, asm: "UREM"}, // arg0 % arg1, unsigned
		{name: "MODW", argLength: 2, reg: gp21, asm: "REMW"}, // arg0 % arg1, signed, 32 bit
		{name: "UMODW", argLength: 2, reg: gp21, asm: "UREMW"}, // arg0 % arg1, unsigned, 32 bit

		{name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1
		{name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1
		{name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1
		{name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1
		{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1
		{name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1
		{name: "FNMULS", argLength: 2, reg: fp21, asm: "FNMULS", commutative: true}, // -(arg0 * arg1)
		{name: "FNMULD", argLength: 2, reg: fp21, asm: "FNMULD", commutative: true}, // -(arg0 * arg1)
		{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1
		{name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1

		{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
		{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
		{name: "OR", argLength: 2, reg: gp21, asm: "ORR", commutative: true}, // arg0 | arg1
		{name: "ORconst", argLength: 1, reg: gp11, asm: "ORR", aux: "Int64"}, // arg0 | auxInt
		{name: "XOR", argLength: 2, reg: gp21, asm: "EOR", commutative: true}, // arg0 ^ arg1
		{name: "XORconst", argLength: 1, reg: gp11, asm: "EOR", aux: "Int64"}, // arg0 ^ auxInt
		{name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1
		{name: "EON", argLength: 2, reg: gp21, asm: "EON"}, // arg0 ^ ^arg1
		{name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0 | ^arg1

		{name: "LoweredMuluhilo", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, returns (hi, lo)

		// unary ops
		{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
		{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
		{name: "NEGSflags", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "NEGS"}, // -arg0, set flags.
		{name: "NGCzerocarry", argLength: 1, reg: gp0flags1, typ: "UInt64", asm: "NGC"}, // -1 if borrowing, 0 otherwise.
		{name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD"}, // abs(arg0), float64
		{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32
		{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64
		{name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
		{name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"}, // sqrt(arg0), float32
		{name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit
		{name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit
		{name: "REV16", argLength: 1, reg: gp11, asm: "REV16"}, // byte reverse in each 16-bit halfword, 64-bit
		{name: "REV16W", argLength: 1, reg: gp11, asm: "REV16W"}, // byte reverse in each 16-bit halfword, 32-bit
		{name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // bit reverse, 64-bit
		{name: "RBITW", argLength: 1, reg: gp11, asm: "RBITW"}, // bit reverse, 32-bit
		{name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero, 64-bit
		{name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zero, 32-bit
		{name: "VCNT", argLength: 1, reg: fp11, asm: "VCNT"}, // count set bits for each 8-bit unit and store the result in each 8-bit unit
		{name: "VUADDLV", argLength: 1, reg: fp11, asm: "VUADDLV"}, // unsigned sum of eight bytes in a 64-bit value, zero extended to 64-bit.
		{name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},
		{name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true, zeroWidth: true},

		// 3-operand, the addend comes first
		{name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // +arg0 + (arg1 * arg2)
		{name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD"}, // +arg0 + (arg1 * arg2)
		{name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS"}, // -arg0 - (arg1 * arg2)
		{name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD"}, // -arg0 - (arg1 * arg2)
		{name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // +arg0 - (arg1 * arg2)
		{name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD"}, // +arg0 - (arg1 * arg2)
		{name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS"}, // -arg0 + (arg1 * arg2)
		{name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD"}, // -arg0 + (arg1 * arg2)
		{name: "MADD", argLength: 3, reg: gp31, asm: "MADD"}, // +arg0 + (arg1 * arg2)
		{name: "MADDW", argLength: 3, reg: gp31, asm: "MADDW"}, // +arg0 + (arg1 * arg2), 32-bit
		{name: "MSUB", argLength: 3, reg: gp31, asm: "MSUB"}, // +arg0 - (arg1 * arg2)
		{name: "MSUBW", argLength: 3, reg: gp31, asm: "MSUBW"}, // +arg0 - (arg1 * arg2), 32-bit

		// shifts
{ name : "SLL" , argLength : 2 , reg : gp21 , asm : "LSL" } , // arg0 << arg1, shift amount is mod 64
2021-04-19 10:40:20 +08:00
{ name : "SLLconst" , argLength : 1 , reg : gp11 , asm : "LSL" , aux : "Int64" } , // arg0 << auxInt, auxInt should be in the range 0 to 63.
{ name : "SRL" , argLength : 2 , reg : gp21 , asm : "LSR" } , // arg0 >> arg1, unsigned, shift amount is mod 64
2021-04-19 10:40:20 +08:00
{ name : "SRLconst" , argLength : 1 , reg : gp11 , asm : "LSR" , aux : "Int64" } , // arg0 >> auxInt, unsigned, auxInt should be in the range 0 to 63.
{ name : "SRA" , argLength : 2 , reg : gp21 , asm : "ASR" } , // arg0 >> arg1, signed, shift amount is mod 64
2021-04-19 10:40:20 +08:00
{ name : "SRAconst" , argLength : 1 , reg : gp11 , asm : "ASR" , aux : "Int64" } , // arg0 >> auxInt, signed, auxInt should be in the range 0 to 63.
2018-06-30 06:48:51 +00:00
{ name : "ROR" , argLength : 2 , reg : gp21 , asm : "ROR" } , // arg0 right rotate by (arg1 mod 64) bits
{ name : "RORW" , argLength : 2 , reg : gp21 , asm : "RORW" } , // arg0 right rotate by (arg1 mod 32) bits
2021-04-19 10:40:20 +08:00
{ name : "RORconst" , argLength : 1 , reg : gp11 , asm : "ROR" , aux : "Int64" } , // arg0 right rotate by auxInt bits, auxInt should be in the range 0 to 63.
{ name : "RORWconst" , argLength : 1 , reg : gp11 , asm : "RORW" , aux : "Int64" } , // uint32(arg0) right rotate by auxInt bits, auxInt should be in the range 0 to 31.
{ name : "EXTRconst" , argLength : 2 , reg : gp21 , asm : "EXTR" , aux : "Int64" } , // extract 64 bits from arg0:arg1 starting at lsb auxInt, auxInt should be in the range 0 to 63.
{ name : "EXTRWconst" , argLength : 2 , reg : gp21 , asm : "EXTRW" , aux : "Int64" } , // extract 32 bits from arg0[31:0]:arg1[31:0] starting at lsb auxInt and zero top 32 bits, auxInt should be in the range 0 to 31.
		// comparisons
		{name: "CMP", argLength: 2, reg: gp2flags, asm: "CMP", typ: "Flags"}, // arg0 compare to arg1
		{name: "CMPconst", argLength: 1, reg: gp1flags, asm: "CMP", aux: "Int64", typ: "Flags"}, // arg0 compare to auxInt
		{name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1, 32 bit
		{name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", aux: "Int32", typ: "Flags"}, // arg0 compare to auxInt, 32 bit
		{name: "CMN", argLength: 2, reg: gp2flags, asm: "CMN", typ: "Flags", commutative: true}, // arg0 compare to -arg1
		{name: "CMNconst", argLength: 1, reg: gp1flags, asm: "CMN", aux: "Int64", typ: "Flags"}, // arg0 compare to -auxInt
		{name: "CMNW", argLength: 2, reg: gp2flags, asm: "CMNW", typ: "Flags", commutative: true}, // arg0 compare to -arg1, 32 bit
		{name: "CMNWconst", argLength: 1, reg: gp1flags, asm: "CMNW", aux: "Int32", typ: "Flags"}, // arg0 compare to -auxInt, 32 bit
		{name: "TST", argLength: 2, reg: gp2flags, asm: "TST", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0
		{name: "TSTconst", argLength: 1, reg: gp1flags, asm: "TST", aux: "Int64", typ: "Flags"}, // arg0 & auxInt compare to 0
		{name: "TSTW", argLength: 2, reg: gp2flags, asm: "TSTW", typ: "Flags", commutative: true}, // arg0 & arg1 compare to 0, 32 bit
		{name: "TSTWconst", argLength: 1, reg: gp1flags, asm: "TSTW", aux: "Int32", typ: "Flags"}, // arg0 & auxInt compare to 0, 32 bit
		{name: "FCMPS", argLength: 2, reg: fp2flags, asm: "FCMPS", typ: "Flags"}, // arg0 compare to arg1, float32
		{name: "FCMPD", argLength: 2, reg: fp2flags, asm: "FCMPD", typ: "Flags"}, // arg0 compare to arg1, float64
{ name : "FCMPS0" , argLength : 1 , reg : fp1flags , asm : "FCMPS" , typ : "Flags" } , // arg0 compare to 0, float32
{ name : "FCMPD0" , argLength : 1 , reg : fp1flags , asm : "FCMPD" , typ : "Flags" } , // arg0 compare to 0, float64
2016-07-21 12:42:49 -04:00
2016-08-10 13:24:03 -04:00
// shifted ops
2021-04-19 10:40:20 +08:00
{ name : "MVNshiftLL" , argLength : 1 , reg : gp11 , asm : "MVN" , aux : "Int64" } , // ^(arg0<<auxInt), auxInt should be in the range 0 to 63.
{ name : "MVNshiftRL" , argLength : 1 , reg : gp11 , asm : "MVN" , aux : "Int64" } , // ^(arg0>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
{ name : "MVNshiftRA" , argLength : 1 , reg : gp11 , asm : "MVN" , aux : "Int64" } , // ^(arg0>>auxInt), signed shift, auxInt should be in the range 0 to 63.
{ name : "NEGshiftLL" , argLength : 1 , reg : gp11 , asm : "NEG" , aux : "Int64" } , // -(arg0<<auxInt), auxInt should be in the range 0 to 63.
{ name : "NEGshiftRL" , argLength : 1 , reg : gp11 , asm : "NEG" , aux : "Int64" } , // -(arg0>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
{ name : "NEGshiftRA" , argLength : 1 , reg : gp11 , asm : "NEG" , aux : "Int64" } , // -(arg0>>auxInt), signed shift, auxInt should be in the range 0 to 63.
{ name : "ADDshiftLL" , argLength : 2 , reg : gp21 , asm : "ADD" , aux : "Int64" } , // arg0 + arg1<<auxInt, auxInt should be in the range 0 to 63.
{ name : "ADDshiftRL" , argLength : 2 , reg : gp21 , asm : "ADD" , aux : "Int64" } , // arg0 + arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
{ name : "ADDshiftRA" , argLength : 2 , reg : gp21 , asm : "ADD" , aux : "Int64" } , // arg0 + arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
{ name : "SUBshiftLL" , argLength : 2 , reg : gp21 , asm : "SUB" , aux : "Int64" } , // arg0 - arg1<<auxInt, auxInt should be in the range 0 to 63.
{ name : "SUBshiftRL" , argLength : 2 , reg : gp21 , asm : "SUB" , aux : "Int64" } , // arg0 - arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
{ name : "SUBshiftRA" , argLength : 2 , reg : gp21 , asm : "SUB" , aux : "Int64" } , // arg0 - arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
{ name : "ANDshiftLL" , argLength : 2 , reg : gp21 , asm : "AND" , aux : "Int64" } , // arg0 & (arg1<<auxInt), auxInt should be in the range 0 to 63.
{ name : "ANDshiftRL" , argLength : 2 , reg : gp21 , asm : "AND" , aux : "Int64" } , // arg0 & (arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
{ name : "ANDshiftRA" , argLength : 2 , reg : gp21 , asm : "AND" , aux : "Int64" } , // arg0 & (arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
{ name : "ORshiftLL" , argLength : 2 , reg : gp21 , asm : "ORR" , aux : "Int64" } , // arg0 | arg1<<auxInt, auxInt should be in the range 0 to 63.
{ name : "ORshiftRL" , argLength : 2 , reg : gp21 , asm : "ORR" , aux : "Int64" } , // arg0 | arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
{ name : "ORshiftRA" , argLength : 2 , reg : gp21 , asm : "ORR" , aux : "Int64" } , // arg0 | arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
{ name : "XORshiftLL" , argLength : 2 , reg : gp21 , asm : "EOR" , aux : "Int64" } , // arg0 ^ arg1<<auxInt, auxInt should be in the range 0 to 63.
{ name : "XORshiftRL" , argLength : 2 , reg : gp21 , asm : "EOR" , aux : "Int64" } , // arg0 ^ arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
{ name : "XORshiftRA" , argLength : 2 , reg : gp21 , asm : "EOR" , aux : "Int64" } , // arg0 ^ arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
{ name : "BICshiftLL" , argLength : 2 , reg : gp21 , asm : "BIC" , aux : "Int64" } , // arg0 &^ (arg1<<auxInt), auxInt should be in the range 0 to 63.
{ name : "BICshiftRL" , argLength : 2 , reg : gp21 , asm : "BIC" , aux : "Int64" } , // arg0 &^ (arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
{ name : "BICshiftRA" , argLength : 2 , reg : gp21 , asm : "BIC" , aux : "Int64" } , // arg0 &^ (arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
{ name : "EONshiftLL" , argLength : 2 , reg : gp21 , asm : "EON" , aux : "Int64" } , // arg0 ^ ^(arg1<<auxInt), auxInt should be in the range 0 to 63.
{ name : "EONshiftRL" , argLength : 2 , reg : gp21 , asm : "EON" , aux : "Int64" } , // arg0 ^ ^(arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
{ name : "EONshiftRA" , argLength : 2 , reg : gp21 , asm : "EON" , aux : "Int64" } , // arg0 ^ ^(arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
{ name : "ORNshiftLL" , argLength : 2 , reg : gp21 , asm : "ORN" , aux : "Int64" } , // arg0 | ^(arg1<<auxInt), auxInt should be in the range 0 to 63.
{ name : "ORNshiftRL" , argLength : 2 , reg : gp21 , asm : "ORN" , aux : "Int64" } , // arg0 | ^(arg1>>auxInt), unsigned shift, auxInt should be in the range 0 to 63.
{ name : "ORNshiftRA" , argLength : 2 , reg : gp21 , asm : "ORN" , aux : "Int64" } , // arg0 | ^(arg1>>auxInt), signed shift, auxInt should be in the range 0 to 63.
{ name : "CMPshiftLL" , argLength : 2 , reg : gp2flags , asm : "CMP" , aux : "Int64" , typ : "Flags" } , // arg0 compare to arg1<<auxInt, auxInt should be in the range 0 to 63.
{ name : "CMPshiftRL" , argLength : 2 , reg : gp2flags , asm : "CMP" , aux : "Int64" , typ : "Flags" } , // arg0 compare to arg1>>auxInt, unsigned shift, auxInt should be in the range 0 to 63.
{ name : "CMPshiftRA" , argLength : 2 , reg : gp2flags , asm : "CMP" , aux : "Int64" , typ : "Flags" } , // arg0 compare to arg1>>auxInt, signed shift, auxInt should be in the range 0 to 63.
{ name : "CMNshiftLL" , argLength : 2 , reg : gp2flags , asm : "CMN" , aux : "Int64" , typ : "Flags" } , // (arg0 + arg1<<auxInt) compare to 0, auxInt should be in the range 0 to 63.
{ name : "CMNshiftRL" , argLength : 2 , reg : gp2flags , asm : "CMN" , aux : "Int64" , typ : "Flags" } , // (arg0 + arg1>>auxInt) compare to 0, unsigned shift, auxInt should be in the range 0 to 63.
{ name : "CMNshiftRA" , argLength : 2 , reg : gp2flags , asm : "CMN" , aux : "Int64" , typ : "Flags" } , // (arg0 + arg1>>auxInt) compare to 0, signed shift, auxInt should be in the range 0 to 63.
{ name : "TSTshiftLL" , argLength : 2 , reg : gp2flags , asm : "TST" , aux : "Int64" , typ : "Flags" } , // (arg0 & arg1<<auxInt) compare to 0, auxInt should be in the range 0 to 63.
{ name : "TSTshiftRL" , argLength : 2 , reg : gp2flags , asm : "TST" , aux : "Int64" , typ : "Flags" } , // (arg0 & arg1>>auxInt) compare to 0, unsigned shift, auxInt should be in the range 0 to 63.
{ name : "TSTshiftRA" , argLength : 2 , reg : gp2flags , asm : "TST" , aux : "Int64" , typ : "Flags" } , // (arg0 & arg1>>auxInt) compare to 0, signed shift, auxInt should be in the range 0 to 63.

		// bitfield ops
		// for all bitfield ops lsb is auxInt>>8, width is auxInt&0xff
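		// Illustrative example: auxInt = lsb<<8 | width, so an auxInt of 0x0308 encodes lsb=3, width=8.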
		// insert low width bits of arg1 into the result starting at bit lsb, copy other bits from arg0
		{name: "BFI", argLength: 2, reg: gp21nog, asm: "BFI", aux: "ARM64BitField", resultInArg0: true},
		// extract width bits of arg1 starting at bit lsb and insert at low end of result, copy other bits from arg0
		{name: "BFXIL", argLength: 2, reg: gp21nog, asm: "BFXIL", aux: "ARM64BitField", resultInArg0: true},
		// insert low width bits of arg0 into the result starting at bit lsb, bits to the left of the inserted bit field are set to the high/sign bit of the inserted bit field, bits to the right are zeroed
		{name: "SBFIZ", argLength: 1, reg: gp11, asm: "SBFIZ", aux: "ARM64BitField"},
		// extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are set to the high/sign bit of the extracted bitfield
		{name: "SBFX", argLength: 1, reg: gp11, asm: "SBFX", aux: "ARM64BitField"},
		// insert low width bits of arg0 into the result starting at bit lsb, bits to the left and right of the inserted bit field are zeroed
		{name: "UBFIZ", argLength: 1, reg: gp11, asm: "UBFIZ", aux: "ARM64BitField"},
		// extract width bits of arg0 starting at bit lsb and insert at low end of result, remaining high bits are zeroed
		{name: "UBFX", argLength: 1, reg: gp11, asm: "UBFX", aux: "ARM64BitField"},
2016-07-21 12:42:49 -04:00
// moves
2020-04-30 12:41:28 +02:00
{ name : "MOVDconst" , argLength : 0 , reg : gp01 , aux : "Int64" , asm : "MOVD" , typ : "UInt64" , rematerializeable : true } , // 64 bits from auxint
2016-07-21 12:42:49 -04:00
{ name : "FMOVSconst" , argLength : 0 , reg : fp01 , aux : "Float64" , asm : "FMOVS" , typ : "Float32" , rematerializeable : true } , // auxint as 64-bit float, convert to 32-bit float
{ name : "FMOVDconst" , argLength : 0 , reg : fp01 , aux : "Float64" , asm : "FMOVD" , typ : "Float64" , rematerializeable : true } , // auxint as 64-bit float
2017-03-09 14:46:43 -08:00
{ name : "MOVDaddr" , argLength : 1 , reg : regInfo { inputs : [ ] regMask { buildReg ( "SP" ) | buildReg ( "SB" ) } , outputs : [ ] regMask { gp } } , aux : "SymOff" , asm : "MOVD" , rematerializeable : true , symEffect : "Addr" } , // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
2016-07-21 12:42:49 -04:00
2017-03-09 14:46:43 -08:00
{ name : "MOVBload" , argLength : 2 , reg : gpload , aux : "SymOff" , asm : "MOVB" , typ : "Int8" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "MOVBUload" , argLength : 2 , reg : gpload , aux : "SymOff" , asm : "MOVBU" , typ : "UInt8" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "MOVHload" , argLength : 2 , reg : gpload , aux : "SymOff" , asm : "MOVH" , typ : "Int16" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "MOVHUload" , argLength : 2 , reg : gpload , aux : "SymOff" , asm : "MOVHU" , typ : "UInt16" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "MOVWload" , argLength : 2 , reg : gpload , aux : "SymOff" , asm : "MOVW" , typ : "Int32" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "MOVWUload" , argLength : 2 , reg : gpload , aux : "SymOff" , asm : "MOVWU" , typ : "UInt32" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "MOVDload" , argLength : 2 , reg : gpload , aux : "SymOff" , asm : "MOVD" , typ : "UInt64" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "FMOVSload" , argLength : 2 , reg : fpload , aux : "SymOff" , asm : "FMOVS" , typ : "Float32" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
{ name : "FMOVDload" , argLength : 2 , reg : fpload , aux : "SymOff" , asm : "FMOVD" , typ : "Float64" , faultOnNilArg0 : true , symEffect : "Read" } , // load from arg0 + auxInt + aux. arg1=mem.
2016-09-13 17:01:01 -07:00
2018-04-16 14:04:26 +00:00
// register indexed load
2018-07-18 09:31:35 +00:00
{ name : "MOVDloadidx" , argLength : 3 , reg : gp2load , asm : "MOVD" , typ : "UInt64" } , // load 64-bit dword from arg0 + arg1, arg2 = mem.
{ name : "MOVWloadidx" , argLength : 3 , reg : gp2load , asm : "MOVW" , typ : "Int32" } , // load 32-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
{ name : "MOVWUloadidx" , argLength : 3 , reg : gp2load , asm : "MOVWU" , typ : "UInt32" } , // load 32-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
{ name : "MOVHloadidx" , argLength : 3 , reg : gp2load , asm : "MOVH" , typ : "Int16" } , // load 16-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
{ name : "MOVHUloadidx" , argLength : 3 , reg : gp2load , asm : "MOVHU" , typ : "UInt16" } , // load 16-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
{ name : "MOVBloadidx" , argLength : 3 , reg : gp2load , asm : "MOVB" , typ : "Int8" } , // load 8-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
{ name : "MOVBUloadidx" , argLength : 3 , reg : gp2load , asm : "MOVBU" , typ : "UInt8" } , // load 8-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
{ name : "FMOVSloadidx" , argLength : 3 , reg : fp2load , asm : "FMOVS" , typ : "Float32" } , // load 32-bit float from arg0 + arg1, arg2=mem.
{ name : "FMOVDloadidx" , argLength : 3 , reg : fp2load , asm : "FMOVD" , typ : "Float64" } , // load 64-bit float from arg0 + arg1, arg2=mem.
2018-04-22 00:51:00 +00:00
// shifted register indexed load
2020-11-27 17:10:33 +02:00
{ name : "MOVHloadidx2" , argLength : 3 , reg : gp2load , asm : "MOVH" , typ : "Int16" } , // load 16-bit half-word from arg0 + arg1*2, sign-extended to 64-bit, arg2=mem.
{ name : "MOVHUloadidx2" , argLength : 3 , reg : gp2load , asm : "MOVHU" , typ : "UInt16" } , // load 16-bit half-word from arg0 + arg1*2, zero-extended to 64-bit, arg2=mem.
{ name : "MOVWloadidx4" , argLength : 3 , reg : gp2load , asm : "MOVW" , typ : "Int32" } , // load 32-bit word from arg0 + arg1*4, sign-extended to 64-bit, arg2=mem.
{ name : "MOVWUloadidx4" , argLength : 3 , reg : gp2load , asm : "MOVWU" , typ : "UInt32" } , // load 32-bit word from arg0 + arg1*4, zero-extended to 64-bit, arg2=mem.
{ name : "MOVDloadidx8" , argLength : 3 , reg : gp2load , asm : "MOVD" , typ : "UInt64" } , // load 64-bit double-word from arg0 + arg1*8, arg2 = mem.
{ name : "FMOVSloadidx4" , argLength : 3 , reg : fp2load , asm : "FMOVS" , typ : "Float32" } , // load 32-bit float from arg0 + arg1*4, arg2 = mem.
{ name : "FMOVDloadidx8" , argLength : 3 , reg : fp2load , asm : "FMOVD" , typ : "Float64" } , // load 64-bit float from arg0 + arg1*8, arg2 = mem.
2018-04-16 14:04:26 +00:00
2017-03-09 14:46:43 -08:00
{ name : "MOVBstore" , argLength : 3 , reg : gpstore , aux : "SymOff" , asm : "MOVB" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
{ name : "MOVHstore" , argLength : 3 , reg : gpstore , aux : "SymOff" , asm : "MOVH" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{ name : "MOVWstore" , argLength : 3 , reg : gpstore , aux : "SymOff" , asm : "MOVW" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{ name : "MOVDstore" , argLength : 3 , reg : gpstore , aux : "SymOff" , asm : "MOVD" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
2017-07-27 01:55:03 +00:00
{ name : "STP" , argLength : 4 , reg : gpstore2 , aux : "SymOff" , asm : "STP" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 16 bytes of arg1 and arg2 to arg0 + auxInt + aux. arg3=mem.
2017-03-09 14:46:43 -08:00
{ name : "FMOVSstore" , argLength : 3 , reg : fpstore , aux : "SymOff" , asm : "FMOVS" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{ name : "FMOVDstore" , argLength : 3 , reg : fpstore , aux : "SymOff" , asm : "FMOVD" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
2016-09-13 17:01:01 -07:00
2018-04-16 14:04:26 +00:00
// register indexed store
2018-07-18 09:31:35 +00:00
{ name : "MOVBstoreidx" , argLength : 4 , reg : gpstore2 , asm : "MOVB" , typ : "Mem" } , // store 1 byte of arg2 to arg0 + arg1, arg3 = mem.
{ name : "MOVHstoreidx" , argLength : 4 , reg : gpstore2 , asm : "MOVH" , typ : "Mem" } , // store 2 bytes of arg2 to arg0 + arg1, arg3 = mem.
{ name : "MOVWstoreidx" , argLength : 4 , reg : gpstore2 , asm : "MOVW" , typ : "Mem" } , // store 4 bytes of arg2 to arg0 + arg1, arg3 = mem.
{ name : "MOVDstoreidx" , argLength : 4 , reg : gpstore2 , asm : "MOVD" , typ : "Mem" } , // store 8 bytes of arg2 to arg0 + arg1, arg3 = mem.
{ name : "FMOVSstoreidx" , argLength : 4 , reg : fpstore2 , asm : "FMOVS" , typ : "Mem" } , // store 32-bit float of arg2 to arg0 + arg1, arg3=mem.
{ name : "FMOVDstoreidx" , argLength : 4 , reg : fpstore2 , asm : "FMOVD" , typ : "Mem" } , // store 64-bit float of arg2 to arg0 + arg1, arg3=mem.
2018-04-22 00:51:00 +00:00
// shifted register indexed store
2020-11-27 17:10:33 +02:00
{ name : "MOVHstoreidx2" , argLength : 4 , reg : gpstore2 , asm : "MOVH" , typ : "Mem" } , // store 2 bytes of arg2 to arg0 + arg1*2, arg3 = mem.
{ name : "MOVWstoreidx4" , argLength : 4 , reg : gpstore2 , asm : "MOVW" , typ : "Mem" } , // store 4 bytes of arg2 to arg0 + arg1*4, arg3 = mem.
{ name : "MOVDstoreidx8" , argLength : 4 , reg : gpstore2 , asm : "MOVD" , typ : "Mem" } , // store 8 bytes of arg2 to arg0 + arg1*8, arg3 = mem.
{ name : "FMOVSstoreidx4" , argLength : 4 , reg : fpstore2 , asm : "FMOVS" , typ : "Mem" } , // store 32-bit float of arg2 to arg0 + arg1*4, arg3=mem.
{ name : "FMOVDstoreidx8" , argLength : 4 , reg : fpstore2 , asm : "FMOVD" , typ : "Mem" } , // store 64-bit float of arg2 to arg0 + arg1*8, arg3=mem.
2018-04-16 14:04:26 +00:00
2017-03-09 14:46:43 -08:00
{ name : "MOVBstorezero" , argLength : 2 , reg : gpstore0 , aux : "SymOff" , asm : "MOVB" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
{ name : "MOVHstorezero" , argLength : 2 , reg : gpstore0 , aux : "SymOff" , asm : "MOVH" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{ name : "MOVWstorezero" , argLength : 2 , reg : gpstore0 , aux : "SymOff" , asm : "MOVW" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
2017-07-27 01:55:03 +00:00
{ name : "MOVDstorezero" , argLength : 2 , reg : gpstore0 , aux : "SymOff" , asm : "MOVD" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{ name : "MOVQstorezero" , argLength : 2 , reg : gpstore0 , aux : "SymOff" , asm : "STP" , typ : "Mem" , faultOnNilArg0 : true , symEffect : "Write" } , // store 16 bytes of zero to arg0 + auxInt + aux. arg1=mem.
2016-08-10 13:24:03 -04:00
2018-04-16 14:04:26 +00:00
// register indexed store zero
2018-04-22 00:51:00 +00:00
{ name : "MOVBstorezeroidx" , argLength : 3 , reg : gpstore , asm : "MOVB" , typ : "Mem" } , // store 1 byte of zero to arg0 + arg1, arg2 = mem.
{ name : "MOVHstorezeroidx" , argLength : 3 , reg : gpstore , asm : "MOVH" , typ : "Mem" } , // store 2 bytes of zero to arg0 + arg1, arg2 = mem.
{ name : "MOVWstorezeroidx" , argLength : 3 , reg : gpstore , asm : "MOVW" , typ : "Mem" } , // store 4 bytes of zero to arg0 + arg1, arg2 = mem.
{ name : "MOVDstorezeroidx" , argLength : 3 , reg : gpstore , asm : "MOVD" , typ : "Mem" } , // store 8 bytes of zero to arg0 + arg1, arg2 = mem.
// shifted register indexed store zero
{ name : "MOVHstorezeroidx2" , argLength : 3 , reg : gpstore , asm : "MOVH" , typ : "Mem" } , // store 2 bytes of zero to arg0 + arg1*2, arg2 = mem.
{ name : "MOVWstorezeroidx4" , argLength : 3 , reg : gpstore , asm : "MOVW" , typ : "Mem" } , // store 4 bytes of zero to arg0 + arg1*4, arg2 = mem.
{ name : "MOVDstorezeroidx8" , argLength : 3 , reg : gpstore , asm : "MOVD" , typ : "Mem" } , // store 8 bytes of zero to arg0 + arg1*8, arg2 = mem.
2018-04-16 14:04:26 +00:00
2018-01-30 12:16:52 -05:00
{ name : "FMOVDgpfp" , argLength : 1 , reg : gpfp , asm : "FMOVD" } , // move int64 to float64 (no conversion)
{ name : "FMOVDfpgp" , argLength : 1 , reg : fpgp , asm : "FMOVD" } , // move float64 to int64 (no conversion)
2018-07-16 04:45:25 +00:00
{ name : "FMOVSgpfp" , argLength : 1 , reg : gpfp , asm : "FMOVS" } , // move 32bits from int to float reg (no conversion)
{ name : "FMOVSfpgp" , argLength : 1 , reg : fpgp , asm : "FMOVS" } , // move 32bits from float to int reg, zero extend (no conversion)
2018-01-30 12:16:52 -05:00
2016-07-21 12:42:49 -04:00
// conversions
{ name : "MOVBreg" , argLength : 1 , reg : gp11 , asm : "MOVB" } , // move from arg0, sign-extended from byte
{ name : "MOVBUreg" , argLength : 1 , reg : gp11 , asm : "MOVBU" } , // move from arg0, unsign-extended from byte
{ name : "MOVHreg" , argLength : 1 , reg : gp11 , asm : "MOVH" } , // move from arg0, sign-extended from half
{ name : "MOVHUreg" , argLength : 1 , reg : gp11 , asm : "MOVHU" } , // move from arg0, unsign-extended from half
{ name : "MOVWreg" , argLength : 1 , reg : gp11 , asm : "MOVW" } , // move from arg0, sign-extended from word
{ name : "MOVWUreg" , argLength : 1 , reg : gp11 , asm : "MOVWU" } , // move from arg0, unsign-extended from word
{ name : "MOVDreg" , argLength : 1 , reg : gp11 , asm : "MOVD" } , // move from arg0
2016-08-03 09:56:36 -04:00
{ name : "MOVDnop" , argLength : 1 , reg : regInfo { inputs : [ ] regMask { gp } , outputs : [ ] regMask { gp } } , resultInArg0 : true } , // nop, return arg0 in same register
2016-07-21 12:42:49 -04:00
{ name : "SCVTFWS" , argLength : 1 , reg : gpfp , asm : "SCVTFWS" } , // int32 -> float32
{ name : "SCVTFWD" , argLength : 1 , reg : gpfp , asm : "SCVTFWD" } , // int32 -> float64
{ name : "UCVTFWS" , argLength : 1 , reg : gpfp , asm : "UCVTFWS" } , // uint32 -> float32
{ name : "UCVTFWD" , argLength : 1 , reg : gpfp , asm : "UCVTFWD" } , // uint32 -> float64
{ name : "SCVTFS" , argLength : 1 , reg : gpfp , asm : "SCVTFS" } , // int64 -> float32
{ name : "SCVTFD" , argLength : 1 , reg : gpfp , asm : "SCVTFD" } , // int64 -> float64
{ name : "UCVTFS" , argLength : 1 , reg : gpfp , asm : "UCVTFS" } , // uint64 -> float32
{ name : "UCVTFD" , argLength : 1 , reg : gpfp , asm : "UCVTFD" } , // uint64 -> float64
{ name : "FCVTZSSW" , argLength : 1 , reg : fpgp , asm : "FCVTZSSW" } , // float32 -> int32
{ name : "FCVTZSDW" , argLength : 1 , reg : fpgp , asm : "FCVTZSDW" } , // float64 -> int32
{ name : "FCVTZUSW" , argLength : 1 , reg : fpgp , asm : "FCVTZUSW" } , // float32 -> uint32
{ name : "FCVTZUDW" , argLength : 1 , reg : fpgp , asm : "FCVTZUDW" } , // float64 -> uint32
{ name : "FCVTZSS" , argLength : 1 , reg : fpgp , asm : "FCVTZSS" } , // float32 -> int64
{ name : "FCVTZSD" , argLength : 1 , reg : fpgp , asm : "FCVTZSD" } , // float64 -> int64
{ name : "FCVTZUS" , argLength : 1 , reg : fpgp , asm : "FCVTZUS" } , // float32 -> uint64
{ name : "FCVTZUD" , argLength : 1 , reg : fpgp , asm : "FCVTZUD" } , // float64 -> uint64
{ name : "FCVTSD" , argLength : 1 , reg : fp11 , asm : "FCVTSD" } , // float32 -> float64
{ name : "FCVTDS" , argLength : 1 , reg : fp11 , asm : "FCVTDS" } , // float64 -> float32
2018-02-16 09:22:32 -05:00
// floating-point round to integral
{ name : "FRINTAD" , argLength : 1 , reg : fp11 , asm : "FRINTAD" } ,
{ name : "FRINTMD" , argLength : 1 , reg : fp11 , asm : "FRINTMD" } ,
2018-05-22 06:58:32 +00:00
{ name : "FRINTND" , argLength : 1 , reg : fp11 , asm : "FRINTND" } ,
2018-02-16 09:22:32 -05:00
{ name : "FRINTPD" , argLength : 1 , reg : fp11 , asm : "FRINTPD" } ,
{ name : "FRINTZD" , argLength : 1 , reg : fp11 , asm : "FRINTZD" } ,
2017-08-13 22:36:47 +00:00
// conditional instructions; auxint is
// one of the arm64 comparison pseudo-ops (LessThan, LessThanU, etc.)
cmd/compile: add rewrite rules for conditional instructions on arm64
This CL adds rewrite rules for CSETM, CSINC, CSINV, and CSNEG. By adding
these rules, we can save one instruction.
For example,
func test(cond bool, a int) int {
if cond {
a++
}
return a
}
Before:
MOVD "".a+8(RSP), R0
ADD $1, R0, R1
MOVBU "".cond(RSP), R2
CMPW $0, R2
CSEL NE, R1, R0, R0
After:
MOVBU "".cond(RSP), R0
CMPW $0, R0
MOVD "".a+8(RSP), R0
CSINC EQ, R0, R0, R0
This patch is a copy of CL 285694.
Co-authored-by: JunchenLi <junchen.li@arm.com>
Change-Id: Ic1a79e8b8ece409b533becfcb7950f11e7b76f24
Reviewed-on: https://go-review.googlesource.com/c/go/+/302231
Trust: fannie zhang <Fannie.Zhang@arm.com>
Run-TryBot: fannie zhang <Fannie.Zhang@arm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2021-01-18 14:32:49 +08:00
{ name : "CSEL" , argLength : 3 , reg : gp2flags1 , asm : "CSEL" , aux : "CCop" } , // auxint(flags) ? arg0 : arg1
{ name : "CSEL0" , argLength : 2 , reg : gp1flags1 , asm : "CSEL" , aux : "CCop" } , // auxint(flags) ? arg0 : 0
{ name : "CSINC" , argLength : 3 , reg : gp2flags1 , asm : "CSINC" , aux : "CCop" } , // auxint(flags) ? arg0 : arg1 + 1
{ name : "CSINV" , argLength : 3 , reg : gp2flags1 , asm : "CSINV" , aux : "CCop" } , // auxint(flags) ? arg0 : ^arg1
{ name : "CSNEG" , argLength : 3 , reg : gp2flags1 , asm : "CSNEG" , aux : "CCop" } , // auxint(flags) ? arg0 : -arg1
{ name : "CSETM" , argLength : 1 , reg : readflags , asm : "CSETM" , aux : "CCop" } , // auxint(flags) ? -1 : 0
2016-07-22 06:41:14 -04:00
2016-07-21 12:42:49 -04:00
// function calls
2021-05-28 22:23:00 -04:00
{ name : "CALLstatic" , argLength : - 1 , reg : regInfo { clobbers : callerSave } , aux : "CallOff" , clobberFlags : true , call : true } , // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
{ name : "CALLclosure" , argLength : - 1 , reg : regInfo { inputs : [ ] regMask { gpsp , buildReg ( "R26" ) , 0 } , clobbers : callerSave } , aux : "CallOff" , clobberFlags : true , call : true } , // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem
{ name : "CALLinter" , argLength : - 1 , reg : regInfo { inputs : [ ] regMask { gp } , clobbers : callerSave } , aux : "CallOff" , clobberFlags : true , call : true } , // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem
2016-07-21 12:42:49 -04:00
// pseudo-ops
2016-09-27 14:39:27 -07:00
{ name : "LoweredNilCheck" , argLength : 2 , reg : regInfo { inputs : [ ] regMask { gpg } } , nilCheck : true , faultOnNilArg0 : true } , // panic if arg0 is nil. arg1=mem.
2016-07-21 12:42:49 -04:00
cmd/compile: fix wrong complement for arm64 floating-point comparisons
Consider the following example,
func test(a, b float64, x uint64) uint64 {
if a < b {
x = 0
}
return x
}
func main() {
fmt.Println(test(1, math.NaN(), 123))
}
The output is 0, but the expectation is 123.
This is because the rewrite rule
(CSEL [cc] (MOVDconst [0]) y flag) => (CSEL0 [arm64Negate(cc)] y flag)
converts
FCMP NaN, 1
CSEL MI, 0, 123, R0 // if 1 < NaN then R0 = 0 else R0 = 123
to
FCMP NaN, 1
CSEL GE, 123, 0, R0 // if 1 >= NaN then R0 = 123 else R0 = 0
But both 1 < NaN and 1 >= NaN are false. So the output is 0, not 123.
The root cause is that arm64Negate does not handle negation of floating-point
comparisons correctly. According to the ARM manual, the meanings of MI, GE, and PL
are
MI: Less than
GE: Greater than or equal to
PL: Greater than, equal to, or unordered
Because NaN cannot be compared with other numbers, the result of such a
comparison is unordered. So when NaN is involved, unlike the integer case, the
result of !(a < b) is not a >= b; it is a >= b || a is NaN || b is NaN.
This is exactly what PL means. We add NotLessThanF to represent PL. Then
the negation of LessThanF is NotLessThanF rather than GreaterEqualF. The
same reasoning applies to the other floating-point comparison operations.
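A minimal check of that claim in Go, assuming the math package is imported; the function name is illustrative:
// With b = NaN, both a < b and a >= b are false, so !(a < b) is true while
// a >= b is false: negating LessThanF must not produce GreaterEqualF.
func nanCompare() (notLess, greaterEqual bool) {
	a, b := 1.0, math.NaN()
	return !(a < b), a >= b // true, false
}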
Fixes #43619
Change-Id: Ia511b0027ad067436bace9fbfd261dbeaae01bcd
Reviewed-on: https://go-review.googlesource.com/c/go/+/283572
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Keith Randall <khr@golang.org>
2021-01-08 10:20:34 +08:00
{ name : "Equal" , argLength : 1 , reg : readflags } , // bool, true flags encode x==y false otherwise.
{ name : "NotEqual" , argLength : 1 , reg : readflags } , // bool, true flags encode x!=y false otherwise.
{ name : "LessThan" , argLength : 1 , reg : readflags } , // bool, true flags encode signed x<y false otherwise.
{ name : "LessEqual" , argLength : 1 , reg : readflags } , // bool, true flags encode signed x<=y false otherwise.
{ name : "GreaterThan" , argLength : 1 , reg : readflags } , // bool, true flags encode signed x>y false otherwise.
{ name : "GreaterEqual" , argLength : 1 , reg : readflags } , // bool, true flags encode signed x>=y false otherwise.
{ name : "LessThanU" , argLength : 1 , reg : readflags } , // bool, true flags encode unsigned x<y false otherwise.
{ name : "LessEqualU" , argLength : 1 , reg : readflags } , // bool, true flags encode unsigned x<=y false otherwise.
{ name : "GreaterThanU" , argLength : 1 , reg : readflags } , // bool, true flags encode unsigned x>y false otherwise.
{ name : "GreaterEqualU" , argLength : 1 , reg : readflags } , // bool, true flags encode unsigned x>=y false otherwise.
{ name : "LessThanF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x<y false otherwise.
{ name : "LessEqualF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x<=y false otherwise.
{ name : "GreaterThanF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x>y false otherwise.
{ name : "GreaterEqualF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x>=y false otherwise.
{ name : "NotLessThanF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x>=y || x is unordered with y, false otherwise.
{ name : "NotLessEqualF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x>y || x is unordered with y, false otherwise.
{ name : "NotGreaterThanF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x<=y || x is unordered with y, false otherwise.
{ name : "NotGreaterEqualF" , argLength : 1 , reg : readflags } , // bool, true flags encode floating-point x<y || x is unordered with y, false otherwise.
2016-07-22 06:41:14 -04:00
// duffzero
// arg0 = address of memory to zero
// arg1 = mem
// auxint = offset into duffzero code to start executing
// returns mem
2019-06-25 14:48:04 -04:00
// R20 changed as side effect
2019-06-28 09:30:36 -04:00
// R16 and R17 may be clobbered by linker trampoline.
2016-07-22 06:41:14 -04:00
{
name : "DUFFZERO" ,
aux : "Int64" ,
argLength : 2 ,
reg : regInfo {
2019-06-25 14:48:04 -04:00
inputs : [ ] regMask { buildReg ( "R20" ) } ,
2019-06-28 09:30:36 -04:00
clobbers : buildReg ( "R16 R17 R20 R30" ) ,
2016-07-22 06:41:14 -04:00
} ,
2016-09-13 17:01:01 -07:00
faultOnNilArg0 : true ,
cmd/compile: mark DUFFZERO/DUFFCOPY as async unsafe
These operations are async unsafe on architectures that use
frame pointers.
The reason is they rely on data being safe when stored below the stack
pointer. They do:
45da69: 48 89 6c 24 f0 mov %rbp,-0x10(%rsp)
45da6e: 48 8d 6c 24 f0 lea -0x10(%rsp),%rbp
45da73: e8 7d d0 ff ff callq 45aaf5 <runtime.duffzero+0x115>
45da78: 48 8b 6d 00 mov 0x0(%rbp),%rbp
This dance ensures that inside duffzero, it looks like there is a
proper frame pointer set up, so that stack walkbacks work correctly if
the kernel samples during duffzero.
However, this instruction sequence depends on data not being clobbered
even though it is below the stack pointer.
If there is an async interrupt at any of those last 3 instructions,
and the interrupt decides to insert a call to asyncPreempt, then the
saved frame pointer on the stack gets clobbered. The last instruction
above then restores junk to the frame pointer.
To prevent this, mark these instructions as async unsafe.
(The body of duffzero is already async unsafe, as it is in package runtime.)
Change-Id: I5562e82f9f5bd2fb543dcf2b6b9133d87ff83032
Reviewed-on: https://go-review.googlesource.com/c/go/+/248261
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-08-11 13:07:35 -07:00
unsafePoint : true , // FP maintenance around DUFFZERO can be clobbered by interrupts
2016-07-22 06:41:14 -04:00
} ,
// large zeroing
// arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
2017-07-27 01:55:03 +00:00
// arg1 = address of the last 16-byte unit to zero
2016-07-22 06:41:14 -04:00
// arg2 = mem
// returns mem
2017-07-27 01:55:03 +00:00
// STP.P (ZR,ZR), 16(R16)
2016-07-22 06:41:14 -04:00
// CMP Rarg1, R16
// BLE -2(PC)
// Note: the-end-of-the-memory may not be a valid pointer. It's a problem if it is spilled.
2017-07-27 01:55:03 +00:00
// the-end-of-the-memory - 16 is within the area to zero, ok to spill.
2016-07-22 06:41:14 -04:00
{
name : "LoweredZero" ,
argLength : 3 ,
reg : regInfo {
inputs : [ ] regMask { buildReg ( "R16" ) , gp } ,
2016-08-04 06:57:34 -04:00
clobbers : buildReg ( "R16" ) ,
2016-07-22 06:41:14 -04:00
} ,
2016-09-13 17:01:01 -07:00
clobberFlags : true ,
faultOnNilArg0 : true ,
2016-07-22 06:41:14 -04:00
} ,
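A rough Go rendering of the zeroing loop described in the comments above, assuming the unsafe package is imported; zeroUnits is purely illustrative, the real lowering is the STP.P/CMP/BLE sequence.
// Store 16 zero bytes at p and advance, repeating while p has not passed the
// address of the last 16-byte unit (the role of arg1 above).
func zeroUnits(p, last unsafe.Pointer) {
	for uintptr(p) <= uintptr(last) {
		*(*[16]byte)(p) = [16]byte{}
		p = unsafe.Add(p, 16)
	}
}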
2016-09-27 08:57:02 -04:00
// duffcopy
2019-06-25 14:48:04 -04:00
// arg0 = address of dst memory (in R21, changed as side effect)
// arg1 = address of src memory (in R20, changed as side effect)
2016-09-27 08:57:02 -04:00
// arg2 = mem
// auxint = offset into duffcopy code to start executing
// returns mem
2019-06-25 14:48:04 -04:00
// R20, R21 changed as side effect
2019-06-28 09:30:36 -04:00
// R16 and R17 may be clobbered by linker trampoline.
2016-09-27 08:57:02 -04:00
{
name : "DUFFCOPY" ,
aux : "Int64" ,
argLength : 3 ,
reg : regInfo {
2019-06-25 14:48:04 -04:00
inputs : [ ] regMask { buildReg ( "R21" ) , buildReg ( "R20" ) } ,
2019-06-28 09:30:36 -04:00
clobbers : buildReg ( "R16 R17 R20 R21 R26 R30" ) ,
2016-09-27 08:57:02 -04:00
} ,
faultOnNilArg0 : true ,
faultOnNilArg1 : true ,
2020-08-11 13:07:35 -07:00
unsafePoint : true , // FP maintenance around DUFFCOPY can be clobbered by interrupts
2016-09-27 08:57:02 -04:00
} ,
2016-07-22 06:41:14 -04:00
// large move
// arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
// arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
// arg2 = address of the last element of src
// arg3 = mem
// returns mem
// MOVD.P 8(R16), Rtmp
// MOVD.P Rtmp, 8(R17)
// CMP Rarg2, R16
// BLE -3(PC)
// Note: the-end-of-src may not be a valid pointer. It's a problem if it is spilled.
// the-end-of-src - 8 is within the area to copy, ok to spill.
{
name : "LoweredMove" ,
argLength : 4 ,
reg : regInfo {
inputs : [ ] regMask { buildReg ( "R17" ) , buildReg ( "R16" ) , gp } ,
2016-08-04 06:57:34 -04:00
clobbers : buildReg ( "R16 R17" ) ,
2016-07-22 06:41:14 -04:00
} ,
2016-09-13 17:01:01 -07:00
clobberFlags : true ,
faultOnNilArg0 : true ,
faultOnNilArg1 : true ,
2016-07-22 06:41:14 -04:00
} ,
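Similarly, a rough Go rendering of the copy loop above, assuming the unsafe package is imported; moveWords is illustrative only.
// Copy 8 bytes from src to dst and advance both, repeating while src has not
// passed the address of the last element of src (the role of arg2 above).
func moveWords(dst, src, srcLast unsafe.Pointer) {
	for uintptr(src) <= uintptr(srcLast) {
		*(*uint64)(dst) = *(*uint64)(src)
		src = unsafe.Add(src, 8)
		dst = unsafe.Add(dst, 8)
	}
}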
2016-07-21 12:42:49 -04:00
// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
// and sorts it to the very beginning of the block to prevent other
// use of R26 (arm64.REGCTXT, the closure pointer)
2018-02-28 16:30:07 -05:00
{ name : "LoweredGetClosurePtr" , reg : regInfo { outputs : [ ] regMask { buildReg ( "R26" ) } } , zeroWidth : true } ,
2016-07-21 12:42:49 -04:00
2017-10-09 15:33:29 -04:00
// LoweredGetCallerSP returns the SP of the caller of the current function.
{ name : "LoweredGetCallerSP" , reg : gp01 , rematerializeable : true } ,
2018-04-25 08:38:09 +00:00
// LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
// I.e., if f calls g, and g calls getcallerpc,
// the result should be the PC within f that g will return to.
// See runtime/stubs.go for a more detailed discussion.
{ name : "LoweredGetCallerPC" , reg : gp01 , rematerializeable : true } ,
2020-06-15 22:52:56 -07:00
// Constant flag value.
// Note: there's an "unordered" outcome for floating-point
2016-07-21 12:42:49 -04:00
// comparisons, but we don't use such a beast yet.
2020-06-15 22:52:56 -07:00
// This op is for temporary use by rewrite rules. It
2016-07-21 12:42:49 -04:00
// cannot appear in the generated assembly.
2020-06-15 22:52:56 -07:00
{ name : "FlagConstant" , aux : "FlagConstant" } ,
2016-07-21 12:42:49 -04:00
// (InvertFlags (CMP a b)) == (CMP b a)
// InvertFlags is a pseudo-op which can't appear in assembly output.
{ name : "InvertFlags" , argLength : 1 } , // reverse direction of arg0
2016-08-29 16:26:57 -04:00
// atomic loads.
2016-09-12 15:24:11 -04:00
// load from arg0. arg1=mem. auxint must be zero.
2016-08-29 16:26:57 -04:00
// returns <value,memory> so they can be properly ordered with other loads.
2016-09-13 17:01:01 -07:00
{ name : "LDAR" , argLength : 2 , reg : gpload , asm : "LDAR" , faultOnNilArg0 : true } ,
2019-03-28 14:58:06 -04:00
{ name : "LDARB" , argLength : 2 , reg : gpload , asm : "LDARB" , faultOnNilArg0 : true } ,
2016-09-13 17:01:01 -07:00
{ name : "LDARW" , argLength : 2 , reg : gpload , asm : "LDARW" , faultOnNilArg0 : true } ,
2016-08-29 16:26:57 -04:00
// atomic stores.
2016-09-12 15:24:11 -04:00
// store arg1 to arg0. arg2=mem. returns memory. auxint must be zero.
2019-10-23 10:20:49 -04:00
{ name : "STLRB" , argLength : 3 , reg : gpstore , asm : "STLRB" , faultOnNilArg0 : true , hasSideEffects : true } ,
2017-02-21 15:22:52 -05:00
{ name : "STLR" , argLength : 3 , reg : gpstore , asm : "STLR" , faultOnNilArg0 : true , hasSideEffects : true } ,
{ name : "STLRW" , argLength : 3 , reg : gpstore , asm : "STLRW" , faultOnNilArg0 : true , hasSideEffects : true } ,
2016-08-29 16:26:57 -04:00
// atomic exchange.
2016-09-12 15:24:11 -04:00
// store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
2016-08-29 16:26:57 -04:00
// LDAXR (Rarg0), Rout
// STLXR Rarg1, (Rarg0), Rtmp
// CBNZ Rtmp, -2(PC)
2019-10-25 00:51:10 -04:00
{ name : "LoweredAtomicExchange64" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
{ name : "LoweredAtomicExchange32" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2016-08-29 16:26:57 -04:00
2020-11-04 16:18:23 +00:00
// atomic exchange variant.
// store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>. auxint must be zero.
// SWPALD Rarg1, (Rarg0), Rout
{ name : "LoweredAtomicExchange64Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true } ,
{ name : "LoweredAtomicExchange32Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true } ,
2016-08-29 16:26:57 -04:00
// atomic add.
2016-09-12 15:24:11 -04:00
// *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
2016-08-29 16:26:57 -04:00
// LDAXR (Rarg0), Rout
// ADD Rarg1, Rout
// STLXR Rout, (Rarg0), Rtmp
// CBNZ Rtmp, -3(PC)
2019-10-25 00:51:10 -04:00
{ name : "LoweredAtomicAdd64" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
{ name : "LoweredAtomicAdd32" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2016-08-29 16:26:57 -04:00
2017-11-03 02:05:28 +00:00
// atomic add variant.
// *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
// LDADDAL (Rarg0), Rarg1, Rout
// ADD Rarg1, Rout
{ name : "LoweredAtomicAdd64Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true } ,
{ name : "LoweredAtomicAdd32Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , faultOnNilArg0 : true , hasSideEffects : true } ,
2016-08-29 16:26:57 -04:00
// atomic compare and swap.
2016-09-12 15:24:11 -04:00
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
2016-08-29 16:26:57 -04:00
// if *arg0 == arg1 {
// *arg0 = arg2
// return (true, memory)
// } else {
// return (false, memory)
// }
// LDAXR (Rarg0), Rtmp
// CMP Rarg1, Rtmp
// BNE 3(PC)
// STLXR Rarg2, (Rarg0), Rtmp
// CBNZ Rtmp, -4(PC)
// CSET EQ, Rout
2019-10-25 00:51:10 -04:00
{ name : "LoweredAtomicCas64" , argLength : 4 , reg : gpcas , resultNotInArgs : true , clobberFlags : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
{ name : "LoweredAtomicCas32" , argLength : 4 , reg : gpcas , resultNotInArgs : true , clobberFlags : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2016-09-12 15:24:11 -04:00
2020-11-04 16:18:23 +00:00
// atomic compare and swap variant.
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
// if *arg0 == arg1 {
// *arg0 = arg2
// return (true, memory)
// } else {
// return (false, memory)
// }
// MOV Rarg1, Rtmp
// CASAL Rtmp, (Rarg0), Rarg2
// CMP Rarg1, Rtmp
// CSET EQ, Rout
{ name : "LoweredAtomicCas64Variant" , argLength : 4 , reg : gpcas , resultNotInArgs : true , clobberFlags : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
{ name : "LoweredAtomicCas32Variant" , argLength : 4 , reg : gpcas , resultNotInArgs : true , clobberFlags : true , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2016-09-12 15:24:11 -04:00
// atomic and/or.
2018-06-11 13:41:23 -04:00
// *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
2020-10-16 16:34:52 -04:00
// LDAXR (Rarg0), Rout
2018-06-11 13:41:23 -04:00
// AND/OR Rarg1, Rout
2020-10-16 16:34:52 -04:00
// STLXR Rout, (Rarg0), Rtmp
2016-09-12 15:24:11 -04:00
// CBNZ Rtmp, -3(PC)
2019-10-25 00:51:10 -04:00
{ name : "LoweredAtomicAnd8" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , asm : "AND" , typ : "(UInt8,Mem)" , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2020-10-16 16:34:52 -04:00
{ name : "LoweredAtomicAnd32" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , asm : "AND" , typ : "(UInt32,Mem)" , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2019-10-25 00:51:10 -04:00
{ name : "LoweredAtomicOr8" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , asm : "ORR" , typ : "(UInt8,Mem)" , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2020-10-16 16:34:52 -04:00
{ name : "LoweredAtomicOr32" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , asm : "ORR" , typ : "(UInt32,Mem)" , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
2017-11-15 14:54:24 -08:00
2020-11-04 16:18:23 +00:00
// atomic and/or variant.
// *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
// AND:
// MVN Rarg1, Rtemp
// LDANDALB Rtemp, (Rarg0), Rout
// AND Rarg1, Rout
// OR:
// LDORALB Rarg1, (Rarg0), Rout
// ORR Rarg1, Rout
{ name : "LoweredAtomicAnd8Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , typ : "(UInt8,Mem)" , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
{ name : "LoweredAtomicAnd32Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , typ : "(UInt32,Mem)" , faultOnNilArg0 : true , hasSideEffects : true , unsafePoint : true } ,
{ name : "LoweredAtomicOr8Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , typ : "(UInt8,Mem)" , faultOnNilArg0 : true , hasSideEffects : true } ,
{ name : "LoweredAtomicOr32Variant" , argLength : 3 , reg : gpxchg , resultNotInArgs : true , typ : "(UInt32,Mem)" , faultOnNilArg0 : true , hasSideEffects : true } ,
2017-11-15 14:54:24 -08:00
// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
// It saves all GP registers if necessary,
// but clobbers R30 (LR) because it's a call.
2019-06-28 09:30:36 -04:00
// R16 and R17 may be clobbered by linker trampoline.
{ name : "LoweredWB" , argLength : 3 , reg : regInfo { inputs : [ ] regMask { buildReg ( "R2" ) , buildReg ( "R3" ) } , clobbers : ( callerSave &^ gpg ) | buildReg ( "R16 R17 R30" ) } , clobberFlags : true , aux : "Sym" , symEffect : "None" } ,
2019-02-06 14:12:36 -08:00
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
2020-04-27 15:58:16 -04:00
{ name : "LoweredPanicBoundsA" , argLength : 3 , aux : "Int64" , reg : regInfo { inputs : [ ] regMask { r2 , r3 } } , typ : "Mem" , call : true } , // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{ name : "LoweredPanicBoundsB" , argLength : 3 , aux : "Int64" , reg : regInfo { inputs : [ ] regMask { r1 , r2 } } , typ : "Mem" , call : true } , // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{ name : "LoweredPanicBoundsC" , argLength : 3 , aux : "Int64" , reg : regInfo { inputs : [ ] regMask { r0 , r1 } } , typ : "Mem" , call : true } , // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
2021-06-15 14:04:30 +00:00
// Prefetch instruction
// Do prefetch arg0 address with option aux. arg0=addr, arg1=memory, aux=option.
{ name : "PRFM" , argLength : 2 , aux : "Int64" , reg : prefreg , asm : "PRFM" , hasSideEffects : true } ,
2016-07-21 12:42:49 -04:00
}
blocks := [ ] blockData {
2019-08-12 20:19:58 +01:00
{ name : "EQ" , controls : 1 } ,
{ name : "NE" , controls : 1 } ,
{ name : "LT" , controls : 1 } ,
{ name : "LE" , controls : 1 } ,
{ name : "GT" , controls : 1 } ,
{ name : "GE" , controls : 1 } ,
{ name : "ULT" , controls : 1 } ,
{ name : "ULE" , controls : 1 } ,
{ name : "UGT" , controls : 1 } ,
{ name : "UGE" , controls : 1 } ,
2020-04-30 11:04:02 +02:00
{ name : "Z" , controls : 1 } , // Control == 0 (take a register instead of flags)
{ name : "NZ" , controls : 1 } , // Control != 0
{ name : "ZW" , controls : 1 } , // Control == 0, 32-bit
{ name : "NZW" , controls : 1 } , // Control != 0, 32-bit
{ name : "TBZ" , controls : 1 , aux : "Int64" } , // Control & (1 << AuxInt) == 0
{ name : "TBNZ" , controls : 1 , aux : "Int64" } , // Control & (1 << AuxInt) != 0
2019-08-12 20:19:58 +01:00
{ name : "FLT" , controls : 1 } ,
{ name : "FLE" , controls : 1 } ,
{ name : "FGT" , controls : 1 } ,
{ name : "FGE" , controls : 1 } ,
cmd/compile: fix incorrect rewriting to if condition
Some ARM64 rewriting rules convert 'comparing to zero' conditions of if
statements to a simplified version utilizing CMN and CMP instructions to
branch on the condition flags, in order to save one Add or Sub calculation.
Such optimizations lead to wrong branching in case an overflow/underflow
occurs when executing CMN or CMP.
Fix the issue by introducing new block opcodes that don't honor the
overflow/underflow flag, in the following categories:
Block-Op    Meaning                ARM condition codes
1. LTnoov   less than              MI
2. GEnoov   greater than or equal  PL
3. LEnoov   less than or equal     MI || EQ
4. GTnoov   greater than           NEQ & PL
The backend generates two consecutive branch instructions for 'LEnoov'
and 'GTnoov' to model their expected behavior. A slight change to 'gc'
and amd64/386 backends is made to unify the code generation.
Add a test 'TestCondRewrite' as justification; it covers 32 incorrect rules
identified on arm64. More might be needed on other arches, like 32-bit arm.
Add two benchmarks profiling the aforementioned categories 1&2 and categories
3&4 separately. We expect the first two categories to show performance
improvement, and the second two not to show visible regression compared with
the non-optimized version.
This change also updates TestFormats to support using %#x.
Examples showing where the issue comes from:
1: 'if x + 3 < 0' might be converted to:
before:
CMN $3, R0
BGE <else branch> // wrong branch is taken if 'x+3' overflows
after:
CMN $3, R0
BPL <else branch>
2: 'if y - 3 > 0' might be converted to:
before:
CMP $3, R0
BLE <else branch> // wrong branch is taken if 'y-3' underflows
after:
CMP $3, R0
BMI <else branch>
BEQ <else branch>
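A minimal Go demonstration of the wrap-around case behind example 1, assuming the math package is imported; wrapCase is an illustrative name.
// With x near MaxInt64, x+3 wraps to a negative value, so Go requires the
// branch to be taken; a flags check that honors overflow (BGE) would skip it.
func wrapCase() bool {
	x := int64(math.MaxInt64 - 1)
	return x+3 < 0 // true under Go's wrap-around semantics
}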
Benchmark data from different kinds of arm64 servers; 'old' is the non-optimized
version (not the parent commit). Generally the optimized version outperforms.
S1:
name old time/op new time/op delta
CondRewrite/SoloJump 13.6ns ± 0% 12.9ns ± 0% -5.15% (p=0.000 n=10+10)
CondRewrite/CombJump 13.8ns ± 1% 12.9ns ± 0% -6.32% (p=0.000 n=10+10)
S2:
name old time/op new time/op delta
CondRewrite/SoloJump 11.6ns ± 0% 10.9ns ± 0% -6.03% (p=0.000 n=10+10)
CondRewrite/CombJump 11.4ns ± 0% 10.8ns ± 1% -5.53% (p=0.000 n=10+10)
S3:
name old time/op new time/op delta
CondRewrite/SoloJump 7.36ns ± 0% 7.50ns ± 0% +1.79% (p=0.000 n=9+10)
CondRewrite/CombJump 7.35ns ± 0% 7.75ns ± 0% +5.51% (p=0.000 n=8+9)
S4:
name old time/op new time/op delta
CondRewrite/SoloJump-224 11.5ns ± 1% 10.9ns ± 0% -4.97% (p=0.000 n=10+10)
CondRewrite/CombJump-224 11.9ns ± 0% 11.5ns ± 0% -2.95% (p=0.000 n=10+10)
S5:
name old time/op new time/op delta
CondRewrite/SoloJump 10.0ns ± 0% 10.0ns ± 0% -0.45% (p=0.000 n=9+10)
CondRewrite/CombJump 9.93ns ± 0% 9.77ns ± 0% -1.53% (p=0.000 n=10+9)
Go1 perf. data:
name old time/op new time/op delta
BinaryTree17 6.29s ± 1% 6.30s ± 1% ~ (p=1.000 n=5+5)
Fannkuch11 5.40s ± 0% 5.40s ± 0% ~ (p=0.841 n=5+5)
FmtFprintfEmpty 97.9ns ± 0% 98.9ns ± 3% ~ (p=0.937 n=4+5)
FmtFprintfString 171ns ± 3% 171ns ± 2% ~ (p=0.754 n=5+5)
FmtFprintfInt 212ns ± 0% 217ns ± 6% +2.55% (p=0.008 n=5+5)
FmtFprintfIntInt 296ns ± 1% 297ns ± 2% ~ (p=0.516 n=5+5)
FmtFprintfPrefixedInt 371ns ± 2% 374ns ± 7% ~ (p=1.000 n=5+5)
FmtFprintfFloat 435ns ± 1% 439ns ± 2% ~ (p=0.056 n=5+5)
FmtManyArgs 1.37µs ± 1% 1.36µs ± 1% ~ (p=0.730 n=5+5)
GobDecode 14.6ms ± 4% 14.4ms ± 4% ~ (p=0.690 n=5+5)
GobEncode 11.8ms ±20% 11.6ms ±15% ~ (p=1.000 n=5+5)
Gzip 507ms ± 0% 491ms ± 0% -3.22% (p=0.008 n=5+5)
Gunzip 73.8ms ± 0% 73.9ms ± 0% ~ (p=0.690 n=5+5)
HTTPClientServer 116µs ± 0% 116µs ± 0% ~ (p=0.686 n=4+4)
JSONEncode 21.8ms ± 1% 21.6ms ± 2% ~ (p=0.151 n=5+5)
JSONDecode 104ms ± 1% 103ms ± 1% -1.08% (p=0.016 n=5+5)
Mandelbrot200 9.53ms ± 0% 9.53ms ± 0% ~ (p=0.421 n=5+5)
GoParse 7.55ms ± 1% 7.51ms ± 1% ~ (p=0.151 n=5+5)
RegexpMatchEasy0_32 158ns ± 0% 158ns ± 0% ~ (all equal)
RegexpMatchEasy0_1K 606ns ± 1% 608ns ± 3% ~ (p=0.937 n=5+5)
RegexpMatchEasy1_32 143ns ± 0% 144ns ± 1% ~ (p=0.095 n=5+4)
RegexpMatchEasy1_1K 927ns ± 2% 944ns ± 2% ~ (p=0.056 n=5+5)
RegexpMatchMedium_32 16.0ns ± 0% 16.0ns ± 0% ~ (all equal)
RegexpMatchMedium_1K 69.3µs ± 2% 69.7µs ± 0% ~ (p=0.690 n=5+5)
RegexpMatchHard_32 3.73µs ± 0% 3.73µs ± 1% ~ (p=0.984 n=5+5)
RegexpMatchHard_1K 111µs ± 1% 110µs ± 0% ~ (p=0.151 n=5+5)
Revcomp 1.91s ±47% 1.77s ±68% ~ (p=1.000 n=5+5)
Template 138ms ± 1% 138ms ± 1% ~ (p=1.000 n=5+5)
TimeParse 787ns ± 2% 785ns ± 1% ~ (p=0.540 n=5+5)
TimeFormat 729ns ± 1% 726ns ± 1% ~ (p=0.151 n=5+5)
Updates #38740
Change-Id: I06c604874acdc1e63e66452dadee5df053045222
Reviewed-on: https://go-review.googlesource.com/c/go/+/233097
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
2020-05-06 09:54:40 +00:00
{ name : "LTnoov" , controls : 1 } , // 'LT' but without honoring overflow
{ name : "LEnoov" , controls : 1 } , // 'LE' but without honoring overflow
{ name : "GTnoov" , controls : 1 } , // 'GT' but without honoring overflow
{ name : "GEnoov" , controls : 1 } , // 'GE' but without honoring overflow
2016-07-21 12:42:49 -04:00
}
archs = append ( archs , arch {
2021-06-01 16:57:59 -07:00
name : "ARM64" ,
pkg : "cmd/internal/obj/arm64" ,
genfile : "../../arm64/ssa.go" ,
ops : ops ,
blocks : blocks ,
regnames : regNamesARM64 ,
2021-05-25 11:53:04 -04:00
ParamIntRegNames : "R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15" ,
ParamFloatRegNames : "F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15" ,
2021-06-01 16:57:59 -07:00
gpregmask : gp ,
fpregmask : fp ,
framepointerreg : - 1 , // not used
linkreg : int8 ( num [ "R30" ] ) ,
2016-07-21 12:42:49 -04:00
} )
}