2022-05-14 22:45:05 -04:00
|
|
|
// Code generated from _gen/RISCV64.rules using 'go generate'; DO NOT EDIT.
|
2019-11-04 04:40:47 +11:00
|
|
|
|
|
|
|
|
package ssa
|
|
|
|
|
|
2021-04-27 17:19:42 -04:00
|
|
|
import "math"
|
|
|
|
|
import "cmd/compile/internal/types"
|
2019-11-04 04:40:47 +11:00
|
|
|
|
|
|
|
|
func rewriteValueRISCV64(v *Value) bool {
|
|
|
|
|
switch v.Op {
|
2021-09-09 23:47:14 +01:00
|
|
|
case OpAbs:
|
|
|
|
|
v.Op = OpRISCV64FABSD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAdd16:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64ADD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAdd32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64ADD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAdd32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FADDS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAdd64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64ADD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAdd64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FADDD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAdd8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64ADD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAddPtr:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64ADD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAddr:
|
2020-04-12 20:05:14 -07:00
|
|
|
return rewriteValueRISCV64_OpAddr(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAnd16:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64AND
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAnd32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64AND
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAnd64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64AND
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAnd8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64AND
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAndB:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64AND
|
|
|
|
|
return true
|
2020-03-16 02:51:54 +11:00
|
|
|
case OpAtomicAdd32:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicAdd32
|
|
|
|
|
return true
|
|
|
|
|
case OpAtomicAdd64:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicAdd64
|
|
|
|
|
return true
|
2021-02-27 19:07:32 +11:00
|
|
|
case OpAtomicAnd32:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicAnd32
|
|
|
|
|
return true
|
|
|
|
|
case OpAtomicAnd8:
|
|
|
|
|
return rewriteValueRISCV64_OpAtomicAnd8(v)
|
2020-03-16 02:51:54 +11:00
|
|
|
case OpAtomicCompareAndSwap32:
|
cmd/compile: sign-extend the 2nd argument of the LoweredAtomicCas32 on loong64,mips64x,riscv64
The function LoweredAtomicCas32 is implemented using the LL-SC instruction pair
on loong64, mips64x, riscv64. However,the LL instruction on loong64, mips64x,
riscv64 is sign-extended, so it is necessary to sign-extend the 2nd parameter
"old" of the LoweredAtomicCas32, so that the instruction BNE after LL can get
the desired result.
The function prototype of LoweredAtomicCas32 in golang:
func Cas32(ptr *uint32, old, new uint32) bool
When using an intrinsify implementation:
case 1: (*ptr) <= 0x80000000 && old < 0x80000000
E.g: (*ptr) = 0x7FFFFFFF, old = Rarg1= 0x7FFFFFFF
After run the instruction "LL (Rarg0), Rtmp": Rtmp = 0x7FFFFFFF
Rtmp ! = Rarg1(old) is false, the result we expect
case 2: (*ptr) >= 0x80000000 && old >= 0x80000000
E.g: (*ptr) = 0x80000000, old = Rarg1= 0x80000000
After run the instruction "LL (Rarg0), Rtmp": Rtmp = 0xFFFFFFFF_80000000
Rtmp ! = Rarg1(old) is true, which we do not expect
When using an non-intrinsify implementation:
Because Rarg1 is loaded from the stack using sign-extended instructions
ld.w, the situation described in Case 2 above does not occur
Benchmarks on linux/loong64:
name old time/op new time/op delta
Cas 50.0ns ± 0% 50.1ns ± 0% ~ (p=1.000 n=1+1)
Cas64 50.0ns ± 0% 50.1ns ± 0% ~ (p=1.000 n=1+1)
Cas-4 56.0ns ± 0% 56.0ns ± 0% ~ (p=1.000 n=1+1)
Cas64-4 56.0ns ± 0% 56.0ns ± 0% ~ (p=1.000 n=1+1)
Benchmarks on Loongson 3A4000 (GOARCH=mips64le, 1.8GHz)
name old time/op new time/op delta
Cas 70.4ns ± 0% 70.3ns ± 0% ~ (p=1.000 n=1+1)
Cas64 70.7ns ± 0% 70.6ns ± 0% ~ (p=1.000 n=1+1)
Cas-4 81.1ns ± 0% 80.8ns ± 0% ~ (p=1.000 n=1+1)
Cas64-4 80.9ns ± 0% 80.9ns ± 0% ~ (p=1.000 n=1+1)
Fixes #57282
Change-Id: I190a7fc648023b15fa392f7fdda5ac18c1561bac
Reviewed-on: https://go-review.googlesource.com/c/go/+/457135
Run-TryBot: Than McIntosh <thanm@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Wayne Zuo <wdvxdr@golangcn.org>
Reviewed-by: Than McIntosh <thanm@google.com>
Reviewed-by: David Chase <drchase@google.com>
2022-12-19 05:04:48 +08:00
|
|
|
return rewriteValueRISCV64_OpAtomicCompareAndSwap32(v)
|
2020-03-16 02:51:54 +11:00
|
|
|
case OpAtomicCompareAndSwap64:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicCas64
|
|
|
|
|
return true
|
|
|
|
|
case OpAtomicExchange32:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicExchange32
|
|
|
|
|
return true
|
|
|
|
|
case OpAtomicExchange64:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicExchange64
|
|
|
|
|
return true
|
2020-03-16 02:47:40 +11:00
|
|
|
case OpAtomicLoad32:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicLoad32
|
|
|
|
|
return true
|
|
|
|
|
case OpAtomicLoad64:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicLoad64
|
|
|
|
|
return true
|
2020-03-16 02:38:43 +11:00
|
|
|
case OpAtomicLoad8:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicLoad8
|
|
|
|
|
return true
|
2020-03-16 02:47:40 +11:00
|
|
|
case OpAtomicLoadPtr:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicLoad64
|
|
|
|
|
return true
|
2021-02-27 19:07:32 +11:00
|
|
|
case OpAtomicOr32:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicOr32
|
|
|
|
|
return true
|
|
|
|
|
case OpAtomicOr8:
|
|
|
|
|
return rewriteValueRISCV64_OpAtomicOr8(v)
|
2020-03-16 02:47:40 +11:00
|
|
|
case OpAtomicStore32:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicStore32
|
|
|
|
|
return true
|
|
|
|
|
case OpAtomicStore64:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicStore64
|
|
|
|
|
return true
|
2020-03-16 02:38:43 +11:00
|
|
|
case OpAtomicStore8:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicStore8
|
|
|
|
|
return true
|
2020-03-16 02:47:40 +11:00
|
|
|
case OpAtomicStorePtrNoWB:
|
|
|
|
|
v.Op = OpRISCV64LoweredAtomicStore64
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpAvg64u:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpAvg64u(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpClosureCall:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64CALLclosure
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCom16:
|
2020-03-03 03:43:02 +11:00
|
|
|
v.Op = OpRISCV64NOT
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCom32:
|
2020-03-03 03:43:02 +11:00
|
|
|
v.Op = OpRISCV64NOT
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCom64:
|
2020-03-03 03:43:02 +11:00
|
|
|
v.Op = OpRISCV64NOT
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCom8:
|
2020-03-03 03:43:02 +11:00
|
|
|
v.Op = OpRISCV64NOT
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConst16:
|
2021-03-18 03:37:58 +11:00
|
|
|
return rewriteValueRISCV64_OpConst16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConst32:
|
2021-03-18 03:37:58 +11:00
|
|
|
return rewriteValueRISCV64_OpConst32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConst32F:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpConst32F(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConst64:
|
2021-03-18 03:37:58 +11:00
|
|
|
return rewriteValueRISCV64_OpConst64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConst64F:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpConst64F(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConst8:
|
2021-03-18 03:37:58 +11:00
|
|
|
return rewriteValueRISCV64_OpConst8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConstBool:
|
2020-04-20 18:15:50 -04:00
|
|
|
return rewriteValueRISCV64_OpConstBool(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConstNil:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpConstNil(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpConvert:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64MOVconvert
|
|
|
|
|
return true
|
2021-09-09 23:47:14 +01:00
|
|
|
case OpCopysign:
|
|
|
|
|
v.Op = OpRISCV64FSGNJD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt32Fto32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTWS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt32Fto64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTLS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt32Fto64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTDS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt32to32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTSW
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt32to64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTDW
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt64Fto32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTWD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt64Fto32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTSD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt64Fto64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTLD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt64to32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTSL
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpCvt64to64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FCVTDL
|
|
|
|
|
return true
|
2020-02-28 17:04:16 -08:00
|
|
|
case OpCvtBoolToUint8:
|
|
|
|
|
v.Op = OpCopy
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpDiv16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv16u:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpDiv16u(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv32:
|
2020-04-20 18:15:50 -04:00
|
|
|
return rewriteValueRISCV64_OpDiv32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FDIVS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv32u:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64DIVUW
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv64:
|
2020-04-20 18:15:50 -04:00
|
|
|
return rewriteValueRISCV64_OpDiv64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FDIVD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv64u:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64DIVU
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpDiv8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpDiv8u:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpDiv8u(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEq16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpEq16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEq32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpEq32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEq32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FEQS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEq64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpEq64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEq64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FEQD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEq8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpEq8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEqB:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpEqB(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpEqPtr:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpEqPtr(v)
|
2021-02-17 15:00:34 +00:00
|
|
|
case OpFMA:
|
|
|
|
|
v.Op = OpRISCV64FMADDD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpGetCallerPC:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64LoweredGetCallerPC
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpGetCallerSP:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64LoweredGetCallerSP
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpGetClosurePtr:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64LoweredGetClosurePtr
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpHmul32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpHmul32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpHmul32u:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpHmul32u(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpHmul64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64MULH
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpHmul64u:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64MULHU
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpInterCall:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64CALLinter
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpIsInBounds:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpLess64U
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpIsNonNil:
|
2021-03-06 04:46:07 +11:00
|
|
|
v.Op = OpRISCV64SNEZ
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpIsSliceInBounds:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpLeq64U
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq16U:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq16U(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FLES
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq32U:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq32U(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FLED
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq64U:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq64U(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLeq8U:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLeq8U(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLess16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess16U:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLess16U(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLess32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FLTS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess32U:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLess32U(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64SLT
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FLTD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess64U:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64SLTU
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLess8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLess8U:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLess8U(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLoad:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLoad(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLocalAddr:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLocalAddr(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh16x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh16x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh16x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh16x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh16x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh16x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh16x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh16x8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh32x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh32x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh32x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh32x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh32x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh32x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh32x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh32x8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh64x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh64x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh64x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh64x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh64x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh64x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh64x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh64x8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh8x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh8x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh8x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh8x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh8x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh8x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpLsh8x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpLsh8x8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpMod16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod16u:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpMod16u(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod32:
|
2020-04-20 18:15:50 -04:00
|
|
|
return rewriteValueRISCV64_OpMod32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod32u:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64REMUW
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod64:
|
2020-04-20 18:15:50 -04:00
|
|
|
return rewriteValueRISCV64_OpMod64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod64u:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64REMU
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpMod8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMod8u:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpMod8u(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMove:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpMove(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMul16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpMul16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMul32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64MULW
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMul32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FMULS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMul64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64MUL
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMul64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FMULD
|
|
|
|
|
return true
|
2021-06-22 11:20:03 +00:00
|
|
|
case OpMul64uhilo:
|
|
|
|
|
v.Op = OpRISCV64LoweredMuluhilo
|
|
|
|
|
return true
|
2021-07-31 10:20:10 +00:00
|
|
|
case OpMul64uover:
|
|
|
|
|
v.Op = OpRISCV64LoweredMuluover
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpMul8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpMul8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeg16:
|
2020-03-03 03:45:22 +11:00
|
|
|
v.Op = OpRISCV64NEG
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeg32:
|
2020-03-03 03:45:22 +11:00
|
|
|
v.Op = OpRISCV64NEG
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeg32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FNEGS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeg64:
|
2020-03-03 03:45:22 +11:00
|
|
|
v.Op = OpRISCV64NEG
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeg64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FNEGD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeg8:
|
2020-03-03 03:45:22 +11:00
|
|
|
v.Op = OpRISCV64NEG
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeq16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpNeq16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeq32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpNeq32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeq32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FNES
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeq64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpNeq64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeq64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FNED
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeq8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpNeq8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeqB:
|
2022-08-30 18:59:53 +10:00
|
|
|
return rewriteValueRISCV64_OpNeqB(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNeqPtr:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpNeqPtr(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNilCheck:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64LoweredNilCheck
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpNot:
|
2020-04-26 04:34:34 +10:00
|
|
|
v.Op = OpRISCV64SEQZ
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpOffPtr:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpOffPtr(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpOr16:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64OR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpOr32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64OR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpOr64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64OR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpOr8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64OR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpOrB:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64OR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpPanicBounds:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpPanicBounds(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64ADD:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64ADD(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64ADDI:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64ADDI(v)
|
2020-03-10 03:31:22 +11:00
|
|
|
case OpRISCV64AND:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64AND(v)
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
case OpRISCV64ANDI:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64ANDI(v)
|
2023-06-26 20:46:49 +08:00
|
|
|
case OpRISCV64FADDD:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FADDD(v)
|
2023-06-28 16:45:07 +08:00
|
|
|
case OpRISCV64FADDS:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FADDS(v)
|
2021-02-17 15:00:34 +00:00
|
|
|
case OpRISCV64FMADDD:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FMADDD(v)
|
2023-06-28 16:45:07 +08:00
|
|
|
case OpRISCV64FMADDS:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FMADDS(v)
|
2021-02-17 15:00:34 +00:00
|
|
|
case OpRISCV64FMSUBD:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FMSUBD(v)
|
2023-06-28 16:45:07 +08:00
|
|
|
case OpRISCV64FMSUBS:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FMSUBS(v)
|
2021-02-17 15:00:34 +00:00
|
|
|
case OpRISCV64FNMADDD:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FNMADDD(v)
|
2023-06-28 16:45:07 +08:00
|
|
|
case OpRISCV64FNMADDS:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FNMADDS(v)
|
2021-02-17 15:00:34 +00:00
|
|
|
case OpRISCV64FNMSUBD:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FNMSUBD(v)
|
2023-06-28 16:45:07 +08:00
|
|
|
case OpRISCV64FNMSUBS:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FNMSUBS(v)
|
2023-06-26 20:46:49 +08:00
|
|
|
case OpRISCV64FSUBD:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FSUBD(v)
|
2023-06-28 16:45:07 +08:00
|
|
|
case OpRISCV64FSUBS:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64FSUBS(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVBUload:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVBUload(v)
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
case OpRISCV64MOVBUreg:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVBUreg(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVBload:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVBload(v)
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
case OpRISCV64MOVBreg:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVBreg(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVBstore:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVBstore(v)
|
2020-03-02 04:26:54 +11:00
|
|
|
case OpRISCV64MOVBstorezero:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVBstorezero(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVDload:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVDload(v)
|
2020-12-09 14:59:40 -08:00
|
|
|
case OpRISCV64MOVDnop:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVDnop(v)
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
case OpRISCV64MOVDreg:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVDreg(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVDstore:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVDstore(v)
|
2020-03-02 04:26:54 +11:00
|
|
|
case OpRISCV64MOVDstorezero:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVDstorezero(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVHUload:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVHUload(v)
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
case OpRISCV64MOVHUreg:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVHUreg(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVHload:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVHload(v)
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
case OpRISCV64MOVHreg:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVHreg(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVHstore:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVHstore(v)
|
2020-03-02 04:26:54 +11:00
|
|
|
case OpRISCV64MOVHstorezero:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVHstorezero(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVWUload:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVWUload(v)
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
case OpRISCV64MOVWUreg:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVWUreg(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVWload:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVWload(v)
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
case OpRISCV64MOVWreg:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVWreg(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRISCV64MOVWstore:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVWstore(v)
|
2020-03-02 04:26:54 +11:00
|
|
|
case OpRISCV64MOVWstorezero:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64MOVWstorezero(v)
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
case OpRISCV64NEG:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64NEG(v)
|
|
|
|
|
case OpRISCV64NEGW:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64NEGW(v)
|
2020-03-10 03:31:22 +11:00
|
|
|
case OpRISCV64OR:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64OR(v)
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
case OpRISCV64ORI:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64ORI(v)
|
2022-08-28 06:08:02 +10:00
|
|
|
case OpRISCV64SEQZ:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SEQZ(v)
|
2020-03-10 03:31:22 +11:00
|
|
|
case OpRISCV64SLL:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SLL(v)
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
case OpRISCV64SLLI:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SLLI(v)
|
2022-07-29 14:24:26 +08:00
|
|
|
case OpRISCV64SLT:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SLT(v)
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
case OpRISCV64SLTI:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SLTI(v)
|
|
|
|
|
case OpRISCV64SLTIU:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SLTIU(v)
|
2022-07-29 14:24:26 +08:00
|
|
|
case OpRISCV64SLTU:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SLTU(v)
|
2022-08-28 06:08:02 +10:00
|
|
|
case OpRISCV64SNEZ:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SNEZ(v)
|
2020-03-10 03:31:22 +11:00
|
|
|
case OpRISCV64SRA:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SRA(v)
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
case OpRISCV64SRAI:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SRAI(v)
|
2020-03-10 03:31:22 +11:00
|
|
|
case OpRISCV64SRL:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SRL(v)
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
case OpRISCV64SRLI:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SRLI(v)
|
2020-03-02 04:23:12 +11:00
|
|
|
case OpRISCV64SUB:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SUB(v)
|
2020-03-02 04:24:35 +11:00
|
|
|
case OpRISCV64SUBW:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64SUBW(v)
|
2020-03-10 03:31:22 +11:00
|
|
|
case OpRISCV64XOR:
|
|
|
|
|
return rewriteValueRISCV64_OpRISCV64XOR(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRotateLeft16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRotateLeft16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRotateLeft32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRotateLeft32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRotateLeft64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRotateLeft64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRotateLeft8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRotateLeft8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRound32F:
|
2023-06-26 20:46:49 +08:00
|
|
|
v.Op = OpRISCV64LoweredRound32F
|
2020-01-23 14:28:04 -08:00
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRound64F:
|
2023-06-26 20:46:49 +08:00
|
|
|
v.Op = OpRISCV64LoweredRound64F
|
2020-01-23 14:28:04 -08:00
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16Ux16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16Ux16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16Ux32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16Ux32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16Ux64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16Ux64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16Ux8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16Ux8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh16x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh16x8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32Ux16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32Ux16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32Ux32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32Ux32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32Ux64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32Ux64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32Ux8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32Ux8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh32x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh32x8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64Ux16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64Ux16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64Ux32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64Ux32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64Ux64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64Ux64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64Ux8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64Ux8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh64x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh64x8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8Ux16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8Ux16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8Ux32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8Ux32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8Ux64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8Ux64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8Ux8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8Ux8(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8x16:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8x16(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8x32:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8x32(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8x64:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8x64(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpRsh8x8:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpRsh8x8(v)
|
2022-08-24 22:17:51 +08:00
|
|
|
case OpSelect0:
|
|
|
|
|
return rewriteValueRISCV64_OpSelect0(v)
|
|
|
|
|
case OpSelect1:
|
|
|
|
|
return rewriteValueRISCV64_OpSelect1(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSignExt16to32:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVHreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSignExt16to64:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVHreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSignExt32to64:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVWreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSignExt8to16:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVBreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSignExt8to32:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVBreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSignExt8to64:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVBreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSlicemask:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpSlicemask(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSqrt:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FSQRTD
|
|
|
|
|
return true
|
2020-12-07 19:15:15 +08:00
|
|
|
case OpSqrt32:
|
|
|
|
|
v.Op = OpRISCV64FSQRTS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpStaticCall:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64CALLstatic
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpStore:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpStore(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSub16:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64SUB
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSub32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64SUB
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSub32F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FSUBS
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSub64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64SUB
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSub64F:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64FSUBD
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSub8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64SUB
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpSubPtr:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64SUB
|
|
|
|
|
return true
|
cmd/compile: restore tail call for method wrappers
For certain type of method wrappers we used to generate a tail
call. That was disabled in CL 307234 when register ABI is used,
because with the current IR it was difficult to generate a tail
call with the arguments in the right places. The problem was that
the IR does not contain a CALL-like node with arguments; instead,
it contains an OAS node that adjusts the receiver, than an
OTAILCALL node that just contains the target, but no argument
(with the assumption that the OAS node will put the adjusted
receiver in the right place). With register ABI, putting
arguments in registers are done in SSA. The assignment (OAS)
doesn't put the receiver in register.
This CL changes the IR of a tail call to take an actual OCALL
node. Specifically, a tail call is represented as
OTAILCALL (OCALL target args...)
This way, the call target and args are connected through the OCALL
node. So the call can be analyzed in SSA and the args can be passed
in the right places.
(Alternatively, we could have OTAILCALL node directly take the
target and the args, without the OCALL node. Using an OCALL node is
convenient as there are existing code that processes OCALL nodes
which do not need to be changed. Also, a tail call is similar to
ORETURN (OCALL target args...), except it doesn't preserve the
frame. I did the former but I'm open to change.)
The SSA representation is similar. Previously, the IR lowers to
a Store the receiver then a BlockRetJmp which jumps to the target
(without putting the arg in register). Now we use a TailCall op,
which takes the target and the args. The call expansion pass and
the register allocator handles TailCall pretty much like a
StaticCall, and it will do the right ABI analysis and put the args
in the right places. (Args other than the receiver are already in
the right places. For register args it generates no code for them.
For stack args currently it generates a self copy. I'll work on
optimize that out.) BlockRetJmp is still used, signaling it is a
tail call. The actual call is made in the TailCall op so
BlockRetJmp generates no code (we could use BlockExit if we like).
This slightly reduces binary size:
old new
cmd/go 14003088 13953936
cmd/link 6275552 6271456
Change-Id: I2d16d8d419fe1f17554916d317427383e17e27f0
Reviewed-on: https://go-review.googlesource.com/c/go/+/350145
Trust: Cherry Mui <cherryyz@google.com>
Run-TryBot: Cherry Mui <cherryyz@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: David Chase <drchase@google.com>
2021-09-10 22:05:55 -04:00
|
|
|
case OpTailCall:
|
|
|
|
|
v.Op = OpRISCV64CALLtail
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpTrunc16to8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpCopy
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpTrunc32to16:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpCopy
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpTrunc32to8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpCopy
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpTrunc64to16:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpCopy
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpTrunc64to32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpCopy
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpTrunc64to8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpCopy
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpWB:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64LoweredWB
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpXor16:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64XOR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpXor32:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64XOR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpXor64:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64XOR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpXor8:
|
2020-01-23 14:28:04 -08:00
|
|
|
v.Op = OpRISCV64XOR
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpZero:
|
2020-01-21 20:53:30 -08:00
|
|
|
return rewriteValueRISCV64_OpZero(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpZeroExt16to32:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVHUreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpZeroExt16to64:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVHUreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpZeroExt32to64:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVWUreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpZeroExt8to16:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVBUreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpZeroExt8to32:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVBUreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
case OpZeroExt8to64:
|
2020-10-25 00:32:23 +11:00
|
|
|
v.Op = OpRISCV64MOVBUreg
|
|
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-04-12 20:05:14 -07:00
|
|
|
// rewriteValueRISCV64_OpAddr lowers the generic Addr op (address of a
// symbol) to the RISC-V MOVaddr pseudo-op with a zero offset.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpAddr(v *Value) bool {
	v_0 := v.Args[0]
	// match: (Addr {sym} base)
	// result: (MOVaddr {sym} [0] base)
	for {
		sym := auxToSym(v.Aux)
		base := v_0
		v.reset(OpRISCV64MOVaddr)
		v.AuxInt = int32ToAuxInt(0)
		v.Aux = symToAux(sym)
		v.AddArg(base)
		return true
	}
}
|
2021-02-27 19:07:32 +11:00
|
|
|
// rewriteValueRISCV64_OpAtomicAnd8 lowers a byte-wide atomic AND to a
// 32-bit LoweredAtomicAnd32 on the containing aligned word: the address
// is rounded down with ANDI [^3], and the operand is built so that the
// target byte holds ^val's complemented mask while the other three bytes
// are all-ones (via NOT of the shifted XORI [0xff] value), leaving them
// unchanged by the AND. The byte's bit offset is (ptr&3)<<3.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpAtomicAnd8(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (AtomicAnd8 ptr val mem)
	// result: (LoweredAtomicAnd32 (ANDI <typ.Uintptr> [^3] ptr) (NOT <typ.UInt32> (SLL <typ.UInt32> (XORI <typ.UInt32> [0xff] (ZeroExt8to32 val)) (SLLI <typ.UInt64> [3] (ANDI <typ.UInt64> [3] ptr)))) mem)
	for {
		ptr := v_0
		val := v_1
		mem := v_2
		v.reset(OpRISCV64LoweredAtomicAnd32)
		// v0: word-aligned address (clear low two bits of ptr).
		v0 := b.NewValue0(v.Pos, OpRISCV64ANDI, typ.Uintptr)
		v0.AuxInt = int64ToAuxInt(^3)
		v0.AddArg(ptr)
		v1 := b.NewValue0(v.Pos, OpRISCV64NOT, typ.UInt32)
		v2 := b.NewValue0(v.Pos, OpRISCV64SLL, typ.UInt32)
		// v3: invert the low byte of val so that, after shifting and the
		// outer NOT, non-target bytes come out as 0xff.
		v3 := b.NewValue0(v.Pos, OpRISCV64XORI, typ.UInt32)
		v3.AuxInt = int64ToAuxInt(0xff)
		v4 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32)
		v4.AddArg(val)
		v3.AddArg(v4)
		// v5/v6: bit shift for the target byte, (ptr & 3) * 8.
		v5 := b.NewValue0(v.Pos, OpRISCV64SLLI, typ.UInt64)
		v5.AuxInt = int64ToAuxInt(3)
		v6 := b.NewValue0(v.Pos, OpRISCV64ANDI, typ.UInt64)
		v6.AuxInt = int64ToAuxInt(3)
		v6.AddArg(ptr)
		v5.AddArg(v6)
		v2.AddArg2(v3, v5)
		v1.AddArg(v2)
		v.AddArg3(v0, v1, mem)
		return true
	}
}
|
cmd/compile: sign-extend the 2nd argument of the LoweredAtomicCas32 on loong64,mips64x,riscv64
The function LoweredAtomicCas32 is implemented using the LL-SC instruction pair
on loong64, mips64x, riscv64. However,the LL instruction on loong64, mips64x,
riscv64 is sign-extended, so it is necessary to sign-extend the 2nd parameter
"old" of the LoweredAtomicCas32, so that the instruction BNE after LL can get
the desired result.
The function prototype of LoweredAtomicCas32 in golang:
func Cas32(ptr *uint32, old, new uint32) bool
When using an intrinsify implementation:
case 1: (*ptr) <= 0x80000000 && old < 0x80000000
E.g: (*ptr) = 0x7FFFFFFF, old = Rarg1= 0x7FFFFFFF
After run the instruction "LL (Rarg0), Rtmp": Rtmp = 0x7FFFFFFF
Rtmp ! = Rarg1(old) is false, the result we expect
case 2: (*ptr) >= 0x80000000 && old >= 0x80000000
E.g: (*ptr) = 0x80000000, old = Rarg1= 0x80000000
After run the instruction "LL (Rarg0), Rtmp": Rtmp = 0xFFFFFFFF_80000000
Rtmp ! = Rarg1(old) is true, which we do not expect
When using an non-intrinsify implementation:
Because Rarg1 is loaded from the stack using sign-extended instructions
ld.w, the situation described in Case 2 above does not occur
Benchmarks on linux/loong64:
name old time/op new time/op delta
Cas 50.0ns ± 0% 50.1ns ± 0% ~ (p=1.000 n=1+1)
Cas64 50.0ns ± 0% 50.1ns ± 0% ~ (p=1.000 n=1+1)
Cas-4 56.0ns ± 0% 56.0ns ± 0% ~ (p=1.000 n=1+1)
Cas64-4 56.0ns ± 0% 56.0ns ± 0% ~ (p=1.000 n=1+1)
Benchmarks on Loongson 3A4000 (GOARCH=mips64le, 1.8GHz)
name old time/op new time/op delta
Cas 70.4ns ± 0% 70.3ns ± 0% ~ (p=1.000 n=1+1)
Cas64 70.7ns ± 0% 70.6ns ± 0% ~ (p=1.000 n=1+1)
Cas-4 81.1ns ± 0% 80.8ns ± 0% ~ (p=1.000 n=1+1)
Cas64-4 80.9ns ± 0% 80.9ns ± 0% ~ (p=1.000 n=1+1)
Fixes #57282
Change-Id: I190a7fc648023b15fa392f7fdda5ac18c1561bac
Reviewed-on: https://go-review.googlesource.com/c/go/+/457135
Run-TryBot: Than McIntosh <thanm@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Wayne Zuo <wdvxdr@golangcn.org>
Reviewed-by: Than McIntosh <thanm@google.com>
Reviewed-by: David Chase <drchase@google.com>
2022-12-19 05:04:48 +08:00
|
|
|
// rewriteValueRISCV64_OpAtomicCompareAndSwap32 lowers a 32-bit CAS to
// LoweredAtomicCas32. The comparison value `old` is sign-extended to 64
// bits because the riscv64 LR (load-reserved) instruction used by the
// lowered op sign-extends the loaded 32-bit word; without the matching
// extension, values with the high bit set would never compare equal
// (see go.dev/issue/57282).
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpAtomicCompareAndSwap32(v *Value) bool {
	v_3 := v.Args[3]
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (AtomicCompareAndSwap32 ptr old new mem)
	// result: (LoweredAtomicCas32 ptr (SignExt32to64 old) new mem)
	for {
		ptr := v_0
		old := v_1
		new := v_2
		mem := v_3
		v.reset(OpRISCV64LoweredAtomicCas32)
		v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
		v0.AddArg(old)
		v.AddArg4(ptr, v0, new, mem)
		return true
	}
}
|
2021-02-27 19:07:32 +11:00
|
|
|
// rewriteValueRISCV64_OpAtomicOr8 lowers a byte-wide atomic OR to a
// 32-bit LoweredAtomicOr32 on the containing aligned word: the address
// is rounded down with ANDI [^3] and the zero-extended byte value is
// shifted into the byte's position ((ptr&3)<<3). The other three bytes
// of the operand are zero, so ORing leaves them unchanged.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpAtomicOr8(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (AtomicOr8 ptr val mem)
	// result: (LoweredAtomicOr32 (ANDI <typ.Uintptr> [^3] ptr) (SLL <typ.UInt32> (ZeroExt8to32 val) (SLLI <typ.UInt64> [3] (ANDI <typ.UInt64> [3] ptr))) mem)
	for {
		ptr := v_0
		val := v_1
		mem := v_2
		v.reset(OpRISCV64LoweredAtomicOr32)
		// v0: word-aligned address (clear low two bits of ptr).
		v0 := b.NewValue0(v.Pos, OpRISCV64ANDI, typ.Uintptr)
		v0.AuxInt = int64ToAuxInt(^3)
		v0.AddArg(ptr)
		v1 := b.NewValue0(v.Pos, OpRISCV64SLL, typ.UInt32)
		v2 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32)
		v2.AddArg(val)
		// v3/v4: bit shift for the target byte, (ptr & 3) * 8.
		v3 := b.NewValue0(v.Pos, OpRISCV64SLLI, typ.UInt64)
		v3.AuxInt = int64ToAuxInt(3)
		v4 := b.NewValue0(v.Pos, OpRISCV64ANDI, typ.UInt64)
		v4.AuxInt = int64ToAuxInt(3)
		v4.AddArg(ptr)
		v3.AddArg(v4)
		v1.AddArg2(v2, v3)
		v.AddArg3(v0, v1, mem)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpAvg64u lowers the unsigned 64-bit average
// (x+y)/2 without overflow, using the identity
// avg(x,y) = (x>>1) + (y>>1) + (x&y&1): the halves are summed and the
// carry bit that both low bits would have produced is added back.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpAvg64u(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	// match: (Avg64u <t> x y)
	// result: (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y)))
	for {
		t := v.Type
		x := v_0
		y := v_1
		v.reset(OpRISCV64ADD)
		// v0 = (x >> 1) + (y >> 1)
		v0 := b.NewValue0(v.Pos, OpRISCV64ADD, t)
		v1 := b.NewValue0(v.Pos, OpRISCV64SRLI, t)
		v1.AuxInt = int64ToAuxInt(1)
		v1.AddArg(x)
		v2 := b.NewValue0(v.Pos, OpRISCV64SRLI, t)
		v2.AuxInt = int64ToAuxInt(1)
		v2.AddArg(y)
		v0.AddArg2(v1, v2)
		// v3 = (x & y) & 1 — the shared low bit lost by the shifts.
		v3 := b.NewValue0(v.Pos, OpRISCV64ANDI, t)
		v3.AuxInt = int64ToAuxInt(1)
		v4 := b.NewValue0(v.Pos, OpRISCV64AND, t)
		v4.AddArg2(x, y)
		v3.AddArg(v4)
		v.AddArg2(v0, v3)
		return true
	}
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// rewriteValueRISCV64_OpConst16 lowers a 16-bit integer constant to a
// MOVDconst carrying the value widened to int64.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConst16(v *Value) bool {
	// match: (Const16 [val])
	// result: (MOVDconst [int64(val)])
	for {
		val := auxIntToInt16(v.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(int64(val))
		return true
	}
}
|
|
|
|
|
// rewriteValueRISCV64_OpConst32 lowers a 32-bit integer constant to a
// MOVDconst carrying the value widened to int64.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConst32(v *Value) bool {
	// match: (Const32 [val])
	// result: (MOVDconst [int64(val)])
	for {
		val := auxIntToInt32(v.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(int64(val))
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpConst32F lowers a float32 constant by
// materializing its IEEE-754 bit pattern in an integer register
// (MOVDconst) and moving it to a float register with FMVSX.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConst32F(v *Value) bool {
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Const32F [val])
	// result: (FMVSX (MOVDconst [int64(math.Float32bits(val))]))
	for {
		val := auxIntToFloat32(v.AuxInt)
		v.reset(OpRISCV64FMVSX)
		v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		// Store the raw float32 bits, zero-extended into the int64 aux.
		v0.AuxInt = int64ToAuxInt(int64(math.Float32bits(val)))
		v.AddArg(v0)
		return true
	}
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// rewriteValueRISCV64_OpConst64 lowers a 64-bit integer constant to a
// MOVDconst carrying the same value.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConst64(v *Value) bool {
	// match: (Const64 [val])
	// result: (MOVDconst [int64(val)])
	for {
		val := auxIntToInt64(v.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(int64(val))
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpConst64F lowers a float64 constant by
// materializing its IEEE-754 bit pattern in an integer register
// (MOVDconst) and moving it to a float register with FMVDX.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConst64F(v *Value) bool {
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Const64F [val])
	// result: (FMVDX (MOVDconst [int64(math.Float64bits(val))]))
	for {
		val := auxIntToFloat64(v.AuxInt)
		v.reset(OpRISCV64FMVDX)
		v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		// Store the raw float64 bits reinterpreted as int64.
		v0.AuxInt = int64ToAuxInt(int64(math.Float64bits(val)))
		v.AddArg(v0)
		return true
	}
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// rewriteValueRISCV64_OpConst8 lowers an 8-bit integer constant to a
// MOVDconst carrying the value widened to int64.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConst8(v *Value) bool {
	// match: (Const8 [val])
	// result: (MOVDconst [int64(val)])
	for {
		val := auxIntToInt8(v.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(int64(val))
		return true
	}
}
|
2020-04-20 18:15:50 -04:00
|
|
|
// rewriteValueRISCV64_OpConstBool lowers a boolean constant to a
// MOVDconst of 0 or 1 (b2i converts the bool to an integer).
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConstBool(v *Value) bool {
	// match: (ConstBool [val])
	// result: (MOVDconst [int64(b2i(val))])
	for {
		val := auxIntToBool(v.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(int64(b2i(val)))
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpConstNil lowers the nil-pointer constant to a
// MOVDconst of 0.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpConstNil(v *Value) bool {
	// match: (ConstNil)
	// result: (MOVDconst [0])
	for {
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(0)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpDiv16 lowers signed 16-bit division to DIVW on
// sign-extended 32-bit operands. The rule applies only when the op's
// AuxInt is false (i.e. the frontend has not marked the division as
// needing special handling); otherwise no rewrite fires.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpDiv16(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Div16 x y [false])
	// result: (DIVW (SignExt16to32 x) (SignExt16to32 y))
	for {
		// Guard: the rule only matches a false AuxInt.
		if auxIntToBool(v.AuxInt) != false {
			break
		}
		x := v_0
		y := v_1
		v.reset(OpRISCV64DIVW)
		v0 := b.NewValue0(v.Pos, OpSignExt16to32, typ.Int32)
		v0.AddArg(x)
		v1 := b.NewValue0(v.Pos, OpSignExt16to32, typ.Int32)
		v1.AddArg(y)
		v.AddArg2(v0, v1)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpDiv16u lowers unsigned 16-bit division to DIVUW
// on zero-extended 32-bit operands.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpDiv16u(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Div16u x y)
	// result: (DIVUW (ZeroExt16to32 x) (ZeroExt16to32 y))
	for {
		x := v_0
		y := v_1
		v.reset(OpRISCV64DIVUW)
		v0 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32)
		v0.AddArg(x)
		v1 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32)
		v1.AddArg(y)
		v.AddArg2(v0, v1)
		return true
	}
}
|
2020-04-20 18:15:50 -04:00
|
|
|
// rewriteValueRISCV64_OpDiv32 lowers signed 32-bit division directly to
// DIVW. The rule applies only when the op's AuxInt is false; otherwise
// no rewrite fires.
// Generated from _gen/RISCV64.rules; do not edit by hand.
func rewriteValueRISCV64_OpDiv32(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (Div32 x y [false])
	// result: (DIVW x y)
	for {
		// Guard: the rule only matches a false AuxInt.
		if auxIntToBool(v.AuxInt) != false {
			break
		}
		x := v_0
		y := v_1
		v.reset(OpRISCV64DIVW)
		v.AddArg2(x, y)
		return true
	}
	return false
}
|
|
|
|
|
// rewriteValueRISCV64_OpDiv64 lowers the generic Div64 op to the RISCV64
// DIV instruction. Generated from _gen/RISCV64.rules; do not hand-edit.
//
// The rule only fires when the op's boolean AuxInt is false (the
// "divide may need fix-up" flag is unset); otherwise it reports false
// and leaves v unchanged.
func rewriteValueRISCV64_OpDiv64(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (Div64 x y [false])
	// result: (DIV x y)
	for {
		// AuxInt holds the Div64 "needs fix-up" flag; bail out if set.
		if auxIntToBool(v.AuxInt) != false {
			break
		}
		x := v_0
		y := v_1
		v.reset(OpRISCV64DIV)
		v.AddArg2(x, y)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpDiv8 lowers the generic Div8 op for RISCV64.
// Generated from _gen/RISCV64.rules; do not hand-edit.
//
// There is no 8-bit divide instruction, so both operands are first
// sign-extended to 32 bits and the division is performed with DIVW.
// The rewrite is unconditional, so this always returns true.
func rewriteValueRISCV64_OpDiv8(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Div8 x y)
	// result: (DIVW (SignExt8to32 x) (SignExt8to32 y))
	for {
		x := v_0
		y := v_1
		v.reset(OpRISCV64DIVW)
		// Widen each 8-bit operand to a signed 32-bit value.
		v0 := b.NewValue0(v.Pos, OpSignExt8to32, typ.Int32)
		v0.AddArg(x)
		v1 := b.NewValue0(v.Pos, OpSignExt8to32, typ.Int32)
		v1.AddArg(y)
		v.AddArg2(v0, v1)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpDiv8u lowers the generic unsigned Div8u op for
// RISCV64. Generated from _gen/RISCV64.rules; do not hand-edit.
//
// There is no 8-bit divide instruction, so both operands are first
// zero-extended to 32 bits and the division is performed with the
// unsigned DIVUW instruction. The rewrite is unconditional, so this
// always returns true.
func rewriteValueRISCV64_OpDiv8u(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Div8u x y)
	// result: (DIVUW (ZeroExt8to32 x) (ZeroExt8to32 y))
	for {
		x := v_0
		y := v_1
		v.reset(OpRISCV64DIVUW)
		// Widen each 8-bit operand to an unsigned 32-bit value.
		v0 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32)
		v0.AddArg(x)
		v1 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32)
		v1.AddArg(y)
		v.AddArg2(v0, v1)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpEq16 lowers the generic Eq16 op for RISCV64.
// Generated from _gen/RISCV64.rules; do not hand-edit.
//
// Equality is computed as SEQZ(x - y). Each operand is zero-extended to
// 64 bits before the subtraction (rather than extending the difference
// afterwards), which lets later passes fold the extensions into properly
// typed loads. The rewrite is unconditional, so this always returns true.
func rewriteValueRISCV64_OpEq16(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Eq16 x y)
	// result: (SEQZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y)))
	for {
		x := v_0
		y := v_1
		v.reset(OpRISCV64SEQZ)
		// SUB of the two zero-extended operands; SEQZ then tests the
		// difference against zero.
		v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type)
		v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
		v1.AddArg(x)
		v2 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
		v2.AddArg(y)
		v0.AddArg2(v1, v2)
		v.AddArg(v0)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpEq32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
cmd/compile: change riscv64 Eq32/Neq32 to zero extend before subtraction
As done with other equality tests, zero extend before subtraction rather than
after (or in this case, at the same time). While at face value this appears to
require more instructions, in reality it allows for most sign extensions to
be completely eliminated due to correctly typed loads. Existing optimisations
(such as subtraction of zero) then become more effective.
This removes more than 10,000 instructions from the Go binary and in particular,
a writeBarrier check only requires three instructions (AUIPC, LWU, BNEZ) instead
of the current four (AUIPC, LWU, NEGW, BNEZ).
Change-Id: I7afdc1921c4916ddbd414c3b3f5c2089107ec016
Reviewed-on: https://go-review.googlesource.com/c/go/+/274066
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
2020-11-30 23:47:27 +11:00
|
|
|
typ := &b.Func.Config.Types
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (Eq32 x y)
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
// cond: x.Type.IsSigned()
|
|
|
|
|
// result: (SEQZ (SUB <x.Type> (SignExt32to64 x) (SignExt32to64 y)))
|
|
|
|
|
for {
|
|
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(x.Type.IsSigned()) {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SEQZ)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v1.AddArg(x)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v2.AddArg(y)
|
|
|
|
|
v0.AddArg2(v1, v2)
|
|
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
// match: (Eq32 x y)
|
|
|
|
|
// cond: !x.Type.IsSigned()
|
cmd/compile: change riscv64 Eq32/Neq32 to zero extend before subtraction
As done with other equality tests, zero extend before subtraction rather than
after (or in this case, at the same time). While at face value this appears to
require more instructions, in reality it allows for most sign extensions to
be completely eliminated due to correctly typed loads. Existing optimisations
(such as subtraction of zero) then become more effective.
This removes more than 10,000 instructions from the Go binary and in particular,
a writeBarrier check only requires three instructions (AUIPC, LWU, BNEZ) instead
of the current four (AUIPC, LWU, NEGW, BNEZ).
Change-Id: I7afdc1921c4916ddbd414c3b3f5c2089107ec016
Reviewed-on: https://go-review.googlesource.com/c/go/+/274066
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
2020-11-30 23:47:27 +11:00
|
|
|
// result: (SEQZ (SUB <x.Type> (ZeroExt32to64 x) (ZeroExt32to64 y)))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(!x.Type.IsSigned()) {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SEQZ)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v2.AddArg(y)
|
|
|
|
|
v0.AddArg2(v1, v2)
|
|
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
break
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpEq64 lowers the generic 64-bit equality op to
// RISCV64 machine ops: the operands are subtracted and the result is
// tested for zero with SEQZ. No extension is needed since the operands
// are already full register width.
func rewriteValueRISCV64_OpEq64(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	// match: (Eq64 x y)
	// result: (SEQZ (SUB <x.Type> x y))
	for {
		x := v_0
		y := v_1
		v.reset(OpRISCV64SEQZ)
		v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type)
		v0.AddArg2(x, y)
		v.AddArg(v0)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpEq8 lowers the generic 8-bit equality op.
// Each operand is zero extended to 64 bits before the subtraction so
// that extensions can often be folded into the load that produced the
// value; the difference is then tested for zero with SEQZ.
func rewriteValueRISCV64_OpEq8(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Eq8 x y)
	// result: (SEQZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y)))
	for {
		x := v_0
		y := v_1
		v.reset(OpRISCV64SEQZ)
		v0 := b.NewValue0(v.Pos, OpRISCV64SUB, x.Type)
		v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
		v1.AddArg(x)
		v2 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
		v2.AddArg(y)
		v0.AddArg2(v1, v2)
		v.AddArg(v0)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpEqB lowers boolean equality. Booleans are
// already 0 or 1, so no extension is required: subtract and test the
// result for zero with SEQZ.
func rewriteValueRISCV64_OpEqB(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (EqB x y)
	// result: (SEQZ (SUB <typ.Bool> x y))
	for {
		x := v_0
		y := v_1
		v.reset(OpRISCV64SEQZ)
		v0 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.Bool)
		v0.AddArg2(x, y)
		v.AddArg(v0)
		return true
	}
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpEqPtr(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
2022-02-14 12:43:27 -05:00
|
|
|
typ := &b.Func.Config.Types
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (EqPtr x y)
|
2022-02-14 12:43:27 -05:00
|
|
|
// result: (SEQZ (SUB <typ.Uintptr> x y))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SEQZ)
|
2022-02-14 12:43:27 -05:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.Uintptr)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpHmul32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Hmul32 x y)
|
|
|
|
|
// result: (SRAI [32] (MUL (SignExt32to64 x) (SignExt32to64 y)))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRAI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(32)
|
2019-11-04 04:40:47 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MUL, typ.Int64)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v1.AddArg(x)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v2.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpHmul32u(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Hmul32u x y)
|
|
|
|
|
// result: (SRLI [32] (MUL (ZeroExt32to64 x) (ZeroExt32to64 y)))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2020-01-23 14:28:04 -08:00
|
|
|
v.reset(OpRISCV64SRLI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(32)
|
2020-01-23 14:28:04 -08:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MUL, typ.Int64)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v2.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, v2)
|
2020-01-23 14:28:04 -08:00
|
|
|
v.AddArg(v0)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq16 x y)
|
|
|
|
|
// result: (Not (Less16 y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess16, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq16U(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq16U x y)
|
|
|
|
|
// result: (Not (Less16U y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess16U, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq32 x y)
|
|
|
|
|
// result: (Not (Less32 y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess32, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq32U(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq32U x y)
|
|
|
|
|
// result: (Not (Less32U y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess32U, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq64 x y)
|
|
|
|
|
// result: (Not (Less64 y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess64, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq64U(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq64U x y)
|
|
|
|
|
// result: (Not (Less64U y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess64U, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq8 x y)
|
|
|
|
|
// result: (Not (Less8 y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess8, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLeq8U(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Leq8U x y)
|
|
|
|
|
// result: (Not (Less8U y x))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpLess8U, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLess16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Less16 x y)
|
|
|
|
|
// result: (SLT (SignExt16to64 x) (SignExt16to64 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SLT)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLess16U(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Less16U x y)
|
|
|
|
|
// result: (SLTU (ZeroExt16to64 x) (ZeroExt16to64 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SLTU)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLess32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Less32 x y)
|
|
|
|
|
// result: (SLT (SignExt32to64 x) (SignExt32to64 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SLT)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLess32U(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Less32U x y)
|
|
|
|
|
// result: (SLTU (ZeroExt32to64 x) (ZeroExt32to64 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SLTU)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLess8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Less8 x y)
|
|
|
|
|
// result: (SLT (SignExt8to64 x) (SignExt8to64 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SLT)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLess8U(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Less8U x y)
|
|
|
|
|
// result: (SLTU (ZeroExt8to64 x) (ZeroExt8to64 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SLTU)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLoad(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (Load <t> ptr mem)
|
|
|
|
|
// cond: t.IsBoolean()
|
|
|
|
|
// result: (MOVBUload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
if !(t.IsBoolean()) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBUload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
2023-04-09 08:36:12 -07:00
|
|
|
// cond: ( is8BitInt(t) && t.IsSigned())
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVBload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2023-04-09 08:36:12 -07:00
|
|
|
if !(is8BitInt(t) && t.IsSigned()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
2023-04-09 08:36:12 -07:00
|
|
|
// cond: ( is8BitInt(t) && !t.IsSigned())
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVBUload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2023-04-09 08:36:12 -07:00
|
|
|
if !(is8BitInt(t) && !t.IsSigned()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBUload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
2023-04-09 08:36:12 -07:00
|
|
|
// cond: (is16BitInt(t) && t.IsSigned())
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVHload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2023-04-09 08:36:12 -07:00
|
|
|
if !(is16BitInt(t) && t.IsSigned()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
2023-04-09 08:36:12 -07:00
|
|
|
// cond: (is16BitInt(t) && !t.IsSigned())
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVHUload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2023-04-09 08:36:12 -07:00
|
|
|
if !(is16BitInt(t) && !t.IsSigned()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHUload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
2023-04-09 08:36:12 -07:00
|
|
|
// cond: (is32BitInt(t) && t.IsSigned())
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVWload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2023-04-09 08:36:12 -07:00
|
|
|
if !(is32BitInt(t) && t.IsSigned()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
2023-04-09 08:36:12 -07:00
|
|
|
// cond: (is32BitInt(t) && !t.IsSigned())
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVWUload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2023-04-09 08:36:12 -07:00
|
|
|
if !(is32BitInt(t) && !t.IsSigned()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWUload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
|
|
|
|
// cond: (is64BitInt(t) || isPtr(t))
|
|
|
|
|
// result: (MOVDload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
if !(is64BitInt(t) || isPtr(t)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
|
|
|
|
// cond: is32BitFloat(t)
|
|
|
|
|
// result: (FMOVWload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
if !(is32BitFloat(t)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64FMOVWload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Load <t> ptr mem)
|
|
|
|
|
// cond: is64BitFloat(t)
|
|
|
|
|
// result: (FMOVDload ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
if !(is64BitFloat(t)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64FMOVDload)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(ptr, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLocalAddr(v *Value) bool {
|
2022-11-21 22:22:36 -08:00
|
|
|
v_1 := v.Args[1]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_0 := v.Args[0]
|
2022-11-21 22:22:36 -08:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (LocalAddr <t> {sym} base mem)
|
|
|
|
|
// cond: t.Elem().HasPointers()
|
|
|
|
|
// result: (MOVaddr {sym} (SPanchored base mem))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
base := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Elem().HasPointers()) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVaddr)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSPanchored, typ.Uintptr)
|
|
|
|
|
v0.AddArg2(base, mem)
|
|
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (LocalAddr <t> {sym} base _)
|
|
|
|
|
// cond: !t.Elem().HasPointers()
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVaddr {sym} base)
|
|
|
|
|
for {
|
2022-11-21 22:22:36 -08:00
|
|
|
t := v.Type
|
2020-04-20 18:15:50 -04:00
|
|
|
sym := auxToSym(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
base := v_0
|
2022-11-21 22:22:36 -08:00
|
|
|
if !(!t.Elem().HasPointers()) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVaddr)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.Aux = symToAux(sym)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(base)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-11-21 22:22:36 -08:00
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh16x16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh16x16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh16x16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh16x32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh16x32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh16x32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh16x64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
// match: (Lsh16x64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] y)))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2.AddArg(y)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh16x64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh16x8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh16x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh16x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh32x16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh32x16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh32x16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh32x32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh32x32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh32x32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh32x64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
// match: (Lsh32x64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] y)))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2.AddArg(y)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh32x64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh32x8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh32x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh32x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpLsh64x16 lowers the generic Lsh64x16 op to RISCV64
// machine ops and reports whether a rewrite was applied.
//
// When the shift amount is not provably in range (!shiftIsBounded(v)), the
// result is masked: (SLTIU [64] (ZeroExt16to64 y)) is 1 when y < 64 and 0
// otherwise, Neg64 turns that into an all-ones or all-zeros mask, and the
// AND zeroes the SLL result for out-of-range shift amounts. When
// shiftIsBounded(v) proves y < 64, a bare SLL suffices.
func rewriteValueRISCV64_OpLsh64x16(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Lsh64x16 <t> x y)
	// cond: !shiftIsBounded(v)
	// result: (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
	for {
		t := v.Type
		x := v_0
		y := v_1
		if !(!shiftIsBounded(v)) {
			break
		}
		v.reset(OpRISCV64AND)
		v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
		v0.AddArg2(x, y)
		v1 := b.NewValue0(v.Pos, OpNeg64, t)
		v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
		v2.AuxInt = int64ToAuxInt(64)
		v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
		v3.AddArg(y)
		v2.AddArg(v3)
		v1.AddArg(v2)
		v.AddArg2(v0, v1)
		return true
	}
	// match: (Lsh64x16 x y)
	// cond: shiftIsBounded(v)
	// result: (SLL x y)
	for {
		x := v_0
		y := v_1
		if !(shiftIsBounded(v)) {
			break
		}
		v.reset(OpRISCV64SLL)
		v.AddArg2(x, y)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpLsh64x32 lowers the generic Lsh64x32 op to RISCV64
// machine ops and reports whether a rewrite was applied.
//
// When the shift amount is not provably in range (!shiftIsBounded(v)), the
// result is masked: (SLTIU [64] (ZeroExt32to64 y)) is 1 when y < 64 and 0
// otherwise, Neg64 turns that into an all-ones or all-zeros mask, and the
// AND zeroes the SLL result for out-of-range shift amounts. When
// shiftIsBounded(v) proves y < 64, a bare SLL suffices.
func rewriteValueRISCV64_OpLsh64x32(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (Lsh64x32 <t> x y)
	// cond: !shiftIsBounded(v)
	// result: (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
	for {
		t := v.Type
		x := v_0
		y := v_1
		if !(!shiftIsBounded(v)) {
			break
		}
		v.reset(OpRISCV64AND)
		v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
		v0.AddArg2(x, y)
		v1 := b.NewValue0(v.Pos, OpNeg64, t)
		v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
		v2.AuxInt = int64ToAuxInt(64)
		v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
		v3.AddArg(y)
		v2.AddArg(v3)
		v1.AddArg(v2)
		v.AddArg2(v0, v1)
		return true
	}
	// match: (Lsh64x32 x y)
	// cond: shiftIsBounded(v)
	// result: (SLL x y)
	for {
		x := v_0
		y := v_1
		if !(shiftIsBounded(v)) {
			break
		}
		v.reset(OpRISCV64SLL)
		v.AddArg2(x, y)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpLsh64x64 lowers the generic Lsh64x64 op to RISCV64
// machine ops and reports whether a rewrite was applied.
//
// The 64-bit shift amount needs no zero-extension, so y feeds SLTIU
// directly: (SLTIU [64] y) is 1 when y < 64 and 0 otherwise, Neg64 turns
// that into an all-ones or all-zeros mask, and the AND zeroes the SLL
// result for out-of-range shift amounts. When shiftIsBounded(v) proves
// y < 64, a bare SLL suffices.
func rewriteValueRISCV64_OpLsh64x64(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	// match: (Lsh64x64 <t> x y)
	// cond: !shiftIsBounded(v)
	// result: (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))
	for {
		t := v.Type
		x := v_0
		y := v_1
		if !(!shiftIsBounded(v)) {
			break
		}
		v.reset(OpRISCV64AND)
		v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
		v0.AddArg2(x, y)
		v1 := b.NewValue0(v.Pos, OpNeg64, t)
		v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
		v2.AuxInt = int64ToAuxInt(64)
		v2.AddArg(y)
		v1.AddArg(v2)
		v.AddArg2(v0, v1)
		return true
	}
	// match: (Lsh64x64 x y)
	// cond: shiftIsBounded(v)
	// result: (SLL x y)
	for {
		x := v_0
		y := v_1
		if !(shiftIsBounded(v)) {
			break
		}
		v.reset(OpRISCV64SLL)
		v.AddArg2(x, y)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh64x8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh64x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg64, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh64x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh8x16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh8x16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh8x16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh8x32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh8x32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh8x32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh8x64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
// match: (Lsh8x64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] y)))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2.AddArg(y)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh8x64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpLsh8x8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Lsh8x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SLL <t> x y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Lsh8x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SLL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpMod16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
2020-04-20 18:15:50 -04:00
|
|
|
// match: (Mod16 x y [false])
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (REMW (SignExt16to32 x) (SignExt16to32 y))
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToBool(v.AuxInt) != false {
|
|
|
|
|
break
|
|
|
|
|
}
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64REMW)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to32, typ.Int32)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt16to32, typ.Int32)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpMod16u(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Mod16u x y)
|
|
|
|
|
// result: (REMUW (ZeroExt16to32 x) (ZeroExt16to32 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64REMUW)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt16to32, typ.UInt32)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
func rewriteValueRISCV64_OpMod32(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (Mod32 x y [false])
|
|
|
|
|
// result: (REMW x y)
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToBool(v.AuxInt) != false {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
v.reset(OpRISCV64REMW)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpMod64(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (Mod64 x y [false])
|
|
|
|
|
// result: (REM x y)
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToBool(v.AuxInt) != false {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
v.reset(OpRISCV64REM)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpMod8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Mod8 x y)
|
|
|
|
|
// result: (REMW (SignExt8to32 x) (SignExt8to32 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64REMW)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to32, typ.Int32)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt8to32, typ.Int32)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpMod8u(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Mod8u x y)
|
|
|
|
|
// result: (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64REMUW)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt8to32, typ.UInt32)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpMove(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_2 := v.Args[2]
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
config := b.Func.Config
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Move [0] _ _ mem)
|
|
|
|
|
// result: mem
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 0 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_2
|
2019-10-30 10:29:47 -07:00
|
|
|
v.copyOf(mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [1] dst src mem)
|
|
|
|
|
// result: (MOVBstore dst (MOVBload src mem) mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 1 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v.AddArg3(dst, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Move [2] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVHstore dst (MOVHload src mem) mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 2 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v.AddArg3(dst, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Move [2] dst src mem)
|
|
|
|
|
// result: (MOVBstore [1] dst (MOVBload [1] src mem) (MOVBstore dst (MOVBload src mem) mem))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 2 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, mem)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [4] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%4 == 0
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVWstore dst (MOVWload src mem) mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 4 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(t.Alignment()%4 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVWstore)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v.AddArg3(dst, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Move [4] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
|
|
|
|
// result: (MOVHstore [2] dst (MOVHload [2] src mem) (MOVHstore dst (MOVHload src mem) mem))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 4 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, mem)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [4] dst src mem)
|
|
|
|
|
// result: (MOVBstore [3] dst (MOVBload [3] src mem) (MOVBstore [2] dst (MOVBload [2] src mem) (MOVBstore [1] dst (MOVBload [1] src mem) (MOVBstore dst (MOVBload src mem) mem))))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 4 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(3)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(3)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v3.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v4 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v4.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v4.AddArg2(src, mem)
|
|
|
|
|
v5 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v6 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v6.AddArg2(src, mem)
|
|
|
|
|
v5.AddArg3(dst, v6, mem)
|
|
|
|
|
v3.AddArg3(dst, v4, v5)
|
|
|
|
|
v1.AddArg3(dst, v2, v3)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [8] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVDstore dst (MOVDload src mem) mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 8 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v.AddArg3(dst, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Move [8] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%4 == 0
|
|
|
|
|
// result: (MOVWstore [4] dst (MOVWload [4] src mem) (MOVWstore dst (MOVWload src mem) mem))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 8 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%4 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, mem)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [8] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
|
|
|
|
// result: (MOVHstore [6] dst (MOVHload [6] src mem) (MOVHstore [4] dst (MOVHload [4] src mem) (MOVHstore [2] dst (MOVHload [2] src mem) (MOVHstore dst (MOVHload src mem) mem))))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 8 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(6)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(6)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v3.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v4 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v4.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v4.AddArg2(src, mem)
|
|
|
|
|
v5 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v6 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v6.AddArg2(src, mem)
|
|
|
|
|
v5.AddArg3(dst, v6, mem)
|
|
|
|
|
v3.AddArg3(dst, v4, v5)
|
|
|
|
|
v1.AddArg3(dst, v2, v3)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [3] dst src mem)
|
|
|
|
|
// result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVBstore [1] dst (MOVBload [1] src mem) (MOVBstore dst (MOVBload src mem) mem)))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 3 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v4 := b.NewValue0(v.Pos, OpRISCV64MOVBload, typ.Int8)
|
|
|
|
|
v4.AddArg2(src, mem)
|
|
|
|
|
v3.AddArg3(dst, v4, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, v3)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [6] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
|
|
|
|
// result: (MOVHstore [4] dst (MOVHload [4] src mem) (MOVHstore [2] dst (MOVHload [2] src mem) (MOVHstore dst (MOVHload src mem) mem)))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 6 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v4 := b.NewValue0(v.Pos, OpRISCV64MOVHload, typ.Int16)
|
|
|
|
|
v4.AddArg2(src, mem)
|
|
|
|
|
v3.AddArg3(dst, v4, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, v3)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [12] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%4 == 0
|
|
|
|
|
// result: (MOVWstore [8] dst (MOVWload [8] src mem) (MOVWstore [4] dst (MOVWload [4] src mem) (MOVWstore dst (MOVWload src mem) mem)))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 12 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%4 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
|
|
|
|
|
v4 := b.NewValue0(v.Pos, OpRISCV64MOVWload, typ.Int32)
|
|
|
|
|
v4.AddArg2(src, mem)
|
|
|
|
|
v3.AddArg3(dst, v4, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, v3)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [16] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
|
|
|
|
// result: (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 16 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, mem)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [24] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
|
|
|
|
// result: (MOVDstore [16] dst (MOVDload [16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem)))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 24 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(16)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(16)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v4.AddArg2(src, mem)
|
|
|
|
|
v3.AddArg3(dst, v4, mem)
|
|
|
|
|
v1.AddArg3(dst, v2, v3)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [32] {t} dst src mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
|
|
|
|
// result: (MOVDstore [24] dst (MOVDload [24] src mem) (MOVDstore [16] dst (MOVDload [16] src mem) (MOVDstore [8] dst (MOVDload [8] src mem) (MOVDstore dst (MOVDload src mem) mem))))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 32 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
|
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(24)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(24)
|
|
|
|
|
v0.AddArg2(src, mem)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(16)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(16)
|
|
|
|
|
v2.AddArg2(src, mem)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v3.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v4 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v4.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v4.AddArg2(src, mem)
|
|
|
|
|
v5 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v6 := b.NewValue0(v.Pos, OpRISCV64MOVDload, typ.Int64)
|
|
|
|
|
v6.AddArg2(src, mem)
|
|
|
|
|
v5.AddArg3(dst, v6, mem)
|
|
|
|
|
v3.AddArg3(dst, v4, v5)
|
|
|
|
|
v1.AddArg3(dst, v2, v3)
|
|
|
|
|
v.AddArg3(dst, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (Move [s] {t} dst src mem)
|
2020-10-29 01:10:49 +01:00
|
|
|
// cond: s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)
|
2020-06-14 00:06:24 +02:00
|
|
|
// result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
|
|
|
|
|
for {
|
|
|
|
|
s := auxIntToInt64(v.AuxInt)
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice && logLargeCopy(v, s)) {
|
2020-06-14 00:06:24 +02:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64DUFFCOPY)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
|
|
|
|
|
v.AddArg3(dst, src, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Move [s] {t} dst src mem)
|
2020-04-13 01:39:45 +10:00
|
|
|
// cond: (s <= 16 || logLargeCopy(v, s))
|
2020-04-20 18:15:50 -04:00
|
|
|
// result: (LoweredMove [t.Alignment()] dst src (ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src) mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
s := auxIntToInt64(v.AuxInt)
|
|
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
dst := v_0
|
|
|
|
|
src := v_1
|
|
|
|
|
mem := v_2
|
2020-04-13 01:39:45 +10:00
|
|
|
if !(s <= 16 || logLargeCopy(v, s)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64LoweredMove)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(t.Alignment())
|
2019-11-04 04:40:47 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64ADDI, src.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v0.AuxInt = int64ToAuxInt(s - moveSize(t.Alignment(), config))
|
2019-11-04 04:40:47 +11:00
|
|
|
v0.AddArg(src)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg4(dst, src, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-04-13 01:39:45 +10:00
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpMul16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Mul16 x y)
|
|
|
|
|
// result: (MULW (SignExt16to32 x) (SignExt16to32 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MULW)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to32, typ.Int32)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt16to32, typ.Int32)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpMul8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Mul8 x y)
|
|
|
|
|
// result: (MULW (SignExt8to32 x) (SignExt8to32 y))
|
|
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MULW)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to32, typ.Int32)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpSignExt8to32, typ.Int32)
|
|
|
|
|
v1.AddArg(y)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpNeq16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Neq16 x y)
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
// result: (Not (Eq16 x y))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpEq16, typ.Bool)
|
|
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpNeq32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
cmd/compile: change riscv64 Eq32/Neq32 to zero extend before subtraction
As done with other equality tests, zero extend before subtraction rather than
after (or in this case, at the same time). While at face value this appears to
require more instructions, in reality it allows for most sign extensions to
be completely eliminated due to correctly typed loads. Existing optimisations
(such as subtraction of zero) then become more effective.
This removes more than 10,000 instructions from the Go binary and in particular,
a writeBarrier check only requires three instructions (AUIPC, LWU, BNEZ) instead
of the current four (AUIPC, LWU, NEGW, BNEZ).
Change-Id: I7afdc1921c4916ddbd414c3b3f5c2089107ec016
Reviewed-on: https://go-review.googlesource.com/c/go/+/274066
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
2020-11-30 23:47:27 +11:00
|
|
|
typ := &b.Func.Config.Types
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (Neq32 x y)
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
// result: (Not (Eq32 x y))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpEq32, typ.Bool)
|
|
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpNeq64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
typ := &b.Func.Config.Types
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (Neq64 x y)
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
// result: (Not (Eq64 x y))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpEq64, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpNeq8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Neq8 x y)
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
// result: (Not (Eq8 x y))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpEq8, typ.Bool)
|
|
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-08-30 18:59:53 +10:00
|
|
|
func rewriteValueRISCV64_OpNeqB(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (NeqB x y)
|
|
|
|
|
// result: (SNEZ (SUB <typ.Bool> x y))
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
v.reset(OpRISCV64SNEZ)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.Bool)
|
|
|
|
|
v0.AddArg2(x, y)
|
|
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpNeqPtr(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
2022-02-14 12:43:27 -05:00
|
|
|
typ := &b.Func.Config.Types
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (NeqPtr x y)
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
// result: (Not (EqPtr x y))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
cmd/compile: sign or zero extend for 32 bit equality on riscv64
For 32 bit equality (Eq32), rather than always zero extending to 64 bits,
sign extend for signed types and zero extend for unsigned types. This makes
no difference to the equality test (via SUB), however it increases the
likelihood of avoiding unnecessary sign or zero extension simply for the
purpose of equality testing.
While here, replace the Neq* rules with (Not (Eq*)) - this makes no
difference to the generated code (as the intermediates get expanded and
eliminated), however it means that changes to the equality rules also
reflect in the inequality rules.
As an example, the following:
lw t0,956(t0)
slli t0,t0,0x20
srli t0,t0,0x20
li t1,1
bne t1,t0,278fc
Becomes:
lw t0,1024(t0)
li t1,1
bne t1,t0,278b0
Removes almost 1000 instructions from the Go binary on riscv64.
Change-Id: Iac60635f494f6db87faa47752bd1cc16e6b5967f
Reviewed-on: https://go-review.googlesource.com/c/go/+/516595
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2022-10-19 16:43:22 +11:00
|
|
|
v.reset(OpNot)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpEqPtr, typ.Bool)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpOffPtr(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (OffPtr [off] ptr:(SP))
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(off)
|
|
|
|
|
// result: (MOVaddr [int32(off)] ptr)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off := auxIntToInt64(v.AuxInt)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
2020-04-20 18:15:50 -04:00
|
|
|
if ptr.Op != OpSP || !(is32Bit(off)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVaddr)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(int32(off))
|
2020-01-23 14:28:04 -08:00
|
|
|
v.AddArg(ptr)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-01-23 14:28:04 -08:00
|
|
|
// match: (OffPtr [off] ptr)
|
|
|
|
|
// cond: is32Bit(off)
|
|
|
|
|
// result: (ADDI [off] ptr)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off := auxIntToInt64(v.AuxInt)
|
2020-01-23 14:28:04 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
if !(is32Bit(off)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64ADDI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(off)
|
2020-01-23 14:28:04 -08:00
|
|
|
v.AddArg(ptr)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-01-23 14:28:04 -08:00
|
|
|
// match: (OffPtr [off] ptr)
|
|
|
|
|
// result: (ADD (MOVDconst [off]) ptr)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off := auxIntToInt64(v.AuxInt)
|
2020-01-23 14:28:04 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
v.reset(OpRISCV64ADD)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
2020-04-20 18:15:50 -04:00
|
|
|
v0.AuxInt = int64ToAuxInt(off)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, ptr)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpPanicBounds(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_2 := v.Args[2]
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (PanicBounds [kind] x y mem)
|
|
|
|
|
// cond: boundsABI(kind) == 0
|
|
|
|
|
// result: (LoweredPanicBoundsA [kind] x y mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
kind := auxIntToInt64(v.AuxInt)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
mem := v_2
|
2019-11-04 04:40:47 +11:00
|
|
|
if !(boundsABI(kind) == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64LoweredPanicBoundsA)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(kind)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(x, y, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (PanicBounds [kind] x y mem)
|
|
|
|
|
// cond: boundsABI(kind) == 1
|
|
|
|
|
// result: (LoweredPanicBoundsB [kind] x y mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
kind := auxIntToInt64(v.AuxInt)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
mem := v_2
|
2019-11-04 04:40:47 +11:00
|
|
|
if !(boundsABI(kind) == 1) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64LoweredPanicBoundsB)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(kind)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(x, y, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (PanicBounds [kind] x y mem)
|
|
|
|
|
// cond: boundsABI(kind) == 2
|
|
|
|
|
// result: (LoweredPanicBoundsC [kind] x y mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
kind := auxIntToInt64(v.AuxInt)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
mem := v_2
|
2019-11-04 04:40:47 +11:00
|
|
|
if !(boundsABI(kind) == 2) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64LoweredPanicBoundsC)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(kind)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(x, y, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64ADD(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2023-02-15 10:21:52 -08:00
|
|
|
// match: (ADD (MOVDconst <t> [val]) x)
|
|
|
|
|
// cond: is32Bit(val) && !t.IsPtr()
|
2020-03-10 03:31:22 +11:00
|
|
|
// result: (ADDI [val] x)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
cmd/compile: use loops to handle commutative ops in rules
Prior to this change, we generated additional rules at rulegen time
for all possible combinations of args to commutative ops.
This is simple and works well, but leads to lots of generated rules.
This in turn has increased the size of the compiler,
made it hard to compile package ssa on small machines,
and provided a disincentive to mark some ops as commutative.
This change reworks how we handle commutative ops.
Instead of generating a rule per argument permutation,
we generate a series of nested loops, one for each commutative op.
Each loop tries both possible argument orderings.
I also considered attempting to canonicalize the inputs to the
rewrite rules. However, because either or both arguments might be
nothing more than an identifier, and because there can be arbitrary
conditions to evaluate during matching, I did not see how to proceed.
The duplicate rule detection now sorts arguments to commutative ops,
so that it can detect commutative-only duplicates.
There may be further optimizations to the new generated code.
In particular, we may not be removing as many bounds checks as before;
I have not investigated deeply. If more work here is needed,
we could do it with more hints or with improvements to the prove pass.
This change has almost no impact on the generated code.
It does not pass toolstash-check, however. In a handful of functions,
for reasons I do not understand, there are minor position changes.
For the entire series ending at this change,
there is negligible compiler performance impact.
The compiler binary shrinks by about 15%,
and package ssa shrinks by about 25%.
Package ssa also compiles ~25% faster with ~25% less memory.
Change-Id: Ia2ee9ceae7be08a17342319d4e31b0bb238a2ee4
Reviewed-on: https://go-review.googlesource.com/c/go/+/213703
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-06 22:24:02 -08:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2023-02-15 10:21:52 -08:00
|
|
|
t := v_0.Type
|
2020-04-20 18:15:50 -04:00
|
|
|
val := auxIntToInt64(v_0.AuxInt)
|
2020-03-10 03:31:22 +11:00
|
|
|
x := v_1
|
2023-02-15 10:21:52 -08:00
|
|
|
if !(is32Bit(val) && !t.IsPtr()) {
|
cmd/compile: use loops to handle commutative ops in rules
Prior to this change, we generated additional rules at rulegen time
for all possible combinations of args to commutative ops.
This is simple and works well, but leads to lots of generated rules.
This in turn has increased the size of the compiler,
made it hard to compile package ssa on small machines,
and provided a disincentive to mark some ops as commutative.
This change reworks how we handle commutative ops.
Instead of generating a rule per argument permutation,
we generate a series of nested loops, one for each commutative op.
Each loop tries both possible argument orderings.
I also considered attempting to canonicalize the inputs to the
rewrite rules. However, because either or both arguments might be
nothing more than an identifier, and because there can be arbitrary
conditions to evaluate during matching, I did not see how to proceed.
The duplicate rule detection now sorts arguments to commutative ops,
so that it can detect commutative-only duplicates.
There may be further optimizations to the new generated code.
In particular, we may not be removing as many bounds checks as before;
I have not investigated deeply. If more work here is needed,
we could do it with more hints or with improvements to the prove pass.
This change has almost no impact on the generated code.
It does not pass toolstash-check, however. In a handful of functions,
for reasons I do not understand, there are minor position changes.
For the entire series ending at this change,
there is negligible compiler performance impact.
The compiler binary shrinks by about 15%,
and package ssa shrinks by about 25%.
Package ssa also compiles ~25% faster with ~25% less memory.
Change-Id: Ia2ee9ceae7be08a17342319d4e31b0bb238a2ee4
Reviewed-on: https://go-review.googlesource.com/c/go/+/213703
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-06 22:24:02 -08:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64ADDI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(val)
|
2020-03-10 03:31:22 +11:00
|
|
|
v.AddArg(x)
|
cmd/compile: use loops to handle commutative ops in rules
Prior to this change, we generated additional rules at rulegen time
for all possible combinations of args to commutative ops.
This is simple and works well, but leads to lots of generated rules.
This in turn has increased the size of the compiler,
made it hard to compile package ssa on small machines,
and provided a disincentive to mark some ops as commutative.
This change reworks how we handle commutative ops.
Instead of generating a rule per argument permutation,
we generate a series of nested loops, one for each commutative op.
Each loop tries both possible argument orderings.
I also considered attempting to canonicalize the inputs to the
rewrite rules. However, because either or both arguments might be
nothing more than an identifier, and because there can be arbitrary
conditions to evaluate during matching, I did not see how to proceed.
The duplicate rule detection now sorts arguments to commutative ops,
so that it can detect commutative-only duplicates.
There may be further optimizations to the new generated code.
In particular, we may not be removing as many bounds checks as before;
I have not investigated deeply. If more work here is needed,
we could do it with more hints or with improvements to the prove pass.
This change has almost no impact on the generated code.
It does not pass toolstash-check, however. In a handful of functions,
for reasons I do not understand, there are minor position changes.
For the entire series ending at this change,
there is negligible compiler performance impact.
The compiler binary shrinks by about 15%,
and package ssa shrinks by about 25%.
Package ssa also compiles ~25% faster with ~25% less memory.
Change-Id: Ia2ee9ceae7be08a17342319d4e31b0bb238a2ee4
Reviewed-on: https://go-review.googlesource.com/c/go/+/213703
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-06 22:24:02 -08:00
|
|
|
return true
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
cmd/compile: use loops to handle commutative ops in rules
Prior to this change, we generated additional rules at rulegen time
for all possible combinations of args to commutative ops.
This is simple and works well, but leads to lots of generated rules.
This in turn has increased the size of the compiler,
made it hard to compile package ssa on small machines,
and provided a disincentive to mark some ops as commutative.
This change reworks how we handle commutative ops.
Instead of generating a rule per argument permutation,
we generate a series of nested loops, one for each commutative op.
Each loop tries both possible argument orderings.
I also considered attempting to canonicalize the inputs to the
rewrite rules. However, because either or both arguments might be
nothing more than an identifier, and because there can be arbitrary
conditions to evaluate during matching, I did not see how to proceed.
The duplicate rule detection now sorts arguments to commutative ops,
so that it can detect commutative-only duplicates.
There may be further optimizations to the new generated code.
In particular, we may not be removing as many bounds checks as before;
I have not investigated deeply. If more work here is needed,
we could do it with more hints or with improvements to the prove pass.
This change has almost no impact on the generated code.
It does not pass toolstash-check, however. In a handful of functions,
for reasons I do not understand, there are minor position changes.
For the entire series ending at this change,
there is negligible compiler performance impact.
The compiler binary shrinks by about 15%,
and package ssa shrinks by about 25%.
Package ssa also compiles ~25% faster with ~25% less memory.
Change-Id: Ia2ee9ceae7be08a17342319d4e31b0bb238a2ee4
Reviewed-on: https://go-review.googlesource.com/c/go/+/213703
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-06 22:24:02 -08:00
|
|
|
break
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64ADDI(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (ADDI [c] (MOVaddr [d] {s} x))
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(c+int64(d))
|
|
|
|
|
// result: (MOVaddr [int32(c)+d] {s} x)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
c := auxIntToInt64(v.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
d := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
s := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
x := v_0.Args[0]
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(c + int64(d))) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVaddr)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(int32(c) + d)
|
|
|
|
|
v.Aux = symToAux(s)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (ADDI [0] x)
|
|
|
|
|
// result: x
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 0 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
2019-10-30 10:29:47 -07:00
|
|
|
v.copyOf(x)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
// match: (ADDI [x] (MOVDconst [y]))
|
|
|
|
|
// cond: is32Bit(x + y)
|
|
|
|
|
// result: (MOVDconst [x + y])
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
if !(is32Bit(x + y)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(x + y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-04 05:23:07 +10:00
|
|
|
// match: (ADDI [x] (ADDI [y] z))
|
|
|
|
|
// cond: is32Bit(x + y)
|
|
|
|
|
// result: (ADDI [x + y] z)
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
z := v_0.Args[0]
|
|
|
|
|
if !(is32Bit(x + y)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64ADDI)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(x + y)
|
|
|
|
|
v.AddArg(z)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
return false
|
|
|
|
|
}
|
2020-03-10 03:31:22 +11:00
|
|
|
// rewriteValueRISCV64_OpRISCV64AND rewrites an AND whose constant operand
// fits in 32 bits into the immediate form ANDI. Reports whether a rewrite
// was applied.
func rewriteValueRISCV64_OpRISCV64AND(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (AND (MOVDconst [val]) x)
	// cond: is32Bit(val)
	// result: (ANDI [val] x)
	for {
		// AND is commutative: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			if v_0.Op != OpRISCV64MOVDconst {
				continue
			}
			val := auxIntToInt64(v_0.AuxInt)
			x := v_1
			if !(is32Bit(val)) {
				continue
			}
			v.reset(OpRISCV64ANDI)
			v.AuxInt = int64ToAuxInt(val)
			v.AddArg(x)
			return true
		}
		break
	}
	return false
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
// rewriteValueRISCV64_OpRISCV64ANDI constant-folds AND-immediate values:
// an all-zeroes mask yields constant zero, an all-ones mask is the
// identity, and the immediate is merged with a constant operand or a
// chained ANDI. Reports whether a rewrite was applied.
func rewriteValueRISCV64_OpRISCV64ANDI(v *Value) bool {
	v_0 := v.Args[0]
	// match: (ANDI [0] x)
	// result: (MOVDconst [0])
	for {
		if auxIntToInt64(v.AuxInt) != 0 {
			break
		}
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(0)
		return true
	}
	// match: (ANDI [-1] x)
	// result: x
	for {
		if auxIntToInt64(v.AuxInt) != -1 {
			break
		}
		x := v_0
		v.copyOf(x)
		return true
	}
	// match: (ANDI [x] (MOVDconst [y]))
	// result: (MOVDconst [x & y])
	for {
		x := auxIntToInt64(v.AuxInt)
		if v_0.Op != OpRISCV64MOVDconst {
			break
		}
		y := auxIntToInt64(v_0.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(x & y)
		return true
	}
	// match: (ANDI [x] (ANDI [y] z))
	// result: (ANDI [x & y] z)
	for {
		x := auxIntToInt64(v.AuxInt)
		if v_0.Op != OpRISCV64ANDI {
			break
		}
		y := auxIntToInt64(v_0.AuxInt)
		z := v_0.Args[0]
		v.reset(OpRISCV64ANDI)
		v.AuxInt = int64ToAuxInt(x & y)
		v.AddArg(z)
		return true
	}
	return false
}
|
2023-06-26 20:46:49 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FADDD rewrites a double-precision add fed
// by an FMULD into a fused multiply-add (FMADDD), when the function allows
// FMA fusion (useFMA). Reports whether a rewrite was applied.
func rewriteValueRISCV64_OpRISCV64FADDD(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FADDD a (FMULD x y))
	// cond: a.Block.Func.useFMA(v)
	// result: (FMADDD x y a)
	for {
		// FADDD is commutative: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			a := v_0
			if v_1.Op != OpRISCV64FMULD {
				continue
			}
			y := v_1.Args[1]
			x := v_1.Args[0]
			if !(a.Block.Func.useFMA(v)) {
				continue
			}
			v.reset(OpRISCV64FMADDD)
			v.AddArg3(x, y, a)
			return true
		}
		break
	}
	return false
}
|
2023-06-28 16:45:07 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FADDS rewrites a single-precision add fed
// by an FMULS into a fused multiply-add (FMADDS), when the function allows
// FMA fusion (useFMA). Reports whether a rewrite was applied.
func rewriteValueRISCV64_OpRISCV64FADDS(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FADDS a (FMULS x y))
	// cond: a.Block.Func.useFMA(v)
	// result: (FMADDS x y a)
	for {
		// FADDS is commutative: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			a := v_0
			if v_1.Op != OpRISCV64FMULS {
				continue
			}
			y := v_1.Args[1]
			x := v_1.Args[0]
			if !(a.Block.Func.useFMA(v)) {
				continue
			}
			v.reset(OpRISCV64FMADDS)
			v.AddArg3(x, y, a)
			return true
		}
		break
	}
	return false
}
|
2021-02-17 15:00:34 +00:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FMADDD folds a single-use FNEGD operand of
// a double-precision FMADDD (x*y + z) into the corresponding negating FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FMADDD(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FMADDD neg:(FNEGD x) y z)
	// cond: neg.Uses == 1
	// result: (FNMSUBD x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGD {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGD has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FNMSUBD)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FMADDD x y neg:(FNEGD z))
	// cond: neg.Uses == 1
	// result: (FMSUBD x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGD {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FMSUBD)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2023-06-28 16:45:07 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FMADDS folds a single-use FNEGS operand of
// a single-precision FMADDS (x*y + z) into the corresponding negating FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FMADDS(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FMADDS neg:(FNEGS x) y z)
	// cond: neg.Uses == 1
	// result: (FNMSUBS x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGS {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGS has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FNMSUBS)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FMADDS x y neg:(FNEGS z))
	// cond: neg.Uses == 1
	// result: (FMSUBS x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGS {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FMSUBS)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2021-02-17 15:00:34 +00:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FMSUBD folds a single-use FNEGD operand of
// a double-precision FMSUBD (x*y - z) into the corresponding negating FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FMSUBD(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FMSUBD neg:(FNEGD x) y z)
	// cond: neg.Uses == 1
	// result: (FNMADDD x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGD {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGD has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FNMADDD)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FMSUBD x y neg:(FNEGD z))
	// cond: neg.Uses == 1
	// result: (FMADDD x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGD {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FMADDD)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2023-06-28 16:45:07 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FMSUBS folds a single-use FNEGS operand of
// a single-precision FMSUBS (x*y - z) into the corresponding negating FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FMSUBS(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FMSUBS neg:(FNEGS x) y z)
	// cond: neg.Uses == 1
	// result: (FNMADDS x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGS {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGS has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FNMADDS)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FMSUBS x y neg:(FNEGS z))
	// cond: neg.Uses == 1
	// result: (FMADDS x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGS {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FMADDS)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2021-02-17 15:00:34 +00:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FNMADDD folds a single-use FNEGD operand of
// a double-precision FNMADDD (-(x*y) - z) into the corresponding FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FNMADDD(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FNMADDD neg:(FNEGD x) y z)
	// cond: neg.Uses == 1
	// result: (FMSUBD x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGD {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGD has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FMSUBD)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FNMADDD x y neg:(FNEGD z))
	// cond: neg.Uses == 1
	// result: (FNMSUBD x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGD {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FNMSUBD)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2023-06-28 16:45:07 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FNMADDS folds a single-use FNEGS operand of
// a single-precision FNMADDS (-(x*y) - z) into the corresponding FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FNMADDS(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FNMADDS neg:(FNEGS x) y z)
	// cond: neg.Uses == 1
	// result: (FMSUBS x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGS {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGS has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FMSUBS)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FNMADDS x y neg:(FNEGS z))
	// cond: neg.Uses == 1
	// result: (FNMSUBS x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGS {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FNMSUBS)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2021-02-17 15:00:34 +00:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FNMSUBD folds a single-use FNEGD operand of
// a double-precision FNMSUBD (-(x*y) + z) into the corresponding FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FNMSUBD(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FNMSUBD neg:(FNEGD x) y z)
	// cond: neg.Uses == 1
	// result: (FMADDD x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGD {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGD has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FMADDD)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FNMSUBD x y neg:(FNEGD z))
	// cond: neg.Uses == 1
	// result: (FNMADDD x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGD {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FNMADDD)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2023-06-28 16:45:07 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FNMSUBS folds a single-use FNEGS operand of
// a single-precision FNMSUBS (-(x*y) + z) into the corresponding FMA
// opcode, eliminating the separate negation. Reports whether a rewrite was
// applied.
func rewriteValueRISCV64_OpRISCV64FNMSUBS(v *Value) bool {
	v_2 := v.Args[2]
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FNMSUBS neg:(FNEGS x) y z)
	// cond: neg.Uses == 1
	// result: (FMADDS x y z)
	for {
		// The multiplicands commute: try (v_0, v_1) in both orders.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			neg := v_0
			if neg.Op != OpRISCV64FNEGS {
				continue
			}
			x := neg.Args[0]
			y := v_1
			z := v_2
			// Only fold when this FNEGS has no other users.
			if !(neg.Uses == 1) {
				continue
			}
			v.reset(OpRISCV64FMADDS)
			v.AddArg3(x, y, z)
			return true
		}
		break
	}
	// match: (FNMSUBS x y neg:(FNEGS z))
	// cond: neg.Uses == 1
	// result: (FNMADDS x y z)
	for {
		x := v_0
		y := v_1
		neg := v_2
		if neg.Op != OpRISCV64FNEGS {
			break
		}
		z := neg.Args[0]
		if !(neg.Uses == 1) {
			break
		}
		v.reset(OpRISCV64FNMADDS)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
|
2023-06-26 20:46:49 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FSUBD fuses an FMULD operand of a
// double-precision subtract into an FMA opcode when the function allows
// FMA fusion (useFMA): a - x*y becomes FNMSUBD, and x*y - a becomes
// FMSUBD. Reports whether a rewrite was applied.
func rewriteValueRISCV64_OpRISCV64FSUBD(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FSUBD a (FMULD x y))
	// cond: a.Block.Func.useFMA(v)
	// result: (FNMSUBD x y a)
	for {
		a := v_0
		if v_1.Op != OpRISCV64FMULD {
			break
		}
		y := v_1.Args[1]
		x := v_1.Args[0]
		if !(a.Block.Func.useFMA(v)) {
			break
		}
		v.reset(OpRISCV64FNMSUBD)
		v.AddArg3(x, y, a)
		return true
	}
	// match: (FSUBD (FMULD x y) a)
	// cond: a.Block.Func.useFMA(v)
	// result: (FMSUBD x y a)
	for {
		if v_0.Op != OpRISCV64FMULD {
			break
		}
		y := v_0.Args[1]
		x := v_0.Args[0]
		a := v_1
		if !(a.Block.Func.useFMA(v)) {
			break
		}
		v.reset(OpRISCV64FMSUBD)
		v.AddArg3(x, y, a)
		return true
	}
	return false
}
|
2023-06-28 16:45:07 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64FSUBS fuses an FMULS operand of a
// single-precision subtract into an FMA opcode when the function allows
// FMA fusion (useFMA): a - x*y becomes FNMSUBS, and x*y - a becomes
// FMSUBS. Reports whether a rewrite was applied.
func rewriteValueRISCV64_OpRISCV64FSUBS(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (FSUBS a (FMULS x y))
	// cond: a.Block.Func.useFMA(v)
	// result: (FNMSUBS x y a)
	for {
		a := v_0
		if v_1.Op != OpRISCV64FMULS {
			break
		}
		y := v_1.Args[1]
		x := v_1.Args[0]
		if !(a.Block.Func.useFMA(v)) {
			break
		}
		v.reset(OpRISCV64FNMSUBS)
		v.AddArg3(x, y, a)
		return true
	}
	// match: (FSUBS (FMULS x y) a)
	// cond: a.Block.Func.useFMA(v)
	// result: (FMSUBS x y a)
	for {
		if v_0.Op != OpRISCV64FMULS {
			break
		}
		y := v_0.Args[1]
		x := v_0.Args[0]
		a := v_1
		if !(a.Block.Func.useFMA(v)) {
			break
		}
		v.reset(OpRISCV64FMSUBS)
		v.AddArg3(x, y, a)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVBUload(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVBUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVBUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBUload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBUload [off1] {sym} (ADDI [off2] base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVBUload [off1+int32(off2)] {sym} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBUload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// rewriteValueRISCV64_OpRISCV64MOVBUreg applies the rewrite rules for
// MOVBUreg (zero-extend the low byte of a register to 64 bits).
//
// The rules fall into three groups, tried in order:
//   - Drop the extension entirely (copyOf) when the argument already fits in
//     8 unsigned bits: float/integer comparison results and ANDI with a small
//     non-negative mask.
//   - Fold the extension into the argument: narrow a negative ANDI mask to
//     its low byte, or evaluate the extension of a MOVDconst at compile time.
//   - Replace the extension with MOVDreg (a plain register move) when the
//     argument is already a zero-extended byte (MOVBUload, 8-bit atomic
//     results, a prior MOVBUreg), or turn a signed byte load with a single
//     use directly into an unsigned byte load in the load's own block.
//
// NOTE(review): this function is generated from _gen/RISCV64.rules
// (see the DO NOT EDIT header); change the rules file, not this code.
func rewriteValueRISCV64_OpRISCV64MOVBUreg(v *Value) bool {
	v_0 := v.Args[0]
	b := v.Block
	// match: (MOVBUreg x:(FLES _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FLES {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(FLTS _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FLTS {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(FEQS _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FEQS {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(FNES _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FNES {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(FLED _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FLED {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(FLTD _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FLTD {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(FEQD _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FEQD {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(FNED _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64FNED {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(SEQZ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64SEQZ {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(SNEZ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64SNEZ {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(SLT _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64SLT {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(SLTU _ _))
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64SLTU {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg x:(ANDI [c] y))
	// cond: c >= 0 && int64(uint8(c)) == c
	// result: x
	for {
		x := v_0
		if x.Op != OpRISCV64ANDI {
			break
		}
		c := auxIntToInt64(x.AuxInt)
		// Mask already confines the value to the low byte, so the
		// zero-extension is redundant.
		if !(c >= 0 && int64(uint8(c)) == c) {
			break
		}
		v.copyOf(x)
		return true
	}
	// match: (MOVBUreg (ANDI [c] x))
	// cond: c < 0
	// result: (ANDI [int64(uint8(c))] x)
	for {
		if v_0.Op != OpRISCV64ANDI {
			break
		}
		c := auxIntToInt64(v_0.AuxInt)
		x := v_0.Args[0]
		if !(c < 0) {
			break
		}
		// Narrow the mask to its low byte and drop the extension.
		v.reset(OpRISCV64ANDI)
		v.AuxInt = int64ToAuxInt(int64(uint8(c)))
		v.AddArg(x)
		return true
	}
	// match: (MOVBUreg (MOVDconst [c]))
	// result: (MOVDconst [int64(uint8(c))])
	for {
		if v_0.Op != OpRISCV64MOVDconst {
			break
		}
		c := auxIntToInt64(v_0.AuxInt)
		// Constant-fold the zero-extension.
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(int64(uint8(c)))
		return true
	}
	// match: (MOVBUreg x:(MOVBUload _ _))
	// result: (MOVDreg x)
	for {
		x := v_0
		if x.Op != OpRISCV64MOVBUload {
			break
		}
		v.reset(OpRISCV64MOVDreg)
		v.AddArg(x)
		return true
	}
	// match: (MOVBUreg x:(Select0 (LoweredAtomicLoad8 _ _)))
	// result: (MOVDreg x)
	for {
		x := v_0
		if x.Op != OpSelect0 {
			break
		}
		x_0 := x.Args[0]
		if x_0.Op != OpRISCV64LoweredAtomicLoad8 {
			break
		}
		v.reset(OpRISCV64MOVDreg)
		v.AddArg(x)
		return true
	}
	// match: (MOVBUreg x:(Select0 (LoweredAtomicCas32 _ _ _ _)))
	// result: (MOVDreg x)
	for {
		x := v_0
		if x.Op != OpSelect0 {
			break
		}
		x_0 := x.Args[0]
		if x_0.Op != OpRISCV64LoweredAtomicCas32 {
			break
		}
		v.reset(OpRISCV64MOVDreg)
		v.AddArg(x)
		return true
	}
	// match: (MOVBUreg x:(Select0 (LoweredAtomicCas64 _ _ _ _)))
	// result: (MOVDreg x)
	for {
		x := v_0
		if x.Op != OpSelect0 {
			break
		}
		x_0 := x.Args[0]
		if x_0.Op != OpRISCV64LoweredAtomicCas64 {
			break
		}
		v.reset(OpRISCV64MOVDreg)
		v.AddArg(x)
		return true
	}
	// match: (MOVBUreg x:(MOVBUreg _))
	// result: (MOVDreg x)
	for {
		x := v_0
		if x.Op != OpRISCV64MOVBUreg {
			break
		}
		v.reset(OpRISCV64MOVDreg)
		v.AddArg(x)
		return true
	}
	// match: (MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem))
	// cond: x.Uses == 1 && clobber(x)
	// result: @x.Block (MOVBUload <t> [off] {sym} ptr mem)
	for {
		t := v.Type
		x := v_0
		if x.Op != OpRISCV64MOVBload {
			break
		}
		off := auxIntToInt32(x.AuxInt)
		sym := auxToSym(x.Aux)
		mem := x.Args[1]
		ptr := x.Args[0]
		// Only safe when this is the load's sole use; clobber marks the
		// old load for removal.
		if !(x.Uses == 1 && clobber(x)) {
			break
		}
		// The replacement load is created in the original load's block
		// (the "@x.Block" form), not in v's block.
		b = x.Block
		v0 := b.NewValue0(x.Pos, OpRISCV64MOVBUload, t)
		v.copyOf(v0)
		v0.AuxInt = int32ToAuxInt(off)
		v0.Aux = symToAux(sym)
		v0.AddArg2(ptr, mem)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64MOVBload applies the rewrite rules for
// MOVBload (sign-extending byte load). Both rules fold an address
// computation in the load's address operand into the load's own
// offset/symbol, provided the combined offset still fits in 32 bits:
//   - MOVaddr: merge its offset and symbol into the load (requires the two
//     symbols to be mergeable).
//   - ADDI: absorb the immediate addend into the load offset.
//
// Reports whether a rewrite was applied.
//
// NOTE(review): this function is generated from _gen/RISCV64.rules
// (see the DO NOT EDIT header); change the rules file, not this code.
func rewriteValueRISCV64_OpRISCV64MOVBload(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (MOVBload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem)
	// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
	// result: (MOVBload [off1+off2] {mergeSym(sym1,sym2)} base mem)
	for {
		off1 := auxIntToInt32(v.AuxInt)
		sym1 := auxToSym(v.Aux)
		if v_0.Op != OpRISCV64MOVaddr {
			break
		}
		off2 := auxIntToInt32(v_0.AuxInt)
		sym2 := auxToSym(v_0.Aux)
		base := v_0.Args[0]
		mem := v_1
		// The sum is computed in int64 to detect 32-bit overflow before
		// storing it back into the int32 AuxInt.
		if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
			break
		}
		v.reset(OpRISCV64MOVBload)
		v.AuxInt = int32ToAuxInt(off1 + off2)
		v.Aux = symToAux(mergeSym(sym1, sym2))
		v.AddArg2(base, mem)
		return true
	}
	// match: (MOVBload [off1] {sym} (ADDI [off2] base) mem)
	// cond: is32Bit(int64(off1)+off2)
	// result: (MOVBload [off1+int32(off2)] {sym} base mem)
	for {
		off1 := auxIntToInt32(v.AuxInt)
		sym := auxToSym(v.Aux)
		if v_0.Op != OpRISCV64ADDI {
			break
		}
		// ADDI carries a 64-bit immediate; it is only folded when the
		// combined offset fits in 32 bits.
		off2 := auxIntToInt64(v_0.AuxInt)
		base := v_0.Args[0]
		mem := v_1
		if !(is32Bit(int64(off1) + off2)) {
			break
		}
		v.reset(OpRISCV64MOVBload)
		v.AuxInt = int32ToAuxInt(off1 + int32(off2))
		v.Aux = symToAux(sym)
		v.AddArg2(base, mem)
		return true
	}
	return false
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVBreg(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
b := v.Block
|
2022-08-28 02:29:12 +10:00
|
|
|
// match: (MOVBreg x:(ANDI [c] y))
|
|
|
|
|
// cond: c >= 0 && int64(int8(c)) == c
|
|
|
|
|
// result: x
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(x.AuxInt)
|
|
|
|
|
if !(c >= 0 && int64(int8(c)) == c) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.copyOf(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVBreg (MOVDconst [c]))
|
2021-03-21 00:58:18 +11:00
|
|
|
// result: (MOVDconst [int64(int8(c))])
|
2020-10-26 18:35:40 +11:00
|
|
|
for {
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
2020-10-26 18:35:40 +11:00
|
|
|
break
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
2020-10-26 18:35:40 +11:00
|
|
|
v.reset(OpRISCV64MOVDconst)
|
2021-03-21 00:58:18 +11:00
|
|
|
v.AuxInt = int64ToAuxInt(int64(int8(c)))
|
2020-10-26 18:35:40 +11:00
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVBreg x:(MOVBload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBreg x:(MOVBreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem))
|
|
|
|
|
// cond: x.Uses == 1 && clobber(x)
|
|
|
|
|
// result: @x.Block (MOVBload <t> [off] {sym} ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
off := auxIntToInt32(x.AuxInt)
|
|
|
|
|
sym := auxToSym(x.Aux)
|
|
|
|
|
mem := x.Args[1]
|
|
|
|
|
ptr := x.Args[0]
|
|
|
|
|
if !(x.Uses == 1 && clobber(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b = x.Block
|
|
|
|
|
v0 := b.NewValue0(x.Pos, OpRISCV64MOVBload, t)
|
|
|
|
|
v.copyOf(v0)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v0.Aux = symToAux(sym)
|
|
|
|
|
v0.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVBstore(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_2 := v.Args[2]
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVBstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBstore [off1] {sym} (ADDI [off2] base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVBstore [off1+int32(off2)] {sym} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVBstore [off] {sym} ptr (MOVDconst [0]) mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
// result: (MOVBstorezero [off] {sym} ptr mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_1.Op != OpRISCV64MOVDconst || auxIntToInt64(v_1.AuxInt) != 0 {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVBstore [off] {sym} ptr (MOVBreg x) mem)
|
|
|
|
|
// result: (MOVBstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVBreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBstore [off] {sym} ptr (MOVHreg x) mem)
|
|
|
|
|
// result: (MOVBstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVHreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBstore [off] {sym} ptr (MOVWreg x) mem)
|
|
|
|
|
// result: (MOVBstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVWreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBstore [off] {sym} ptr (MOVBUreg x) mem)
|
|
|
|
|
// result: (MOVBstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVBUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBstore [off] {sym} ptr (MOVHUreg x) mem)
|
|
|
|
|
// result: (MOVBstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVHUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBstore [off] {sym} ptr (MOVWUreg x) mem)
|
|
|
|
|
// result: (MOVBstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVWUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-02 04:26:54 +11:00
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVBstorezero(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (MOVBstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVBstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0.Args[0]
|
|
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVBstorezero [off1] {sym} (ADDI [off2] ptr) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVBstorezero [off1+int32(off2)] {sym} ptr mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0.Args[0]
|
|
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVDload(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVDload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVDload [off1] {sym} (ADDI [off2] base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVDload [off1+int32(off2)] {sym} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-12-09 14:59:40 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVDnop(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (MOVDnop (MOVDconst [c]))
|
|
|
|
|
// result: (MOVDconst [c])
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(c)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVDreg(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (MOVDreg x)
|
|
|
|
|
// cond: x.Uses == 1
|
|
|
|
|
// result: (MOVDnop x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if !(x.Uses == 1) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDnop)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVDstore(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_2 := v.Args[2]
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVDstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVDstore [off1] {sym} (ADDI [off2] base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVDstore [off1+int32(off2)] {sym} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-03-02 04:26:54 +11:00
|
|
|
// match: (MOVDstore [off] {sym} ptr (MOVDconst [0]) mem)
|
|
|
|
|
// result: (MOVDstorezero [off] {sym} ptr mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0
|
2020-04-20 18:15:50 -04:00
|
|
|
if v_1.Op != OpRISCV64MOVDconst || auxIntToInt64(v_1.AuxInt) != 0 {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVDstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVDstorezero(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (MOVDstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0.Args[0]
|
|
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVDstorezero [off1] {sym} (ADDI [off2] ptr) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVDstorezero [off1+int32(off2)] {sym} ptr mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0.Args[0]
|
|
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVHUload(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVHUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVHUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHUload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHUload [off1] {sym} (ADDI [off2] base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVHUload [off1+int32(off2)] {sym} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHUload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVHUreg(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
b := v.Block
|
2022-08-28 02:29:12 +10:00
|
|
|
// match: (MOVHUreg x:(ANDI [c] y))
|
|
|
|
|
// cond: c >= 0 && int64(uint16(c)) == c
|
|
|
|
|
// result: x
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(x.AuxInt)
|
|
|
|
|
if !(c >= 0 && int64(uint16(c)) == c) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.copyOf(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-04 05:26:54 +10:00
|
|
|
// match: (MOVHUreg (ANDI [c] x))
|
|
|
|
|
// cond: c < 0
|
|
|
|
|
// result: (ANDI [int64(uint16(c))] x)
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
if !(c < 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64ANDI)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(int64(uint16(c)))
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVHUreg (MOVDconst [c]))
|
2020-10-26 18:35:40 +11:00
|
|
|
// result: (MOVDconst [int64(uint16(c))])
|
|
|
|
|
for {
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
2020-10-26 18:35:40 +11:00
|
|
|
break
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
2020-10-26 18:35:40 +11:00
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(int64(uint16(c)))
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVHUreg x:(MOVBUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHUreg x:(MOVHUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHUreg x:(MOVBUreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHUreg x:(MOVHUreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem))
|
|
|
|
|
// cond: x.Uses == 1 && clobber(x)
|
|
|
|
|
// result: @x.Block (MOVHUload <t> [off] {sym} ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
off := auxIntToInt32(x.AuxInt)
|
|
|
|
|
sym := auxToSym(x.Aux)
|
|
|
|
|
mem := x.Args[1]
|
|
|
|
|
ptr := x.Args[0]
|
|
|
|
|
if !(x.Uses == 1 && clobber(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b = x.Block
|
|
|
|
|
v0 := b.NewValue0(x.Pos, OpRISCV64MOVHUload, t)
|
|
|
|
|
v.copyOf(v0)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v0.Aux = symToAux(sym)
|
|
|
|
|
v0.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVHload(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVHload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVHload [off1+off2] {mergeSym(sym1,sym2)} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHload [off1] {sym} (ADDI [off2] base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVHload [off1+int32(off2)] {sym} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVHreg(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
b := v.Block
|
2022-08-28 02:29:12 +10:00
|
|
|
// match: (MOVHreg x:(ANDI [c] y))
|
|
|
|
|
// cond: c >= 0 && int64(int16(c)) == c
|
|
|
|
|
// result: x
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(x.AuxInt)
|
|
|
|
|
if !(c >= 0 && int64(int16(c)) == c) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.copyOf(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVHreg (MOVDconst [c]))
|
2021-03-21 00:58:18 +11:00
|
|
|
// result: (MOVDconst [int64(int16(c))])
|
2020-10-26 18:35:40 +11:00
|
|
|
for {
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
2020-10-26 18:35:40 +11:00
|
|
|
break
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
2020-10-26 18:35:40 +11:00
|
|
|
v.reset(OpRISCV64MOVDconst)
|
2021-03-21 00:58:18 +11:00
|
|
|
v.AuxInt = int64ToAuxInt(int64(int16(c)))
|
2020-10-26 18:35:40 +11:00
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVHreg x:(MOVBload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHreg x:(MOVBUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHreg x:(MOVHload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHreg x:(MOVBreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHreg x:(MOVBUreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHreg x:(MOVHreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem))
|
|
|
|
|
// cond: x.Uses == 1 && clobber(x)
|
|
|
|
|
// result: @x.Block (MOVHload <t> [off] {sym} ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
off := auxIntToInt32(x.AuxInt)
|
|
|
|
|
sym := auxToSym(x.Aux)
|
|
|
|
|
mem := x.Args[1]
|
|
|
|
|
ptr := x.Args[0]
|
|
|
|
|
if !(x.Uses == 1 && clobber(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b = x.Block
|
|
|
|
|
v0 := b.NewValue0(x.Pos, OpRISCV64MOVHload, t)
|
|
|
|
|
v.copyOf(v0)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v0.Aux = symToAux(sym)
|
|
|
|
|
v0.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVHstore(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_2 := v.Args[2]
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVHstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVHstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHstore [off1] {sym} (ADDI [off2] base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVHstore [off1+int32(off2)] {sym} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVHstore [off] {sym} ptr (MOVDconst [0]) mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
// result: (MOVHstorezero [off] {sym} ptr mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_1.Op != OpRISCV64MOVDconst || auxIntToInt64(v_1.AuxInt) != 0 {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVHstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVHstore [off] {sym} ptr (MOVHreg x) mem)
|
|
|
|
|
// result: (MOVHstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVHreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHstore [off] {sym} ptr (MOVWreg x) mem)
|
|
|
|
|
// result: (MOVHstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVWreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHstore [off] {sym} ptr (MOVHUreg x) mem)
|
|
|
|
|
// result: (MOVHstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVHUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHstore [off] {sym} ptr (MOVWUreg x) mem)
|
|
|
|
|
// result: (MOVHstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVWUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-02 04:26:54 +11:00
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVHstorezero(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (MOVHstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVHstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0.Args[0]
|
|
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVHstorezero [off1] {sym} (ADDI [off2] ptr) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVHstorezero [off1+int32(off2)] {sym} ptr mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0.Args[0]
|
|
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVWUload(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVWUload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVWUload [off1+off2] {mergeSym(sym1,sym2)} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWUload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWUload [off1] {sym} (ADDI [off2] base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVWUload [off1+int32(off2)] {sym} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWUload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVWUreg(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
b := v.Block
|
2022-09-04 05:26:54 +10:00
|
|
|
typ := &b.Func.Config.Types
|
2022-08-28 02:29:12 +10:00
|
|
|
// match: (MOVWUreg x:(ANDI [c] y))
|
|
|
|
|
// cond: c >= 0 && int64(uint32(c)) == c
|
|
|
|
|
// result: x
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(x.AuxInt)
|
|
|
|
|
if !(c >= 0 && int64(uint32(c)) == c) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.copyOf(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-04 05:26:54 +10:00
|
|
|
// match: (MOVWUreg (ANDI [c] x))
|
|
|
|
|
// cond: c < 0
|
|
|
|
|
// result: (AND (MOVDconst [int64(uint32(c))]) x)
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
if !(c < 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(int64(uint32(c)))
|
|
|
|
|
v.AddArg2(v0, x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVWUreg (MOVDconst [c]))
|
2020-10-26 18:35:40 +11:00
|
|
|
// result: (MOVDconst [int64(uint32(c))])
|
|
|
|
|
for {
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
2020-10-26 18:35:40 +11:00
|
|
|
break
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
2020-10-26 18:35:40 +11:00
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(int64(uint32(c)))
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVWUreg x:(MOVBUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWUreg x:(MOVHUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWUreg x:(MOVWUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVWUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWUreg x:(MOVBUreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWUreg x:(MOVHUreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWUreg x:(MOVWUreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVWUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWUreg <t> x:(MOVWload [off] {sym} ptr mem))
|
|
|
|
|
// cond: x.Uses == 1 && clobber(x)
|
|
|
|
|
// result: @x.Block (MOVWUload <t> [off] {sym} ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVWload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
off := auxIntToInt32(x.AuxInt)
|
|
|
|
|
sym := auxToSym(x.Aux)
|
|
|
|
|
mem := x.Args[1]
|
|
|
|
|
ptr := x.Args[0]
|
|
|
|
|
if !(x.Uses == 1 && clobber(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b = x.Block
|
|
|
|
|
v0 := b.NewValue0(x.Pos, OpRISCV64MOVWUload, t)
|
|
|
|
|
v.copyOf(v0)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v0.Aux = symToAux(sym)
|
|
|
|
|
v0.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVWload(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVWload [off1] {sym1} (MOVaddr [off2] {sym2} base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVWload [off1+off2] {mergeSym(sym1,sym2)} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWload [off1] {sym} (ADDI [off2] base) mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVWload [off1+int32(off2)] {sym} base mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWload)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(base, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVWreg(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
b := v.Block
|
2022-08-28 02:29:12 +10:00
|
|
|
// match: (MOVWreg x:(ANDI [c] y))
|
|
|
|
|
// cond: c >= 0 && int64(int32(c)) == c
|
|
|
|
|
// result: x
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := auxIntToInt64(x.AuxInt)
|
|
|
|
|
if !(c >= 0 && int64(int32(c)) == c) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.copyOf(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVWreg (MOVDconst [c]))
|
2021-03-21 00:58:18 +11:00
|
|
|
// result: (MOVDconst [int64(int32(c))])
|
2020-10-26 18:35:40 +11:00
|
|
|
for {
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
2020-10-26 18:35:40 +11:00
|
|
|
break
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
c := auxIntToInt64(v_0.AuxInt)
|
2020-10-26 18:35:40 +11:00
|
|
|
v.reset(OpRISCV64MOVDconst)
|
2021-03-21 00:58:18 +11:00
|
|
|
v.AuxInt = int64ToAuxInt(int64(int32(c)))
|
2020-10-26 18:35:40 +11:00
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVWreg x:(MOVBload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MOVBUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MOVHload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MOVHUload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MOVWload _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVWload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-07 17:40:11 +08:00
|
|
|
// match: (MOVWreg x:(ADDIW _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64ADDIW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(SUBW _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64SUBW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(NEGW _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64NEGW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MULW _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MULW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(DIVW _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64DIVW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(DIVUW _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64DIVUW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(REMW _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64REMW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(REMUW _ _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64REMUW {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVWreg x:(MOVBreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MOVBUreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVBUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MOVHreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVHreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg x:(MOVWreg _))
|
|
|
|
|
// result: (MOVDreg x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVWreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDreg)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWreg <t> x:(MOVWUload [off] {sym} ptr mem))
|
|
|
|
|
// cond: x.Uses == 1 && clobber(x)
|
|
|
|
|
// result: @x.Block (MOVWload <t> [off] {sym} ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
|
|
|
|
x := v_0
|
|
|
|
|
if x.Op != OpRISCV64MOVWUload {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
off := auxIntToInt32(x.AuxInt)
|
|
|
|
|
sym := auxToSym(x.Aux)
|
|
|
|
|
mem := x.Args[1]
|
|
|
|
|
ptr := x.Args[0]
|
|
|
|
|
if !(x.Uses == 1 && clobber(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b = x.Block
|
|
|
|
|
v0 := b.NewValue0(x.Pos, OpRISCV64MOVWload, t)
|
|
|
|
|
v.copyOf(v0)
|
|
|
|
|
v0.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v0.Aux = symToAux(sym)
|
|
|
|
|
v0.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64MOVWstore(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_2 := v.Args[2]
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (MOVWstore [off1] {sym1} (MOVaddr [off2] {sym2} base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
|
2020-10-28 10:10:55 +01:00
|
|
|
// result: (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym1 := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64MOVaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt32(v_0.AuxInt)
|
|
|
|
|
sym2 := auxToSym(v_0.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + off2)
|
2020-10-28 10:10:55 +01:00
|
|
|
v.Aux = symToAux(mergeSym(sym1, sym2))
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWstore [off1] {sym} (ADDI [off2] base) val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: is32Bit(int64(off1)+off2)
|
|
|
|
|
// result: (MOVWstore [off1+int32(off2)] {sym} base val mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off1 := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2019-11-04 04:40:47 +11:00
|
|
|
if v_0.Op != OpRISCV64ADDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
off2 := auxIntToInt64(v_0.AuxInt)
|
2019-11-04 04:40:47 +11:00
|
|
|
base := v_0.Args[0]
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(is32Bit(int64(off1) + off2)) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off1 + int32(off2))
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(base, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (MOVWstore [off] {sym} ptr (MOVDconst [0]) mem)
|
2020-03-02 04:26:54 +11:00
|
|
|
// result: (MOVWstorezero [off] {sym} ptr mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
2020-03-02 04:26:54 +11:00
|
|
|
ptr := v_0
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_1.Op != OpRISCV64MOVDconst || auxIntToInt64(v_1.AuxInt) != 0 {
|
2020-03-02 04:26:54 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVWstorezero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
2020-03-02 04:26:54 +11:00
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: eliminate unnecessary sign/zero extension for riscv64
Add additional rules to eliminate unnecessary sign/zero extension for riscv64.
Also where possible, replace an extension following a load with a different typed
load. This removes almost another 8,000 instructions from the go binary.
Of particular note, change Eq16/Eq8/Neq16/Neq8 to zero extend each value before
subtraction, rather than zero extending after subtraction. While this appears to
double the number of zero extensions, it often lets us completely eliminate them
as the load can already be performed in a properly typed manner.
As an example, prior to this change runtime.memequal16 was:
0000000000013028 <runtime.memequal16>:
13028: 00813183 ld gp,8(sp)
1302c: 00019183 lh gp,0(gp)
13030: 01013283 ld t0,16(sp)
13034: 00029283 lh t0,0(t0)
13038: 405181b3 sub gp,gp,t0
1303c: 03019193 slli gp,gp,0x30
13040: 0301d193 srli gp,gp,0x30
13044: 0011b193 seqz gp,gp
13048: 00310c23 sb gp,24(sp)
1304c: 00008067 ret
Whereas it now becomes:
0000000000012fa8 <runtime.memequal16>:
12fa8: 00813183 ld gp,8(sp)
12fac: 0001d183 lhu gp,0(gp)
12fb0: 01013283 ld t0,16(sp)
12fb4: 0002d283 lhu t0,0(t0)
12fb8: 405181b3 sub gp,gp,t0
12fbc: 0011b193 seqz gp,gp
12fc0: 00310c23 sb gp,24(sp)
12fc4: 00008067 ret
Change-Id: I16321feb18381241cab121c0097a126104c56c2c
Reviewed-on: https://go-review.googlesource.com/c/go/+/264659
Trust: Joel Sing <joel@sing.id.au>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2020-10-25 01:34:17 +11:00
|
|
|
// match: (MOVWstore [off] {sym} ptr (MOVWreg x) mem)
|
|
|
|
|
// result: (MOVWstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVWreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (MOVWstore [off] {sym} ptr (MOVWUreg x) mem)
|
|
|
|
|
// result: (MOVWstore [off] {sym} ptr x mem)
|
|
|
|
|
for {
|
|
|
|
|
off := auxIntToInt32(v.AuxInt)
|
|
|
|
|
sym := auxToSym(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVWUreg {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1.Args[0]
|
|
|
|
|
mem := v_2
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(off)
|
|
|
|
|
v.Aux = symToAux(sym)
|
|
|
|
|
v.AddArg3(ptr, x, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-02 04:26:54 +11:00
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
// rewriteValueRISCV64_OpRISCV64MOVWstorezero applies the generated rewrite
// rules for the RISCV64 MOVWstorezero op, folding MOVaddr/ADDI address
// arithmetic into the store's offset. It reports whether v was rewritten.
// Each rule body is a one-iteration for-loop: break abandons the rule and
// falls through to the next candidate.
func rewriteValueRISCV64_OpRISCV64MOVWstorezero(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (MOVWstorezero [off1] {sym1} (MOVaddr [off2] {sym2} ptr) mem)
	// cond: canMergeSym(sym1,sym2) && is32Bit(int64(off1)+int64(off2))
	// result: (MOVWstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
	for {
		off1 := auxIntToInt32(v.AuxInt)
		sym1 := auxToSym(v.Aux)
		if v_0.Op != OpRISCV64MOVaddr {
			break
		}
		off2 := auxIntToInt32(v_0.AuxInt)
		sym2 := auxToSym(v_0.Aux)
		ptr := v_0.Args[0]
		mem := v_1
		// Widen to int64 before adding so the overflow check is exact.
		if !(canMergeSym(sym1, sym2) && is32Bit(int64(off1)+int64(off2))) {
			break
		}
		v.reset(OpRISCV64MOVWstorezero)
		v.AuxInt = int32ToAuxInt(off1 + off2)
		v.Aux = symToAux(mergeSym(sym1, sym2))
		v.AddArg2(ptr, mem)
		return true
	}
	// match: (MOVWstorezero [off1] {sym} (ADDI [off2] ptr) mem)
	// cond: is32Bit(int64(off1)+off2)
	// result: (MOVWstorezero [off1+int32(off2)] {sym} ptr mem)
	for {
		off1 := auxIntToInt32(v.AuxInt)
		sym := auxToSym(v.Aux)
		if v_0.Op != OpRISCV64ADDI {
			break
		}
		off2 := auxIntToInt64(v_0.AuxInt)
		ptr := v_0.Args[0]
		mem := v_1
		if !(is32Bit(int64(off1) + off2)) {
			break
		}
		v.reset(OpRISCV64MOVWstorezero)
		v.AuxInt = int32ToAuxInt(off1 + int32(off2))
		v.Aux = symToAux(sym)
		v.AddArg2(ptr, mem)
		return true
	}
	return false
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
// rewriteValueRISCV64_OpRISCV64NEG applies the generated rewrite rules for
// the RISCV64 NEG op: negation of a subtraction swaps the operands, double
// negation cancels, and negation of a constant is folded. It reports whether
// v was rewritten.
func rewriteValueRISCV64_OpRISCV64NEG(v *Value) bool {
	v_0 := v.Args[0]
	b := v.Block
	// match: (NEG (SUB x y))
	// result: (SUB y x)
	for {
		if v_0.Op != OpRISCV64SUB {
			break
		}
		y := v_0.Args[1]
		x := v_0.Args[0]
		v.reset(OpRISCV64SUB)
		v.AddArg2(y, x)
		return true
	}
	// match: (NEG <t> s:(ADDI [val] (SUB x y)))
	// cond: s.Uses == 1 && is32Bit(-val)
	// result: (ADDI [-val] (SUB <t> y x))
	for {
		t := v.Type
		s := v_0
		if s.Op != OpRISCV64ADDI {
			break
		}
		val := auxIntToInt64(s.AuxInt)
		s_0 := s.Args[0]
		if s_0.Op != OpRISCV64SUB {
			break
		}
		y := s_0.Args[1]
		x := s_0.Args[0]
		// s.Uses == 1 ensures the intermediate ADDI is dead after the
		// rewrite; is32Bit(-val) guards the negated immediate.
		if !(s.Uses == 1 && is32Bit(-val)) {
			break
		}
		v.reset(OpRISCV64ADDI)
		v.AuxInt = int64ToAuxInt(-val)
		v0 := b.NewValue0(v.Pos, OpRISCV64SUB, t)
		v0.AddArg2(y, x)
		v.AddArg(v0)
		return true
	}
	// match: (NEG (NEG x))
	// result: x
	for {
		if v_0.Op != OpRISCV64NEG {
			break
		}
		x := v_0.Args[0]
		v.copyOf(x)
		return true
	}
	// match: (NEG (MOVDconst [x]))
	// result: (MOVDconst [-x])
	for {
		if v_0.Op != OpRISCV64MOVDconst {
			break
		}
		x := auxIntToInt64(v_0.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(-x)
		return true
	}
	return false
}
|
|
|
|
|
// rewriteValueRISCV64_OpRISCV64NEGW applies the generated rewrite rule for
// the RISCV64 NEGW op, folding negation of a constant with 32-bit sign
// extension of the result. It reports whether v was rewritten.
func rewriteValueRISCV64_OpRISCV64NEGW(v *Value) bool {
	v_0 := v.Args[0]
	// match: (NEGW (MOVDconst [x]))
	// result: (MOVDconst [int64(int32(-x))])
	for {
		if v_0.Op != OpRISCV64MOVDconst {
			break
		}
		x := auxIntToInt64(v_0.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		// int32 truncation then widening mirrors NEGW's sign-extended
		// 32-bit result.
		v.AuxInt = int64ToAuxInt(int64(int32(-x)))
		return true
	}
	return false
}
|
2020-03-10 03:31:22 +11:00
|
|
|
// rewriteValueRISCV64_OpRISCV64OR applies the generated rewrite rule for the
// RISCV64 OR op, turning OR with a 32-bit constant into the immediate form
// ORI. It reports whether v was rewritten.
func rewriteValueRISCV64_OpRISCV64OR(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (OR (MOVDconst [val]) x)
	// cond: is32Bit(val)
	// result: (ORI [val] x)
	for {
		// OR is commutative: the loop tries both argument orders by
		// swapping v_0 and v_1 in the post statement.
		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
			if v_0.Op != OpRISCV64MOVDconst {
				continue
			}
			val := auxIntToInt64(v_0.AuxInt)
			x := v_1
			if !(is32Bit(val)) {
				continue
			}
			v.reset(OpRISCV64ORI)
			v.AuxInt = int64ToAuxInt(val)
			v.AddArg(x)
			return true
		}
		break
	}
	return false
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
// rewriteValueRISCV64_OpRISCV64ORI applies the generated rewrite rules for
// the RISCV64 ORI op: OR-immediate with 0 is the identity, with -1 (all
// ones) is constant -1, and constant operands or nested ORIs are folded.
// It reports whether v was rewritten.
func rewriteValueRISCV64_OpRISCV64ORI(v *Value) bool {
	v_0 := v.Args[0]
	// match: (ORI [0] x)
	// result: x
	for {
		if auxIntToInt64(v.AuxInt) != 0 {
			break
		}
		x := v_0
		v.copyOf(x)
		return true
	}
	// match: (ORI [-1] x)
	// result: (MOVDconst [-1])
	for {
		if auxIntToInt64(v.AuxInt) != -1 {
			break
		}
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(-1)
		return true
	}
	// match: (ORI [x] (MOVDconst [y]))
	// result: (MOVDconst [x | y])
	for {
		x := auxIntToInt64(v.AuxInt)
		if v_0.Op != OpRISCV64MOVDconst {
			break
		}
		y := auxIntToInt64(v_0.AuxInt)
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(x | y)
		return true
	}
	// match: (ORI [x] (ORI [y] z))
	// result: (ORI [x | y] z)
	for {
		x := auxIntToInt64(v.AuxInt)
		if v_0.Op != OpRISCV64ORI {
			break
		}
		y := auxIntToInt64(v_0.AuxInt)
		z := v_0.Args[0]
		v.reset(OpRISCV64ORI)
		v.AuxInt = int64ToAuxInt(x | y)
		v.AddArg(z)
		return true
	}
	return false
}
|
2022-08-28 06:08:02 +10:00
|
|
|
// rewriteValueRISCV64_OpRISCV64SEQZ applies the generated rewrite rules for
// the RISCV64 SEQZ (set-if-equal-zero) op: negation under SEQZ is dropped,
// and stacked SEQZ/SNEZ tests collapse to a single instruction. It reports
// whether v was rewritten.
func rewriteValueRISCV64_OpRISCV64SEQZ(v *Value) bool {
	v_0 := v.Args[0]
	// match: (SEQZ (NEG x))
	// result: (SEQZ x)
	for {
		if v_0.Op != OpRISCV64NEG {
			break
		}
		x := v_0.Args[0]
		v.reset(OpRISCV64SEQZ)
		v.AddArg(x)
		return true
	}
	// match: (SEQZ (SEQZ x))
	// result: (SNEZ x)
	for {
		if v_0.Op != OpRISCV64SEQZ {
			break
		}
		x := v_0.Args[0]
		v.reset(OpRISCV64SNEZ)
		v.AddArg(x)
		return true
	}
	// match: (SEQZ (SNEZ x))
	// result: (SEQZ x)
	for {
		if v_0.Op != OpRISCV64SNEZ {
			break
		}
		x := v_0.Args[0]
		v.reset(OpRISCV64SEQZ)
		v.AddArg(x)
		return true
	}
	return false
}
|
2020-03-10 03:31:22 +11:00
|
|
|
// rewriteValueRISCV64_OpRISCV64SLL applies the generated rewrite rule for
// the RISCV64 SLL op, replacing a shift by a constant amount with the
// immediate form SLLI. It reports whether v was rewritten.
func rewriteValueRISCV64_OpRISCV64SLL(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (SLL x (MOVDconst [val]))
	// result: (SLLI [int64(val&63)] x)
	for {
		x := v_0
		if v_1.Op != OpRISCV64MOVDconst {
			break
		}
		val := auxIntToInt64(v_1.AuxInt)
		v.reset(OpRISCV64SLLI)
		// val&63 masks the shift amount to the low six bits used by
		// the 64-bit shift.
		v.AuxInt = int64ToAuxInt(int64(val & 63))
		v.AddArg(x)
		return true
	}
	return false
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
// rewriteValueRISCV64_OpRISCV64SLLI applies the generated rewrite rule for
// the RISCV64 SLLI op, folding a constant shifted left by a constant when
// the result still fits in 32 bits. It reports whether v was rewritten.
func rewriteValueRISCV64_OpRISCV64SLLI(v *Value) bool {
	v_0 := v.Args[0]
	// match: (SLLI [x] (MOVDconst [y]))
	// cond: is32Bit(y << uint32(x))
	// result: (MOVDconst [y << uint32(x)])
	for {
		x := auxIntToInt64(v.AuxInt)
		if v_0.Op != OpRISCV64MOVDconst {
			break
		}
		y := auxIntToInt64(v_0.AuxInt)
		if !(is32Bit(y << uint32(x))) {
			break
		}
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(y << uint32(x))
		return true
	}
	return false
}
|
2022-07-29 14:24:26 +08:00
|
|
|
// rewriteValueRISCV64_OpRISCV64SLT applies the generated rewrite rules for
// the RISCV64 SLT (signed set-less-than) op: comparison against a constant
// in the 12-bit immediate range becomes SLTI, and x < x is the constant 0.
// It reports whether v was rewritten.
func rewriteValueRISCV64_OpRISCV64SLT(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	// match: (SLT x (MOVDconst [val]))
	// cond: val >= -2048 && val <= 2047
	// result: (SLTI [val] x)
	for {
		x := v_0
		if v_1.Op != OpRISCV64MOVDconst {
			break
		}
		val := auxIntToInt64(v_1.AuxInt)
		// [-2048, 2047] is the range of a 12-bit signed immediate.
		if !(val >= -2048 && val <= 2047) {
			break
		}
		v.reset(OpRISCV64SLTI)
		v.AuxInt = int64ToAuxInt(val)
		v.AddArg(x)
		return true
	}
	// match: (SLT x x)
	// result: (MOVDconst [0])
	for {
		x := v_0
		if x != v_1 {
			break
		}
		v.reset(OpRISCV64MOVDconst)
		v.AuxInt = int64ToAuxInt(0)
		return true
	}
	return false
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SLTI(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (SLTI [x] (MOVDconst [y]))
|
|
|
|
|
// result: (MOVDconst [b2i(int64(y) < int64(x))])
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(b2i(int64(y) < int64(x)))
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-01 20:36:34 +10:00
|
|
|
// match: (SLTI [x] (ANDI [y] _))
|
|
|
|
|
// cond: y >= 0 && int64(y) < int64(x)
|
|
|
|
|
// result: (MOVDconst [1])
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
if !(y >= 0 && int64(y) < int64(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (SLTI [x] (ORI [y] _))
|
|
|
|
|
// cond: y >= 0 && int64(y) >= int64(x)
|
|
|
|
|
// result: (MOVDconst [0])
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64ORI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
if !(y >= 0 && int64(y) >= int64(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpRISCV64SLTIU(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (SLTIU [x] (MOVDconst [y]))
|
|
|
|
|
// result: (MOVDconst [b2i(uint64(y) < uint64(x))])
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(b2i(uint64(y) < uint64(x)))
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-01 20:36:34 +10:00
|
|
|
// match: (SLTIU [x] (ANDI [y] _))
|
|
|
|
|
// cond: y >= 0 && uint64(y) < uint64(x)
|
|
|
|
|
// result: (MOVDconst [1])
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64ANDI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
if !(y >= 0 && uint64(y) < uint64(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (SLTIU [x] (ORI [y] _))
|
|
|
|
|
// cond: y >= 0 && uint64(y) >= uint64(x)
|
|
|
|
|
// result: (MOVDconst [0])
|
|
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64ORI {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
if !(y >= 0 && uint64(y) >= uint64(x)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
return false
|
|
|
|
|
}
|
2022-07-29 14:24:26 +08:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SLTU(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2022-09-05 03:03:35 +10:00
|
|
|
// match: (SLTU x (MOVDconst [val]))
|
|
|
|
|
// cond: val >= -2048 && val <= 2047
|
|
|
|
|
// result: (SLTIU [val] x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
val := auxIntToInt64(v_1.AuxInt)
|
|
|
|
|
if !(val >= -2048 && val <= 2047) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SLTIU)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(val)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-07-29 14:24:26 +08:00
|
|
|
// match: (SLTU x x)
|
|
|
|
|
// result: (MOVDconst [0])
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if x != v_1 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2022-08-28 06:08:02 +10:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SNEZ(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (SNEZ (NEG x))
|
|
|
|
|
// result: (SNEZ x)
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpRISCV64NEG {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
v.reset(OpRISCV64SNEZ)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-08-28 06:11:43 +10:00
|
|
|
// match: (SNEZ (SEQZ x))
|
|
|
|
|
// result: (SEQZ x)
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpRISCV64SEQZ {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
v.reset(OpRISCV64SEQZ)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (SNEZ (SNEZ x))
|
|
|
|
|
// result: (SNEZ x)
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpRISCV64SNEZ {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
v.reset(OpRISCV64SNEZ)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-08-28 06:08:02 +10:00
|
|
|
return false
|
|
|
|
|
}
|
2020-03-10 03:31:22 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SRA(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (SRA x (MOVDconst [val]))
|
2020-04-20 18:15:50 -04:00
|
|
|
// result: (SRAI [int64(val&63)] x)
|
2020-03-10 03:31:22 +11:00
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
val := auxIntToInt64(v_1.AuxInt)
|
2020-03-10 03:31:22 +11:00
|
|
|
v.reset(OpRISCV64SRAI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(int64(val & 63))
|
2020-03-10 03:31:22 +11:00
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SRAI(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (SRAI [x] (MOVDconst [y]))
|
2022-06-02 05:09:09 +10:00
|
|
|
// result: (MOVDconst [int64(y) >> uint32(x)])
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
2022-06-02 05:09:09 +10:00
|
|
|
v.AuxInt = int64ToAuxInt(int64(y) >> uint32(x))
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-03-10 03:31:22 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SRL(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (SRL x (MOVDconst [val]))
|
2020-04-20 18:15:50 -04:00
|
|
|
// result: (SRLI [int64(val&63)] x)
|
2020-03-10 03:31:22 +11:00
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
val := auxIntToInt64(v_1.AuxInt)
|
2020-03-10 03:31:22 +11:00
|
|
|
v.reset(OpRISCV64SRLI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(int64(val & 63))
|
2020-03-10 03:31:22 +11:00
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SRLI(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (SRLI [x] (MOVDconst [y]))
|
2022-06-02 05:09:09 +10:00
|
|
|
// result: (MOVDconst [int64(uint64(y) >> uint32(x))])
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
for {
|
|
|
|
|
x := auxIntToInt64(v.AuxInt)
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
v.reset(OpRISCV64MOVDconst)
|
2022-06-02 05:09:09 +10:00
|
|
|
v.AuxInt = int64ToAuxInt(int64(uint64(y) >> uint32(x)))
|
cmd/compile: optimise immediate operands with constants on riscv64
Instructions with immediates can be precomputed when operating on a
constant - do so for SLTI/SLTIU, SLLI/SRLI/SRAI, NEG/NEGW, ANDI, ORI
and ADDI. Additionally, optimise ANDI and ORI when the immediate is
all ones or all zeroes.
In particular, the RISCV64 logical left and right shift rules
(Lsh*x*/Rsh*Ux*) produce sequences that check if the shift amount
exceeds 64 and if so returns zero. When the shift amount is a
constant we can precompute and eliminate the filter entirely.
Likewise the arithmetic right shift rules produce sequences that
check if the shift amount exceeds 64 and if so, ensures that the
lower six bits of the shift are all ones. When the shift amount
is a constant we can precompute the shift value.
Arithmetic right shift sequences like:
117fc: 00100513 li a0,1
11800: 04053593 sltiu a1,a0,64
11804: fff58593 addi a1,a1,-1
11808: 0015e593 ori a1,a1,1
1180c: 40b45433 sra s0,s0,a1
Are now a single srai instruction:
117fc: 40145413 srai s0,s0,0x1
Likewise for logical left shift (and logical right shift):
1d560: 01100413 li s0,17
1d564: 04043413 sltiu s0,s0,64
1d568: 40800433 neg s0,s0
1d56c: 01131493 slli s1,t1,0x11
1d570: 0084f433 and s0,s1,s0
Which are now a single slli (or srli) instruction:
1d120: 01131413 slli s0,t1,0x11
This removes more than 30,000 instructions from the Go binary and
should improve performance in a variety of areas - of note
runtime.makemap_small drops from 48 to 36 instructions. Similar
gains exist in at least other parts of runtime and math/bits.
Change-Id: I33f6f3d1fd36d9ff1bda706997162bfe4bb859b6
Reviewed-on: https://go-review.googlesource.com/c/go/+/350689
Trust: Joel Sing <joel@sing.id.au>
Reviewed-by: Michael Munday <mike.munday@lowrisc.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2021-09-17 16:53:11 +10:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-03-02 04:23:12 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64SUB(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
cmd/compile: optimise subtraction with const on riscv64
Convert subtraction from const to a negated ADDI with negative const
value, where possible. At worst this avoids a register load and uses
the same number of instructions. At best, this allows for further
optimisation to occur, particularly where equality is involved.
For example, this sequence:
li t0,-1
sub t1,t0,a0
snez t1,t1
Becomes:
addi t0,a0,1
snez t0,t0
Removes more than 2000 instructions from the Go binary on linux/riscv64.
Change-Id: I68f3be897bc645d4a8fa3ab3cef165a00a74df19
Reviewed-on: https://go-review.googlesource.com/c/go/+/426263
Reviewed-by: Meng Zhuo <mzh@golangcn.org>
Reviewed-by: Heschi Kreinick <heschi@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-28 05:23:28 +10:00
|
|
|
b := v.Block
|
2020-03-02 04:23:12 +11:00
|
|
|
// match: (SUB x (MOVDconst [val]))
|
|
|
|
|
// cond: is32Bit(-val)
|
|
|
|
|
// result: (ADDI [-val] x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
if v_1.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
val := auxIntToInt64(v_1.AuxInt)
|
2020-03-02 04:23:12 +11:00
|
|
|
if !(is32Bit(-val)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64ADDI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(-val)
|
2020-03-02 04:23:12 +11:00
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
cmd/compile: optimise subtraction with const on riscv64
Convert subtraction from const to a negated ADDI with negative const
value, where possible. At worst this avoids a register load and uses
the same number of instructions. At best, this allows for further
optimisation to occur, particularly where equality is involved.
For example, this sequence:
li t0,-1
sub t1,t0,a0
snez t1,t1
Becomes:
addi t0,a0,1
snez t0,t0
Removes more than 2000 instructions from the Go binary on linux/riscv64.
Change-Id: I68f3be897bc645d4a8fa3ab3cef165a00a74df19
Reviewed-on: https://go-review.googlesource.com/c/go/+/426263
Reviewed-by: Meng Zhuo <mzh@golangcn.org>
Reviewed-by: Heschi Kreinick <heschi@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Joel Sing <joel@sing.id.au>
2022-08-28 05:23:28 +10:00
|
|
|
// match: (SUB <t> (MOVDconst [val]) y)
|
|
|
|
|
// cond: is32Bit(-val)
|
|
|
|
|
// result: (NEG (ADDI <t> [-val] y))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
val := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(is32Bit(-val)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64NEG)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64ADDI, t)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(-val)
|
|
|
|
|
v0.AddArg(y)
|
|
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-02 04:24:35 +11:00
|
|
|
// match: (SUB x (MOVDconst [0]))
|
|
|
|
|
// result: x
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
2020-04-20 18:15:50 -04:00
|
|
|
if v_1.Op != OpRISCV64MOVDconst || auxIntToInt64(v_1.AuxInt) != 0 {
|
2020-03-02 04:24:35 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.copyOf(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-03 03:45:22 +11:00
|
|
|
// match: (SUB (MOVDconst [0]) x)
|
|
|
|
|
// result: (NEG x)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst || auxIntToInt64(v_0.AuxInt) != 0 {
|
2020-03-03 03:45:22 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1
|
|
|
|
|
v.reset(OpRISCV64NEG)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-02 04:24:35 +11:00
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpRISCV64SUBW(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2021-03-18 03:37:58 +11:00
|
|
|
// match: (SUBW x (MOVDconst [0]))
|
2020-03-02 04:24:35 +11:00
|
|
|
// result: (ADDIW [0] x)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
2021-03-18 03:37:58 +11:00
|
|
|
if v_1.Op != OpRISCV64MOVDconst || auxIntToInt64(v_1.AuxInt) != 0 {
|
2020-03-02 04:24:35 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64ADDIW)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(0)
|
2020-03-02 04:24:35 +11:00
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-03 03:45:22 +11:00
|
|
|
// match: (SUBW (MOVDconst [0]) x)
|
|
|
|
|
// result: (NEGW x)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if v_0.Op != OpRISCV64MOVDconst || auxIntToInt64(v_0.AuxInt) != 0 {
|
2020-03-03 03:45:22 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
x := v_1
|
|
|
|
|
v.reset(OpRISCV64NEGW)
|
|
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-02 04:23:12 +11:00
|
|
|
return false
|
|
|
|
|
}
|
2020-03-10 03:31:22 +11:00
|
|
|
func rewriteValueRISCV64_OpRISCV64XOR(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
|
|
|
|
// match: (XOR (MOVDconst [val]) x)
|
|
|
|
|
// cond: is32Bit(val)
|
|
|
|
|
// result: (XORI [val] x)
|
|
|
|
|
for {
|
|
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
|
|
|
|
if v_0.Op != OpRISCV64MOVDconst {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2020-04-20 18:15:50 -04:00
|
|
|
val := auxIntToInt64(v_0.AuxInt)
|
2020-03-10 03:31:22 +11:00
|
|
|
x := v_1
|
|
|
|
|
if !(is32Bit(val)) {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64XORI)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(val)
|
2020-03-10 03:31:22 +11:00
|
|
|
v.AddArg(x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpRotateLeft16 lowers a 16-bit rotate-left by a
// constant into a shift pair: (x << (c&15)) | (x >> (-c&15)).
// Returns true if v was rewritten.
func rewriteValueRISCV64_OpRotateLeft16(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (RotateLeft16 <t> x (MOVDconst [c]))
	// result: (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
	for {
		t := v.Type
		x := v_0
		// Only constant rotate amounts are handled here.
		if v_1.Op != OpRISCV64MOVDconst {
			break
		}
		c := auxIntToInt64(v_1.AuxInt)
		v.reset(OpOr16)
		// Left shift by c mod 16.
		v0 := b.NewValue0(v.Pos, OpLsh16x64, t)
		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v1.AuxInt = int64ToAuxInt(c & 15)
		v0.AddArg2(x, v1)
		// Unsigned right shift by (16 - c) mod 16, expressed as -c & 15.
		v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t)
		v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v3.AuxInt = int64ToAuxInt(-c & 15)
		v2.AddArg2(x, v3)
		v.AddArg2(v0, v2)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpRotateLeft32 lowers a 32-bit rotate-left by a
// constant into a shift pair: (x << (c&31)) | (x >> (-c&31)).
// Returns true if v was rewritten.
func rewriteValueRISCV64_OpRotateLeft32(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (RotateLeft32 <t> x (MOVDconst [c]))
	// result: (Or32 (Lsh32x64 <t> x (MOVDconst [c&31])) (Rsh32Ux64 <t> x (MOVDconst [-c&31])))
	for {
		t := v.Type
		x := v_0
		// Only constant rotate amounts are handled here.
		if v_1.Op != OpRISCV64MOVDconst {
			break
		}
		c := auxIntToInt64(v_1.AuxInt)
		v.reset(OpOr32)
		// Left shift by c mod 32.
		v0 := b.NewValue0(v.Pos, OpLsh32x64, t)
		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v1.AuxInt = int64ToAuxInt(c & 31)
		v0.AddArg2(x, v1)
		// Unsigned right shift by (32 - c) mod 32, expressed as -c & 31.
		v2 := b.NewValue0(v.Pos, OpRsh32Ux64, t)
		v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v3.AuxInt = int64ToAuxInt(-c & 31)
		v2.AddArg2(x, v3)
		v.AddArg2(v0, v2)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpRotateLeft64 lowers a 64-bit rotate-left by a
// constant into a shift pair: (x << (c&63)) | (x >> (-c&63)).
// Returns true if v was rewritten.
func rewriteValueRISCV64_OpRotateLeft64(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (RotateLeft64 <t> x (MOVDconst [c]))
	// result: (Or64 (Lsh64x64 <t> x (MOVDconst [c&63])) (Rsh64Ux64 <t> x (MOVDconst [-c&63])))
	for {
		t := v.Type
		x := v_0
		// Only constant rotate amounts are handled here.
		if v_1.Op != OpRISCV64MOVDconst {
			break
		}
		c := auxIntToInt64(v_1.AuxInt)
		v.reset(OpOr64)
		// Left shift by c mod 64.
		v0 := b.NewValue0(v.Pos, OpLsh64x64, t)
		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v1.AuxInt = int64ToAuxInt(c & 63)
		v0.AddArg2(x, v1)
		// Unsigned right shift by (64 - c) mod 64, expressed as -c & 63.
		v2 := b.NewValue0(v.Pos, OpRsh64Ux64, t)
		v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v3.AuxInt = int64ToAuxInt(-c & 63)
		v2.AddArg2(x, v3)
		v.AddArg2(v0, v2)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
// rewriteValueRISCV64_OpRotateLeft8 lowers an 8-bit rotate-left by a
// constant into a shift pair: (x << (c&7)) | (x >> (-c&7)).
// Returns true if v was rewritten.
func rewriteValueRISCV64_OpRotateLeft8(v *Value) bool {
	v_1 := v.Args[1]
	v_0 := v.Args[0]
	b := v.Block
	typ := &b.Func.Config.Types
	// match: (RotateLeft8 <t> x (MOVDconst [c]))
	// result: (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
	for {
		t := v.Type
		x := v_0
		// Only constant rotate amounts are handled here.
		if v_1.Op != OpRISCV64MOVDconst {
			break
		}
		c := auxIntToInt64(v_1.AuxInt)
		v.reset(OpOr8)
		// Left shift by c mod 8.
		v0 := b.NewValue0(v.Pos, OpLsh8x64, t)
		v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v1.AuxInt = int64ToAuxInt(c & 7)
		v0.AddArg2(x, v1)
		// Unsigned right shift by (8 - c) mod 8, expressed as -c & 7.
		v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t)
		v3 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
		v3.AuxInt = int64ToAuxInt(-c & 7)
		v2.AddArg2(x, v3)
		v.AddArg2(v0, v2)
		return true
	}
	return false
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh16Ux16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16Ux16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16Ux16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh16Ux32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16Ux32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16Ux32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh16Ux64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16Ux64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] y)))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16Ux64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh16Ux8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16Ux8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt16to64 x) y) (Neg16 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg16, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16Ux8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh16x16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16x16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16x16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh16x32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16x32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16x32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh16x64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16x64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16x64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpRsh16x8(v *Value) bool {
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh16x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt16to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh16x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt16to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32Ux16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32Ux16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32Ux16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32Ux32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32Ux32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32Ux32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32Ux64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32Ux64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] y)))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32Ux64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32Ux8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32Ux8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt32to64 x) y) (Neg32 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg32, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32Ux8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32x16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32x16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32x16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32x32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32x32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32x32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32x64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32x64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32x64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh32x8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh32x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt32to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh32x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt32to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64Ux16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh64Ux16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg64, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64Ux16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64Ux32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh64Ux32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg64, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64Ux32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64Ux64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
// match: (Rsh64Ux64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] y)))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg64, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2.AddArg(y)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64Ux64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64Ux8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh64Ux8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> x y) (Neg64 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(x, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v1 := b.NewValue0(v.Pos, OpNeg64, t)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64Ux8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64x16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh64x16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v1.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, v1)
|
|
|
|
|
v.AddArg2(x, v0)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64x16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64x32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh64x32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v1.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, v1)
|
|
|
|
|
v.AddArg2(x, v0)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64x32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64x64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
// match: (Rsh64x64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v1.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2.AddArg(y)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, v1)
|
|
|
|
|
v.AddArg2(x, v0)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64x64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh64x8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh64x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> x (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v1.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
|
|
|
|
v1.AddArg(v2)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(y, v1)
|
|
|
|
|
v.AddArg2(x, v0)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh64x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA x y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8Ux16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8Ux16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt16to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8Ux16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8Ux32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8Ux32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt32to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8Ux32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8Ux64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8Ux64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] y)))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8Ux64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8Ux8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8Ux8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (AND (SRL <t> (ZeroExt8to64 x) y) (Neg8 <t> (SLTIU <t> [64] (ZeroExt8to64 y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64AND)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SRL, t)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v1.AddArg(x)
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(v1, y)
|
2019-11-04 04:40:47 +11:00
|
|
|
v2 := b.NewValue0(v.Pos, OpNeg8, t)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, t)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg2(v0, v2)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8Ux8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRL (ZeroExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRL)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8x16(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8x16 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt16to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8x16 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8x32(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8x32 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt32to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8x32 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8x64(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8x64 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] y))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3.AddArg(y)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8x64 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpRsh8x8(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Rsh8x8 <t> x y)
|
2022-08-11 00:07:40 +08:00
|
|
|
// cond: !shiftIsBounded(v)
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (SRA <t> (SignExt8to64 x) (OR <y.Type> y (ADDI <y.Type> [-1] (SLTIU <y.Type> [64] (ZeroExt8to64 y)))))
|
|
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
2022-08-11 00:07:40 +08:00
|
|
|
if !(!shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v.Type = t
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64OR, y.Type)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64ADDI, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v2.AuxInt = int64ToAuxInt(-1)
|
2019-11-04 04:40:47 +11:00
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SLTIU, y.Type)
|
2020-04-20 18:15:50 -04:00
|
|
|
v3.AuxInt = int64ToAuxInt(64)
|
2019-11-04 04:40:47 +11:00
|
|
|
v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64)
|
|
|
|
|
v4.AddArg(y)
|
|
|
|
|
v3.AddArg(v4)
|
|
|
|
|
v2.AddArg(v3)
|
2020-02-26 11:29:34 -08:00
|
|
|
v1.AddArg2(y, v2)
|
|
|
|
|
v.AddArg2(v0, v1)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-11 00:07:40 +08:00
|
|
|
// match: (Rsh8x8 x y)
|
|
|
|
|
// cond: shiftIsBounded(v)
|
|
|
|
|
// result: (SRA (SignExt8to64 x) y)
|
|
|
|
|
for {
|
|
|
|
|
x := v_0
|
|
|
|
|
y := v_1
|
|
|
|
|
if !(shiftIsBounded(v)) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64SRA)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64)
|
|
|
|
|
v0.AddArg(x)
|
|
|
|
|
v.AddArg2(v0, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
2019-11-04 04:40:47 +11:00
|
|
|
}
|
2022-08-24 22:17:51 +08:00
|
|
|
func rewriteValueRISCV64_OpSelect0(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
2022-07-29 14:24:26 +08:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Select0 (Add64carry x y c))
|
|
|
|
|
// result: (ADD (ADD <typ.UInt64> x y) c)
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpAdd64carry {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := v_0.Args[2]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
v.reset(OpRISCV64ADD)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64ADD, typ.UInt64)
|
|
|
|
|
v0.AddArg2(x, y)
|
|
|
|
|
v.AddArg2(v0, c)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-07-29 22:14:53 +08:00
|
|
|
// match: (Select0 (Sub64borrow x y c))
|
|
|
|
|
// result: (SUB (SUB <typ.UInt64> x y) c)
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpSub64borrow {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := v_0.Args[2]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
v.reset(OpRISCV64SUB)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.UInt64)
|
|
|
|
|
v0.AddArg2(x, y)
|
|
|
|
|
v.AddArg2(v0, c)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-08-24 22:17:51 +08:00
|
|
|
// match: (Select0 m:(LoweredMuluhilo x y))
|
|
|
|
|
// cond: m.Uses == 1
|
|
|
|
|
// result: (MULHU x y)
|
|
|
|
|
for {
|
|
|
|
|
m := v_0
|
|
|
|
|
if m.Op != OpRISCV64LoweredMuluhilo {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := m.Args[1]
|
|
|
|
|
x := m.Args[0]
|
|
|
|
|
if !(m.Uses == 1) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MULHU)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
func rewriteValueRISCV64_OpSelect1(v *Value) bool {
|
|
|
|
|
v_0 := v.Args[0]
|
2022-07-29 14:24:26 +08:00
|
|
|
b := v.Block
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Select1 (Add64carry x y c))
|
|
|
|
|
// result: (OR (SLTU <typ.UInt64> s:(ADD <typ.UInt64> x y) x) (SLTU <typ.UInt64> (ADD <typ.UInt64> s c) s))
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpAdd64carry {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := v_0.Args[2]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
v.reset(OpRISCV64OR)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLTU, typ.UInt64)
|
|
|
|
|
s := b.NewValue0(v.Pos, OpRISCV64ADD, typ.UInt64)
|
|
|
|
|
s.AddArg2(x, y)
|
|
|
|
|
v0.AddArg2(s, x)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTU, typ.UInt64)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64ADD, typ.UInt64)
|
|
|
|
|
v3.AddArg2(s, c)
|
|
|
|
|
v2.AddArg2(v3, s)
|
|
|
|
|
v.AddArg2(v0, v2)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-07-29 22:14:53 +08:00
|
|
|
// match: (Select1 (Sub64borrow x y c))
|
|
|
|
|
// result: (OR (SLTU <typ.UInt64> x s:(SUB <typ.UInt64> x y)) (SLTU <typ.UInt64> s (SUB <typ.UInt64> s c)))
|
|
|
|
|
for {
|
|
|
|
|
if v_0.Op != OpSub64borrow {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
c := v_0.Args[2]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
v.reset(OpRISCV64OR)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64SLTU, typ.UInt64)
|
|
|
|
|
s := b.NewValue0(v.Pos, OpRISCV64SUB, typ.UInt64)
|
|
|
|
|
s.AddArg2(x, y)
|
|
|
|
|
v0.AddArg2(x, s)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64SLTU, typ.UInt64)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.UInt64)
|
|
|
|
|
v3.AddArg2(s, c)
|
|
|
|
|
v2.AddArg2(s, v3)
|
|
|
|
|
v.AddArg2(v0, v2)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-08-24 22:17:51 +08:00
|
|
|
// match: (Select1 m:(LoweredMuluhilo x y))
|
|
|
|
|
// cond: m.Uses == 1
|
|
|
|
|
// result: (MUL x y)
|
|
|
|
|
for {
|
|
|
|
|
m := v_0
|
|
|
|
|
if m.Op != OpRISCV64LoweredMuluhilo {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
y := m.Args[1]
|
|
|
|
|
x := m.Args[0]
|
|
|
|
|
if !(m.Uses == 1) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MUL)
|
|
|
|
|
v.AddArg2(x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpSlicemask(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
// match: (Slicemask <t> x)
|
2022-08-29 20:36:51 +10:00
|
|
|
// result: (SRAI [63] (NEG <t> x))
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
|
|
|
|
t := v.Type
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
x := v_0
|
2022-08-29 20:36:51 +10:00
|
|
|
v.reset(OpRISCV64SRAI)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(63)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64NEG, t)
|
|
|
|
|
v0.AddArg(x)
|
2019-11-04 04:40:47 +11:00
|
|
|
v.AddArg(v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpStore(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_2 := v.Args[2]
|
|
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (Store {t} ptr val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: t.Size() == 1
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVBstore ptr val mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(t.Size() == 1) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Store {t} ptr val mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// cond: t.Size() == 2
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVHstore ptr val mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2020-04-20 18:15:50 -04:00
|
|
|
if !(t.Size() == 2) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Store {t} ptr val mem)
|
2023-04-09 08:11:06 -07:00
|
|
|
// cond: t.Size() == 4 && !t.IsFloat()
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVWstore ptr val mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2023-04-09 08:11:06 -07:00
|
|
|
if !(t.Size() == 4 && !t.IsFloat()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Store {t} ptr val mem)
|
2023-04-09 08:11:06 -07:00
|
|
|
// cond: t.Size() == 8 && !t.IsFloat()
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (MOVDstore ptr val mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2023-04-09 08:11:06 -07:00
|
|
|
if !(t.Size() == 8 && !t.IsFloat()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Store {t} ptr val mem)
|
2023-04-09 08:11:06 -07:00
|
|
|
// cond: t.Size() == 4 && t.IsFloat()
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (FMOVWstore ptr val mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2023-04-09 08:11:06 -07:00
|
|
|
if !(t.Size() == 4 && t.IsFloat()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64FMOVWstore)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Store {t} ptr val mem)
|
2023-04-09 08:11:06 -07:00
|
|
|
// cond: t.Size() == 8 && t.IsFloat()
|
2019-11-04 04:40:47 +11:00
|
|
|
// result: (FMOVDstore ptr val mem)
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
val := v_1
|
|
|
|
|
mem := v_2
|
2023-04-09 08:11:06 -07:00
|
|
|
if !(t.Size() == 8 && t.IsFloat()) {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64FMOVDstore)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, val, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2020-01-21 20:53:30 -08:00
|
|
|
func rewriteValueRISCV64_OpZero(v *Value) bool {
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
v_1 := v.Args[1]
|
|
|
|
|
v_0 := v.Args[0]
|
2019-11-04 04:40:47 +11:00
|
|
|
b := v.Block
|
|
|
|
|
config := b.Func.Config
|
|
|
|
|
typ := &b.Func.Config.Types
|
|
|
|
|
// match: (Zero [0] _ mem)
|
|
|
|
|
// result: mem
|
|
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 0 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
mem := v_1
|
2019-10-30 10:29:47 -07:00
|
|
|
v.copyOf(mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [1] ptr mem)
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVBstore ptr (MOVDconst [0]) mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 1 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVBstore)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Zero [2] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVHstore ptr (MOVDconst [0]) mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 2 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVHstore)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Zero [2] ptr mem)
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVBstore [1] ptr (MOVDconst [0]) (MOVBstore ptr (MOVDconst [0]) mem))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 2 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(1)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v1.AddArg3(ptr, v0, mem)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [4] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%4 == 0
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVWstore ptr (MOVDconst [0]) mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 4 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(t.Alignment()%4 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVWstore)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Zero [4] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVHstore [2] ptr (MOVDconst [0]) (MOVHstore ptr (MOVDconst [0]) mem))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 4 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(2)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v1.AddArg3(ptr, v0, mem)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [4] ptr mem)
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVBstore [3] ptr (MOVDconst [0]) (MOVBstore [2] ptr (MOVDconst [0]) (MOVBstore [1] ptr (MOVDconst [0]) (MOVBstore ptr (MOVDconst [0]) mem))))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 4 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(3)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v3.AddArg3(ptr, v0, mem)
|
|
|
|
|
v2.AddArg3(ptr, v0, v3)
|
|
|
|
|
v1.AddArg3(ptr, v0, v2)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [8] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
|
|
|
|
// result: (MOVDstore ptr (MOVDconst [0]) mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
if auxIntToInt64(v.AuxInt) != 8 {
|
2019-11-04 04:40:47 +11:00
|
|
|
break
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
2020-10-29 01:10:49 +01:00
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-02-26 11:29:34 -08:00
|
|
|
v.AddArg3(ptr, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2020-10-29 01:10:49 +01:00
|
|
|
// match: (Zero [8] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%4 == 0
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVWstore [4] ptr (MOVDconst [0]) (MOVWstore ptr (MOVDconst [0]) mem))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 8 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%4 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(4)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
|
|
|
|
|
v1.AddArg3(ptr, v0, mem)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [8] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVHstore [6] ptr (MOVDconst [0]) (MOVHstore [4] ptr (MOVDconst [0]) (MOVHstore [2] ptr (MOVDconst [0]) (MOVHstore ptr (MOVDconst [0]) mem))))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 8 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(6)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v3.AddArg3(ptr, v0, mem)
|
|
|
|
|
v2.AddArg3(ptr, v0, v3)
|
|
|
|
|
v1.AddArg3(ptr, v0, v2)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [3] ptr mem)
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVBstore [2] ptr (MOVDconst [0]) (MOVBstore [1] ptr (MOVDconst [0]) (MOVBstore ptr (MOVDconst [0]) mem)))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 3 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
v.reset(OpRISCV64MOVBstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(2)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(1)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVBstore, types.TypeMem)
|
|
|
|
|
v2.AddArg3(ptr, v0, mem)
|
|
|
|
|
v1.AddArg3(ptr, v0, v2)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [6] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%2 == 0
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVHstore [4] ptr (MOVDconst [0]) (MOVHstore [2] ptr (MOVDconst [0]) (MOVHstore ptr (MOVDconst [0]) mem)))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 6 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%2 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVHstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(4)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(2)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVHstore, types.TypeMem)
|
|
|
|
|
v2.AddArg3(ptr, v0, mem)
|
|
|
|
|
v1.AddArg3(ptr, v0, v2)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [12] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%4 == 0
|
2021-03-18 03:37:58 +11:00
|
|
|
// result: (MOVWstore [8] ptr (MOVDconst [0]) (MOVWstore [4] ptr (MOVDconst [0]) (MOVWstore ptr (MOVDconst [0]) mem)))
|
2020-10-29 01:10:49 +01:00
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 12 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%4 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVWstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(8)
|
2021-03-18 03:37:58 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
2020-10-29 01:10:49 +01:00
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(4)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
|
|
|
|
|
v2.AddArg3(ptr, v0, mem)
|
|
|
|
|
v1.AddArg3(ptr, v0, v2)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [16] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
|
|
|
|
// result: (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 16 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v1.AddArg3(ptr, v0, mem)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [24] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
|
|
|
|
// result: (MOVDstore [16] ptr (MOVDconst [0]) (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem)))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 24 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(16)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v2.AddArg3(ptr, v0, mem)
|
|
|
|
|
v1.AddArg3(ptr, v0, v2)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [32] {t} ptr mem)
|
|
|
|
|
// cond: t.Alignment()%8 == 0
|
|
|
|
|
// result: (MOVDstore [24] ptr (MOVDconst [0]) (MOVDstore [16] ptr (MOVDconst [0]) (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem))))
|
|
|
|
|
for {
|
|
|
|
|
if auxIntToInt64(v.AuxInt) != 32 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
|
|
|
|
if !(t.Alignment()%8 == 0) {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64MOVDstore)
|
|
|
|
|
v.AuxInt = int32ToAuxInt(24)
|
|
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(0)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v1.AuxInt = int32ToAuxInt(16)
|
|
|
|
|
v2 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v2.AuxInt = int32ToAuxInt(8)
|
|
|
|
|
v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
|
|
|
|
|
v3.AddArg3(ptr, v0, mem)
|
|
|
|
|
v2.AddArg3(ptr, v0, v3)
|
|
|
|
|
v1.AddArg3(ptr, v0, v2)
|
|
|
|
|
v.AddArg3(ptr, v0, v1)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
// match: (Zero [s] {t} ptr mem)
|
2020-10-29 01:10:49 +01:00
|
|
|
// cond: s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice
|
2020-06-14 00:06:24 +02:00
|
|
|
// result: (DUFFZERO [8 * (128 - s/8)] ptr mem)
|
|
|
|
|
for {
|
|
|
|
|
s := auxIntToInt64(v.AuxInt)
|
|
|
|
|
t := auxToType(v.Aux)
|
|
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2020-10-29 01:10:49 +01:00
|
|
|
if !(s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 && !config.noDuffDevice) {
|
2020-06-14 00:06:24 +02:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
v.reset(OpRISCV64DUFFZERO)
|
|
|
|
|
v.AuxInt = int64ToAuxInt(8 * (128 - s/8))
|
|
|
|
|
v.AddArg2(ptr, mem)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (Zero [s] {t} ptr mem)
|
2020-04-20 18:15:50 -04:00
|
|
|
// result: (LoweredZero [t.Alignment()] ptr (ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
2020-04-20 18:15:50 -04:00
|
|
|
s := auxIntToInt64(v.AuxInt)
|
|
|
|
|
t := auxToType(v.Aux)
|
cmd/compile: reduce bounds checks in generated rewrite rules
CL 213703 converted generated rewrite rules for commutative ops
to use loops instead of duplicated code.
However, it loaded args using expressions like
v.Args[i] and v.Args[i^1], which the compiler could
not eliminate bounds for (including with all outstanding
prove CLs).
Also, given a series of separate rewrite rules for the same op,
we generated bounds checks for every rewrite rule, even though
we were repeatedly loading the same set of args.
This change reduces both sets of bounds checks.
Instead of loading v.Args[i] and v.Args[i^1] for commutative loops,
we now preload v.Args[0] and v.Args[1] into local variables,
and then swap them (as needed) in the commutative loop post statement.
And we now load all top level v.Args into local variables
at the beginning of every rewrite rule function.
The second optimization is the more significant,
but the first helps a little, and they play together
nicely from the perspective of generating the code.
This does increase register pressure, but the reduced bounds
checks more than compensate.
Note that the vast majority of rewrite rules evaluated
are not applied, so the prologue is the most important
part of the rewrite rules.
There is one subtle aspect to the new generated code.
Because the top level v.Args are shared across rewrite rules,
and rule evaluation can swap v_0 and v_1, v_0 and v_1
can end up being swapped from one rule to the next.
That is OK, because any time a rule does not get applied,
they will have been swapped exactly twice.
Passes toolstash-check -all.
name old time/op new time/op delta
Template 213ms ± 2% 211ms ± 2% -0.85% (p=0.000 n=92+96)
Unicode 83.5ms ± 2% 83.2ms ± 2% -0.41% (p=0.004 n=95+90)
GoTypes 737ms ± 2% 733ms ± 2% -0.51% (p=0.000 n=91+94)
Compiler 3.45s ± 2% 3.43s ± 2% -0.44% (p=0.000 n=99+100)
SSA 8.54s ± 1% 8.32s ± 2% -2.56% (p=0.000 n=96+99)
Flate 136ms ± 2% 135ms ± 1% -0.47% (p=0.000 n=96+96)
GoParser 169ms ± 1% 168ms ± 1% -0.33% (p=0.000 n=96+93)
Reflect 456ms ± 3% 455ms ± 3% ~ (p=0.261 n=95+94)
Tar 186ms ± 2% 185ms ± 2% -0.48% (p=0.000 n=94+95)
XML 251ms ± 1% 250ms ± 1% -0.51% (p=0.000 n=91+94)
[Geo mean] 424ms 421ms -0.68%
name old user-time/op new user-time/op delta
Template 275ms ± 1% 274ms ± 2% -0.55% (p=0.000 n=95+98)
Unicode 118ms ± 4% 118ms ± 4% ~ (p=0.642 n=98+90)
GoTypes 983ms ± 1% 980ms ± 1% -0.30% (p=0.000 n=93+93)
Compiler 4.56s ± 6% 4.52s ± 6% -0.72% (p=0.003 n=100+100)
SSA 11.4s ± 1% 11.1s ± 1% -2.50% (p=0.000 n=96+97)
Flate 168ms ± 1% 167ms ± 1% -0.49% (p=0.000 n=92+92)
GoParser 204ms ± 1% 204ms ± 2% -0.27% (p=0.003 n=99+96)
Reflect 599ms ± 2% 598ms ± 2% ~ (p=0.116 n=95+92)
Tar 227ms ± 2% 225ms ± 2% -0.57% (p=0.000 n=95+98)
XML 313ms ± 2% 312ms ± 1% -0.37% (p=0.000 n=89+95)
[Geo mean] 547ms 544ms -0.61%
file before after Δ %
compile 21113112 21109016 -4096 -0.019%
total 131704940 131700844 -4096 -0.003%
Change-Id: Id6c39e0367e597c0c75b8a4b1eb14cc3cbd11956
Reviewed-on: https://go-review.googlesource.com/c/go/+/216218
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2020-01-20 20:09:41 -08:00
|
|
|
ptr := v_0
|
|
|
|
|
mem := v_1
|
2019-11-04 04:40:47 +11:00
|
|
|
v.reset(OpRISCV64LoweredZero)
|
2020-04-20 18:15:50 -04:00
|
|
|
v.AuxInt = int64ToAuxInt(t.Alignment())
|
2019-11-04 04:40:47 +11:00
|
|
|
v0 := b.NewValue0(v.Pos, OpRISCV64ADD, ptr.Type)
|
|
|
|
|
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
2020-04-20 18:15:50 -04:00
|
|
|
v1.AuxInt = int64ToAuxInt(s - moveSize(t.Alignment(), config))
|
2020-02-26 11:29:34 -08:00
|
|
|
v0.AddArg2(ptr, v1)
|
|
|
|
|
v.AddArg3(ptr, v0, mem)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
func rewriteBlockRISCV64(b *Block) bool {
|
2022-05-10 10:49:33 -04:00
|
|
|
typ := &b.Func.Config.Types
|
2019-11-04 04:40:47 +11:00
|
|
|
switch b.Kind {
|
2020-03-31 02:04:45 +11:00
|
|
|
case BlockRISCV64BEQ:
|
|
|
|
|
// match: (BEQ (MOVDconst [0]) cond yes no)
|
|
|
|
|
// result: (BEQZ cond yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64MOVDconst {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
if auxIntToInt64(v_0.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
cond := b.Controls[1]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BEQZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BEQ cond (MOVDconst [0]) yes no)
|
|
|
|
|
// result: (BEQZ cond yes no)
|
|
|
|
|
for b.Controls[1].Op == OpRISCV64MOVDconst {
|
|
|
|
|
cond := b.Controls[0]
|
|
|
|
|
v_1 := b.Controls[1]
|
|
|
|
|
if auxIntToInt64(v_1.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b.resetWithControl(BlockRISCV64BEQZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
case BlockRISCV64BEQZ:
|
|
|
|
|
// match: (BEQZ (SEQZ x) yes no)
|
|
|
|
|
// result: (BNEZ x yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SEQZ {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BEQZ (SNEZ x) yes no)
|
|
|
|
|
// result: (BEQZ x yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SNEZ {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BEQZ, x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-08-28 06:08:02 +10:00
|
|
|
// match: (BEQZ (NEG x) yes no)
|
|
|
|
|
// result: (BEQZ x yes no)
|
2021-08-17 19:09:33 +10:00
|
|
|
for b.Controls[0].Op == OpRISCV64NEG {
|
2022-08-28 06:08:02 +10:00
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BEQZ, x)
|
2021-08-17 19:09:33 +10:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-28 02:32:06 +10:00
|
|
|
// match: (BEQZ (FNES <t> x y) yes no)
|
|
|
|
|
// result: (BNEZ (FEQS <t> x y) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64FNES {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
t := v_0.Type
|
|
|
|
|
_ = v_0.Args[1]
|
|
|
|
|
v_0_0 := v_0.Args[0]
|
|
|
|
|
v_0_1 := v_0.Args[1]
|
|
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
|
|
|
|
|
x := v_0_0
|
|
|
|
|
y := v_0_1
|
|
|
|
|
v0 := b.NewValue0(v_0.Pos, OpRISCV64FEQS, t)
|
|
|
|
|
v0.AddArg2(x, y)
|
|
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// match: (BEQZ (FNED <t> x y) yes no)
|
|
|
|
|
// result: (BNEZ (FEQD <t> x y) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64FNED {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
t := v_0.Type
|
|
|
|
|
_ = v_0.Args[1]
|
|
|
|
|
v_0_0 := v_0.Args[0]
|
|
|
|
|
v_0_1 := v_0.Args[1]
|
|
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
|
|
|
|
|
x := v_0_0
|
|
|
|
|
y := v_0_1
|
|
|
|
|
v0 := b.NewValue0(v_0.Pos, OpRISCV64FEQD, t)
|
|
|
|
|
v0.AddArg2(x, y)
|
|
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-03-31 02:04:45 +11:00
|
|
|
// match: (BEQZ (SUB x y) yes no)
|
|
|
|
|
// result: (BEQ x y yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SUB {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BEQ, x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BEQZ (SLT x y) yes no)
|
|
|
|
|
// result: (BGE x y yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLT {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BGE, x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BEQZ (SLTU x y) yes no)
|
|
|
|
|
// result: (BGEU x y yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLTU {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BGEU, x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-05 03:03:35 +10:00
|
|
|
// match: (BEQZ (SLTI [x] y) yes no)
|
|
|
|
|
// result: (BGE y (MOVDconst [x]) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLTI {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
y := v_0.Args[0]
|
|
|
|
|
v0 := b.NewValue0(b.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(x)
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BGE, y, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BEQZ (SLTIU [x] y) yes no)
|
|
|
|
|
// result: (BGEU y (MOVDconst [x]) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLTIU {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
y := v_0.Args[0]
|
|
|
|
|
v0 := b.NewValue0(b.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(x)
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BGEU, y, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2021-08-17 19:01:52 +10:00
|
|
|
case BlockRISCV64BGE:
|
|
|
|
|
// match: (BGE (MOVDconst [0]) cond yes no)
|
|
|
|
|
// result: (BLEZ cond yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64MOVDconst {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
if auxIntToInt64(v_0.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
cond := b.Controls[1]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BLEZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BGE cond (MOVDconst [0]) yes no)
|
|
|
|
|
// result: (BGEZ cond yes no)
|
|
|
|
|
for b.Controls[1].Op == OpRISCV64MOVDconst {
|
|
|
|
|
cond := b.Controls[0]
|
|
|
|
|
v_1 := b.Controls[1]
|
|
|
|
|
if auxIntToInt64(v_1.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b.resetWithControl(BlockRISCV64BGEZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
case BlockRISCV64BLT:
|
|
|
|
|
// match: (BLT (MOVDconst [0]) cond yes no)
|
|
|
|
|
// result: (BGTZ cond yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64MOVDconst {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
if auxIntToInt64(v_0.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
cond := b.Controls[1]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BGTZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BLT cond (MOVDconst [0]) yes no)
|
|
|
|
|
// result: (BLTZ cond yes no)
|
|
|
|
|
for b.Controls[1].Op == OpRISCV64MOVDconst {
|
|
|
|
|
cond := b.Controls[0]
|
|
|
|
|
v_1 := b.Controls[1]
|
|
|
|
|
if auxIntToInt64(v_1.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b.resetWithControl(BlockRISCV64BLTZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-31 02:04:45 +11:00
|
|
|
case BlockRISCV64BNE:
|
|
|
|
|
// match: (BNE (MOVDconst [0]) cond yes no)
|
|
|
|
|
// result: (BNEZ cond yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64MOVDconst {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
if auxIntToInt64(v_0.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
cond := b.Controls[1]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BNE cond (MOVDconst [0]) yes no)
|
|
|
|
|
// result: (BNEZ cond yes no)
|
|
|
|
|
for b.Controls[1].Op == OpRISCV64MOVDconst {
|
|
|
|
|
cond := b.Controls[0]
|
|
|
|
|
v_1 := b.Controls[1]
|
|
|
|
|
if auxIntToInt64(v_1.AuxInt) != 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, cond)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-31 02:00:50 +11:00
|
|
|
case BlockRISCV64BNEZ:
|
2020-03-31 02:04:45 +11:00
|
|
|
// match: (BNEZ (SEQZ x) yes no)
|
|
|
|
|
// result: (BEQZ x yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SEQZ {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BEQZ, x)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2020-03-31 02:00:50 +11:00
|
|
|
// match: (BNEZ (SNEZ x) yes no)
|
|
|
|
|
// result: (BNEZ x yes no)
|
2020-03-02 04:25:54 +11:00
|
|
|
for b.Controls[0].Op == OpRISCV64SNEZ {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := v_0.Args[0]
|
2020-03-31 02:00:50 +11:00
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, x)
|
2020-03-02 04:25:54 +11:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-28 06:08:02 +10:00
|
|
|
// match: (BNEZ (NEG x) yes no)
|
|
|
|
|
// result: (BNEZ x yes no)
|
2021-08-17 19:09:33 +10:00
|
|
|
for b.Controls[0].Op == OpRISCV64NEG {
|
2022-08-28 06:08:02 +10:00
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, x)
|
2021-08-17 19:09:33 +10:00
|
|
|
return true
|
|
|
|
|
}
|
2022-08-28 02:32:06 +10:00
|
|
|
// match: (BNEZ (FNES <t> x y) yes no)
|
|
|
|
|
// result: (BEQZ (FEQS <t> x y) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64FNES {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
t := v_0.Type
|
|
|
|
|
_ = v_0.Args[1]
|
|
|
|
|
v_0_0 := v_0.Args[0]
|
|
|
|
|
v_0_1 := v_0.Args[1]
|
|
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
|
|
|
|
|
x := v_0_0
|
|
|
|
|
y := v_0_1
|
|
|
|
|
v0 := b.NewValue0(v_0.Pos, OpRISCV64FEQS, t)
|
|
|
|
|
v0.AddArg2(x, y)
|
|
|
|
|
b.resetWithControl(BlockRISCV64BEQZ, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// match: (BNEZ (FNED <t> x y) yes no)
|
|
|
|
|
// result: (BEQZ (FEQD <t> x y) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64FNED {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
t := v_0.Type
|
|
|
|
|
_ = v_0.Args[1]
|
|
|
|
|
v_0_0 := v_0.Args[0]
|
|
|
|
|
v_0_1 := v_0.Args[1]
|
|
|
|
|
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
|
|
|
|
|
x := v_0_0
|
|
|
|
|
y := v_0_1
|
|
|
|
|
v0 := b.NewValue0(v_0.Pos, OpRISCV64FEQD, t)
|
|
|
|
|
v0.AddArg2(x, y)
|
|
|
|
|
b.resetWithControl(BlockRISCV64BEQZ, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-03-31 02:04:45 +11:00
|
|
|
// match: (BNEZ (SUB x y) yes no)
|
|
|
|
|
// result: (BNE x y yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SUB {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BNE, x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BNEZ (SLT x y) yes no)
|
|
|
|
|
// result: (BLT x y yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLT {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BLT, x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BNEZ (SLTU x y) yes no)
|
|
|
|
|
// result: (BLTU x y yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLTU {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
y := v_0.Args[1]
|
|
|
|
|
x := v_0.Args[0]
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BLTU, x, y)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2022-09-05 03:03:35 +10:00
|
|
|
// match: (BNEZ (SLTI [x] y) yes no)
|
|
|
|
|
// result: (BLT y (MOVDconst [x]) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLTI {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
y := v_0.Args[0]
|
|
|
|
|
v0 := b.NewValue0(b.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(x)
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BLT, y, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// match: (BNEZ (SLTIU [x] y) yes no)
|
|
|
|
|
// result: (BLTU y (MOVDconst [x]) yes no)
|
|
|
|
|
for b.Controls[0].Op == OpRISCV64SLTIU {
|
|
|
|
|
v_0 := b.Controls[0]
|
|
|
|
|
x := auxIntToInt64(v_0.AuxInt)
|
|
|
|
|
y := v_0.Args[0]
|
|
|
|
|
v0 := b.NewValue0(b.Pos, OpRISCV64MOVDconst, typ.UInt64)
|
|
|
|
|
v0.AuxInt = int64ToAuxInt(x)
|
|
|
|
|
b.resetWithControl2(BlockRISCV64BLTU, y, v0)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2019-11-04 04:40:47 +11:00
|
|
|
case BlockIf:
|
|
|
|
|
// match: (If cond yes no)
|
2022-05-10 10:49:33 -04:00
|
|
|
// result: (BNEZ (MOVBUreg <typ.UInt64> cond) yes no)
|
2019-11-04 04:40:47 +11:00
|
|
|
for {
|
|
|
|
|
cond := b.Controls[0]
|
2022-05-10 10:49:33 -04:00
|
|
|
v0 := b.NewValue0(cond.Pos, OpRISCV64MOVBUreg, typ.UInt64)
|
|
|
|
|
v0.AddArg(cond)
|
|
|
|
|
b.resetWithControl(BlockRISCV64BNEZ, v0)
|
2019-11-04 04:40:47 +11:00
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|