cmd/compile: improve LoweredZero performance for ppc64x

This change improves the performance of the LoweredZero rule
on ppc64x.

The improvement can be seen in the runtime ClearFat
benchmarks (columns: old, new, delta):

BenchmarkClearFat12-16       2.40          0.69          -71.25%
BenchmarkClearFat16-16       9.98          0.93          -90.68%
BenchmarkClearFat24-16       4.75          0.93          -80.42%
BenchmarkClearFat32-16       6.02          0.93          -84.55%
BenchmarkClearFat40-16       7.19          1.16          -83.87%
BenchmarkClearFat48-16       15.0          1.39          -90.73%
BenchmarkClearFat56-16       9.95          1.62          -83.72%
BenchmarkClearFat64-16       18.0          1.86          -89.67%
BenchmarkClearFat128-16      30.0          8.08          -73.07%
BenchmarkClearFat256-16      52.5          11.3          -78.48%
BenchmarkClearFat512-16      97.0          19.0          -80.41%
BenchmarkClearFat1024-16     244           34.2          -85.98%

Fixes: #19532

Change-Id: If493e28bc1d8e61bc79978498be9f5336a36cd3f
Reviewed-on: https://go-review.googlesource.com/38096
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>
This commit is contained in:
Lynn Boger 2017-03-13 10:16:30 -04:00
parent d972dc2de9
commit 23bd919136
5 changed files with 426 additions and 305 deletions

View file

@ -312,19 +312,37 @@ func init() {
// large or unaligned zeroing
// arg0 = address of memory to zero (in R3, changed as side effect)
// arg1 = address of the last element to zero
// arg2 = mem
// returns mem
// ADD -8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
// MOVDU R0, 8(R3)
// CMP R3, Rarg1
// BLE -2(PC)
//
// each loop iteration clears 4 doublewords (32 bytes); the loop
// is generated only when more than one iteration is needed
//
// MOVD $len/32,R31
// MOVD R31,CTR
// loop:
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)
// ADD R3,32
// BC loop
// remaining doubleword clears generated as needed
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)
// one or more of these to clear remainder < 8 bytes
// MOVW R0,n1(R3)
// MOVH R0,n2(R3)
// MOVB R0,n3(R3)
{
name: "LoweredZero",
aux: "Int64",
argLength: 3,
argLength: 2,
reg: regInfo{
inputs: []regMask{buildReg("R3"), gp},
inputs: []regMask{buildReg("R3")},
clobbers: buildReg("R3"),
},
clobberFlags: true,