go/src/cmd/compile/internal/gc/asm_test.go

1089 lines
20 KiB
Go
Raw Normal View History

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gc
import (
"bytes"
"fmt"
"internal/testenv"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strings"
"testing"
)
// This file contains code generation tests.
//
// Each test is defined in a variable of type asmTest. Tests are
// architecture-specific, and they are grouped in arrays of tests, one
// for each architecture.
//
// Each asmTest consists of a function to compile, an array of
// positive regexps that must match the generated assembly and
// an array of negative regexps that must not match generated assembly.
// For example, the following amd64 test
//
// {
// fn: `
// func f0(x int) int {
// return x * 64
// }
// `,
// pos: []string{"\tSHLQ\t[$]6,"},
// neg: []string{"MULQ"},
// }
//
// verifies that the code the compiler generates for a multiplication
// by 64 contains a 'SHLQ' instruction and does not contain a MULQ.
//
// Since all the tests for a given architecture are dumped in the same
// file, the function names must be unique. As a workaround for this
// restriction, the test harness supports the use of a '$' placeholder
// for function names. The func f0 above can also be written as
//
// {
// fn: `
// func $(x int) int {
// return x * 64
// }
// `,
// pos: []string{"\tSHLQ\t[$]6,"},
// neg: []string{"MULQ"},
// }
//
// Each '$'-function will be given a unique name of form f<N>_<arch>,
// where <N> is the test index in the test array, and <arch> is the
// test's architecture.
//
// It is allowed to mix named and unnamed functions in the same test
// array; the named functions will retain their original names.
// TestAssembly checks to make sure the assembly generated for
// functions contains certain expected instructions.
//
// For every entry of allAsmTests it compiles the generated source once,
// then looks up the listing of each test function and checks it against
// the test's positive and negative regexps. Functions declared with the
// '$' placeholder are looked up as f<N>_<arch>; all others are looked up
// under the name that follows the first "func " in their source.
func TestAssembly(t *testing.T) {
	testenv.MustHaveGoBuild(t)
	if runtime.GOOS == "windows" {
		// TODO: remove if we can get "go tool compile -S" to work on windows.
		t.Skip("skipping test: recursive windows compile not working")
	}
	dir, err := ioutil.TempDir("", "TestAssembly")
	if err != nil {
		t.Fatalf("could not create directory: %v", err)
	}
	defer os.RemoveAll(dir)

	// The capture group holds the function name without the "func " prefix.
	nameRegexp := regexp.MustCompile("func (\\w+)")

	// The parallel subtests are wrapped in a group: t.Run only returns once
	// the parallel subtests started inside it have completed, so the
	// deferred RemoveAll above cannot run while they still use dir.
	t.Run("platform", func(t *testing.T) {
		for _, ats := range allAsmTests {
			ats := ats // capture a per-iteration copy for the parallel closure
			t.Run(ats.os+"/"+ats.arch, func(tt *testing.T) {
				tt.Parallel()

				asm := ats.compileToAsm(tt, dir)

				for i, at := range ats.tests {
					var funcName string
					if strings.Contains(at.fn, "func $") {
						funcName = fmt.Sprintf("f%d_%s", i, ats.arch)
					} else {
						m := nameRegexp.FindStringSubmatch(at.fn)
						if m == nil {
							// Report a malformed test instead of panicking
							// on an out-of-range slice expression.
							tt.Errorf("could not find function name in test %d:\n%s", i, at.fn)
							continue
						}
						funcName = m[1]
					}
					// funcAsm has already reported an error if it returns "".
					if fa := funcAsm(tt, asm, funcName); fa != "" {
						at.verifyAsm(tt, fa)
					}
				}
			})
		}
	})
}
// nextTextRegexp matches a newline that is directly followed by a
// non-whitespace character, i.e. the beginning of an unindented line.
var nextTextRegexp = regexp.MustCompile(`\n\S`)

// funcAsm returns the part of asm that belongs to funcName: everything
// from the function's TEXT directive up to, but not including, the
// newline that precedes the next unindented line. If asm has no TEXT
// directive for funcName, an error is reported on t and "" is returned.
func funcAsm(t *testing.T, asm string, funcName string) string {
	header := fmt.Sprintf("TEXT\t\"\".%s(SB)", funcName)
	start := strings.Index(asm, header)
	if start < 0 {
		t.Errorf("could not find assembly for function %v", funcName)
		return ""
	}
	listing := asm[start:]

	// Cut the listing off where the next unindented line starts; without
	// such a line the listing runs to the end of asm.
	if end := nextTextRegexp.FindStringIndex(listing); end != nil {
		return listing[:end[0]]
	}
	return listing
}
// asmTest is a single code generation test: a function to compile and
// the regexps its generated assembly is checked against.
type asmTest struct {
	// function to compile
	fn string
	// regular expressions that must match the generated assembly
	pos []string
	// regular expressions that must not match the generated assembly
	neg []string
}

// verifyAsm checks the assembly listing fa against the test's regexps.
// It reports an error on t for every positive regexp that does not match
// fa and for every negative regexp that does. A regexp that fails to
// compile is reported as such rather than as a mismatch, so that a typo
// in the test table is not mistaken for a code generation regression.
func (at asmTest) verifyAsm(t *testing.T, fa string) {
	for _, r := range at.pos {
		matched, err := regexp.MatchString(r, fa)
		if err != nil {
			t.Errorf("invalid regexp %q: %v\ngo:%s\n", r, err, at.fn)
			continue
		}
		if !matched {
			t.Errorf("expected:%s\ngo:%s\nasm:%s\n", r, at.fn, fa)
		}
	}
	for _, r := range at.neg {
		matched, err := regexp.MatchString(r, fa)
		if err != nil {
			t.Errorf("invalid regexp %q: %v\ngo:%s\n", r, err, at.fn)
			continue
		}
		if matched {
			t.Errorf("not expected:%s\ngo:%s\nasm:%s\n", r, at.fn, fa)
		}
	}
}
// asmTests bundles the code generation tests of one os/arch pair.
type asmTests struct {
	// arch and os name the target the tests are compiled for.
	arch string
	os   string
	// imports lists the packages imported by the generated source file.
	imports []string
	// tests are the functions to compile together with their regexps.
	tests []*asmTest
}

// generateCode returns the source of a "package main" file that imports
// every package in ats.imports and contains the function of every test.
// The first "func $" of a test is rewritten to "func f<N>_<arch>", where
// <N> is the index of the test in ats.tests.
func (ats *asmTests) generateCode() []byte {
	src := new(bytes.Buffer)
	fmt.Fprintln(src, "package main")
	for _, pkg := range ats.imports {
		fmt.Fprintf(src, "import %q\n", pkg)
	}
	for idx := range ats.tests {
		unique := fmt.Sprintf("func f%d_%s", idx, ats.arch)
		fmt.Fprintln(src, strings.Replace(ats.tests[idx].fn, "func $", unique, 1))
	}
	return src.Bytes()
}
// compileToAsm compiles the generated source of ats and returns the
// assembly listing that "go tool compile -S" writes to stdout.
//
// The work is done in a new subdirectory of dir named <arch>_<os>, which
// receives the source file, one archive per imported package and the
// object file. Any failure aborts the test through t.Fatalf.
func (ats *asmTests) compileToAsm(t *testing.T, dir string) string {
	// Every arch/os pair gets a scratch directory of its own.
	testDir := filepath.Join(dir, fmt.Sprintf("%s_%s", ats.arch, ats.os))
	if err := os.Mkdir(testDir, 0700); err != nil {
		t.Fatalf("could not create directory: %v", err)
	}

	// Write out the source file that holds all test functions.
	src := filepath.Join(testDir, "test.go")
	if err := ioutil.WriteFile(src, ats.generateCode(), 0600); err != nil {
		t.Fatalf("error writing code: %v", err)
	}

	// Build the imported packages first. This produces the export data
	// that the compile step below finds through "-I testDir".
	for _, pkg := range ats.imports {
		archive := filepath.Join(testDir, pkg+".a")
		stdout := ats.runGo(t, "build", "-o", archive, "-gcflags=-dolinkobj=false", pkg)
		if stdout != "" {
			t.Fatalf("Stdout = %s\nWant empty", stdout)
		}
	}

	// Finally compile the test file itself; -S prints the assembly.
	obj := filepath.Join(testDir, "out.o")
	return ats.runGo(t, "tool", "compile", "-I", testDir, "-S", "-o", obj, src)
}
// runGo invokes the go command with args and returns what it wrote to
// stdout. The command inherits the current environment with GOARCH and
// GOOS appended as ats.arch and ats.os. The test is aborted through
// t.Fatalf if the command fails or writes anything to stderr.
func (ats *asmTests) runGo(t *testing.T, args ...string) string {
	cmd := exec.Command(testenv.GoToolPath(t), args...)
	cmd.Env = append(os.Environ(), "GOARCH="+ats.arch, "GOOS="+ats.os)

	var outBuf, errBuf bytes.Buffer
	cmd.Stdout, cmd.Stderr = &outBuf, &errBuf

	if err := cmd.Run(); err != nil {
		t.Fatalf("error running cmd: %v\nstdout:\n%sstderr:\n%s\n", err, outBuf.String(), errBuf.String())
	}
	if errText := errBuf.String(); errText != "" {
		t.Fatalf("Stderr = %s\nWant empty", errText)
	}
	return outBuf.String()
}
// allAsmTests lists the os/arch combinations that TestAssembly iterates
// over. Each entry names the target, the packages its generated source
// imports (if any) and the table of tests to compile for that target.
var allAsmTests = []*asmTests{
{
arch: "amd64",
os: "linux",
imports: []string{"unsafe", "runtime"},
tests: linuxAMD64Tests,
},
{
arch: "386",
os: "linux",
tests: linux386Tests,
},
{
arch: "s390x",
os: "linux",
tests: linuxS390XTests,
},
{
arch: "arm",
os: "linux",
imports: []string{"runtime"},
tests: linuxARMTests,
},
{
arch: "arm64",
os: "linux",
tests: linuxARM64Tests,
},
{
arch: "mips",
os: "linux",
tests: linuxMIPSTests,
},
{
arch: "mips64",
os: "linux",
tests: linuxMIPS64Tests,
},
{
arch: "ppc64le",
os: "linux",
tests: linuxPPC64LETests,
},
{
arch: "amd64",
os: "plan9",
tests: plan9AMD64Tests,
},
}
// linuxAMD64Tests is the test table that allAsmTests registers for
// linux/amd64. The entries may refer to the packages "unsafe" and
// "runtime", which allAsmTests lists as imports for this target.
var linuxAMD64Tests = []*asmTest{
// Multiplication by 96 must use a shift by 5 and an LEAQ with scale 2.
{
fn: `
func $(x int) int {
return x * 96
}
`,
pos: []string{"\tSHLQ\t\\$5,", "\tLEAQ\t\\(.*\\)\\(.*\\*2\\),"},
},
{
fn: `
func f33(m map[int]int) int {
return m[5]
}
`,
pos: []string{"\tMOVQ\t[$]5,"},
},
// Direct use of constants in fast map access calls. Issue 19015.
{
fn: `
func f34(m map[int]int) bool {
_, ok := m[5]
return ok
}
`,
pos: []string{"\tMOVQ\t[$]5,"},
},
{
fn: `
func f35(m map[string]int) int {
return m["abc"]
}
`,
pos: []string{"\"abc\""},
},
{
fn: `
func f36(m map[string]int) bool {
_, ok := m["abc"]
return ok
}
`,
pos: []string{"\"abc\""},
},
// Bit test ops on amd64, issue 18943.
{
fn: `
func f37(a, b uint64) int {
if a&(1<<(b&63)) != 0 {
return 1
}
return -1
}
`,
pos: []string{"\tBTQ\t"},
},
{
fn: `
func f38(a, b uint64) bool {
return a&(1<<(b&63)) != 0
}
`,
pos: []string{"\tBTQ\t"},
},
{
fn: `
func f39(a uint64) int {
if a&(1<<60) != 0 {
return 1
}
return -1
}
`,
pos: []string{"\tBTQ\t\\$60"},
},
{
fn: `
func f40(a uint64) bool {
return a&(1<<60) != 0
}
`,
pos: []string{"\tBTQ\t\\$60"},
},
// see issue 19595.
// We want to merge load+op in f58, but not in f59.
{
fn: `
func f58(p, q *int) {
x := *p
*q += x
}`,
pos: []string{"\tADDQ\t\\("},
},
{
fn: `
func f59(p, q *int) {
x := *p
for i := 0; i < 10; i++ {
*q += x
}
}`,
pos: []string{"\tADDQ\t[A-Z]"},
},
// Check that compare to constant string uses 2/4/8 byte compares
{
fn: `
func f65(a string) bool {
return a == "xx"
}`,
pos: []string{"\tCMPW\t\\(.*\\), [$]"},
},
{
fn: `
func f66(a string) bool {
return a == "xxxx"
}`,
pos: []string{"\tCMPL\t\\(.*\\), [$]"},
},
{
fn: `
func f67(a string) bool {
return a == "xxxxxxxx"
}`,
pos: []string{"\tCMPQ\t[A-Z]"},
},
// Check that array compare uses 2/4/8 byte compares
{
fn: `
func f68(a,b [2]byte) bool {
return a == b
}`,
pos: []string{"\tCMPW\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]"},
},
{
fn: `
func f69(a,b [3]uint16) bool {
return a == b
}`,
pos: []string{
"\tCMPL\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]",
"\tCMPW\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]",
},
},
{
fn: `
func $(a,b [3]int16) bool {
return a == b
}`,
pos: []string{
"\tCMPL\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]",
"\tCMPW\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]",
},
},
{
fn: `
func $(a,b [12]int8) bool {
return a == b
}`,
pos: []string{
"\tCMPQ\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]",
"\tCMPL\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]",
},
},
{
fn: `
func f70(a,b [15]byte) bool {
return a == b
}`,
pos: []string{"\tCMPQ\t\"\"[.+_a-z0-9]+\\(SP\\), [A-Z]"},
},
{
fn: `
func f71(a,b unsafe.Pointer) bool { // This was a TODO in mapaccess1_faststr
return *((*[4]byte)(a)) != *((*[4]byte)(b))
}`,
pos: []string{"\tCMPL\t\\(.*\\), [A-Z]"},
},
{
// make sure assembly output has matching offset and base register.
fn: `
func f72(a, b int) int {
runtime.GC() // use some frame
return b
}
`,
pos: []string{"b\\+24\\(SP\\)"},
},
{
// check load combining
fn: `
func f73(a, b byte) (byte,byte) {
return f73(f73(a,b))
}
`,
pos: []string{"\tMOVW\t"},
},
{
fn: `
func f74(a, b uint16) (uint16,uint16) {
return f74(f74(a,b))
}
`,
pos: []string{"\tMOVL\t"},
},
{
fn: `
func f75(a, b uint32) (uint32,uint32) {
return f75(f75(a,b))
}
`,
pos: []string{"\tMOVQ\t"},
},
// Make sure we don't put pointers in SSE registers across safe points.
{
fn: `
func $(p, q *[2]*int) {
a, b := p[0], p[1]
runtime.GC()
q[0], q[1] = a, b
}
`,
neg: []string{"MOVUPS"},
},
{
// check that stack store is optimized away
fn: `
func $() int {
var x int
return *(&x)
}
`,
pos: []string{"TEXT\t.*, [$]0-8"},
},
// int <-> fp moves
// NOTE(review): the remaining entries compare integers and pointers and
// contain no floating-point code, so the heading above looks stale —
// confirm and reword or remove it.
{
fn: `
func $(x uint32) bool {
return x > 4
}
`,
pos: []string{"\tSETHI\t.*\\(SP\\)"},
},
{
fn: `
func $(p int, q *int) bool {
return p < *q
}
`,
pos: []string{"CMPQ\t\\(.*\\), [A-Z]"},
},
{
fn: `
func $(p *int, q int) bool {
return *p < q
}
`,
pos: []string{"CMPQ\t\\(.*\\), [A-Z]"},
},
{
fn: `
func $(p *int) bool {
return *p < 7
}
`,
pos: []string{"CMPQ\t\\(.*\\), [$]7"},
},
{
fn: `
func $(p *int) bool {
return 7 < *p
}
`,
pos: []string{"CMPQ\t\\(.*\\), [$]7"},
},
{
fn: `
func $(p **int) {
*p = nil
}
`,
pos: []string{"CMPL\truntime.writeBarrier\\(SB\\), [$]0"},
},
}
// linux386Tests is the test table that allAsmTests registers for
// linux/386.
var linux386Tests = []*asmTest{
{
// check that stack store is optimized away
fn: `
func $() int {
var x int
return *(&x)
}
`,
pos: []string{"TEXT\t.*, [$]0-4"},
},
}
// linuxS390XTests is the test table that allAsmTests registers for
// linux/s390x.
var linuxS390XTests = []*asmTest{
{
// check that stack store is optimized away
fn: `
func $() int {
var x int
return *(&x)
}
`,
pos: []string{"TEXT\t.*, [$]0-8"},
},
}
// linuxARMTests is the test table that allAsmTests registers for
// linux/arm. The entries may refer to the package "runtime", which
// allAsmTests lists as an import for this target.
var linuxARMTests = []*asmTest{
{
// make sure assembly output has matching offset and base register.
fn: `
func f13(a, b int) int {
runtime.GC() // use some frame
return b
}
`,
pos: []string{"b\\+4\\(FP\\)"},
},
{
// check that stack store is optimized away
fn: `
func $() int {
var x int
return *(&x)
}
`,
pos: []string{"TEXT\t.*, [$]-4-4"},
},
}
var linuxARM64Tests = []*asmTest{
cmd/compile: optimize ARM64 code with EON/ORN EON and ORN are efficient ARM64 instructions. EON combines (x ^ ^y) into a single operation, and so ORN does for (x | ^y). This CL implements that optimization. And here are benchmark results with RaspberryPi3/ArchLinux. 1. A specific test gets about 13% improvement. EONORN 181µs ± 0% 157µs ± 0% -13.26% (p=0.000 n=26+23) (https://github.com/benshi001/ugo1/blob/master/eonorn_test.go) 2. There is little change in the go1 benchmark, excluding noise. name old time/op new time/op delta BinaryTree17-4 44.1s ± 2% 44.0s ± 2% ~ (p=0.513 n=30+30) Fannkuch11-4 32.9s ± 3% 32.8s ± 3% -0.12% (p=0.024 n=30+30) FmtFprintfEmpty-4 561ns ± 9% 558ns ± 9% ~ (p=0.654 n=30+30) FmtFprintfString-4 1.09µs ± 4% 1.09µs ± 3% ~ (p=0.158 n=30+30) FmtFprintfInt-4 1.12µs ± 0% 1.12µs ± 0% ~ (p=0.917 n=23+28) FmtFprintfIntInt-4 1.73µs ± 0% 1.76µs ± 4% ~ (p=0.665 n=23+30) FmtFprintfPrefixedInt-4 2.15µs ± 1% 2.15µs ± 0% ~ (p=0.389 n=27+26) FmtFprintfFloat-4 3.18µs ± 4% 3.13µs ± 0% -1.50% (p=0.003 n=30+23) FmtManyArgs-4 7.32µs ± 4% 7.21µs ± 0% ~ (p=0.220 n=30+25) GobDecode-4 99.1ms ± 9% 97.0ms ± 0% -2.07% (p=0.000 n=30+23) GobEncode-4 83.3ms ± 3% 82.4ms ± 4% ~ (p=0.321 n=30+30) Gzip-4 4.39s ± 4% 4.32s ± 2% -1.42% (p=0.017 n=30+23) Gunzip-4 440ms ± 0% 447ms ± 4% +1.54% (p=0.006 n=24+30) HTTPClientServer-4 547µs ± 1% 537µs ± 1% -1.91% (p=0.000 n=30+30) JSONEncode-4 211ms ± 0% 211ms ± 0% +0.04% (p=0.000 n=23+24) JSONDecode-4 847ms ± 0% 847ms ± 0% ~ (p=0.158 n=25+25) Mandelbrot200-4 46.5ms ± 0% 46.5ms ± 0% -0.04% (p=0.000 n=25+24) GoParse-4 43.4ms ± 0% 43.4ms ± 0% ~ (p=0.494 n=24+25) RegexpMatchEasy0_32-4 1.03µs ± 0% 1.03µs ± 0% ~ (all equal) RegexpMatchEasy0_1K-4 4.02µs ± 3% 3.98µs ± 0% -0.95% (p=0.003 n=30+24) RegexpMatchEasy1_32-4 1.01µs ± 3% 1.01µs ± 2% ~ (p=0.629 n=30+30) RegexpMatchEasy1_1K-4 6.39µs ± 0% 6.39µs ± 0% ~ (p=0.564 n=24+23) RegexpMatchMedium_32-4 1.80µs ± 3% 1.78µs ± 0% ~ (p=0.155 n=30+24) RegexpMatchMedium_1K-4 555µs ± 0% 563µs ± 3% +1.55% 
(p=0.004 n=27+30) RegexpMatchHard_32-4 31.0µs ± 4% 30.5µs ± 1% -1.58% (p=0.000 n=30+23) RegexpMatchHard_1K-4 947µs ± 4% 931µs ± 0% -1.66% (p=0.009 n=30+24) Revcomp-4 7.71s ± 4% 7.71s ± 4% ~ (p=0.196 n=29+30) Template-4 877ms ± 0% 878ms ± 0% +0.16% (p=0.018 n=23+27) TimeParse-4 4.75µs ± 1% 4.74µs ± 0% ~ (p=0.895 n=24+23) TimeFormat-4 4.83µs ± 4% 4.83µs ± 4% ~ (p=0.767 n=30+30) [Geo mean] 709µs 707µs -0.35% name old speed new speed delta GobDecode-4 7.75MB/s ± 8% 7.91MB/s ± 0% +2.03% (p=0.001 n=30+23) GobEncode-4 9.22MB/s ± 3% 9.32MB/s ± 4% ~ (p=0.389 n=30+30) Gzip-4 4.43MB/s ± 4% 4.43MB/s ± 4% ~ (p=0.888 n=30+30) Gunzip-4 44.1MB/s ± 0% 43.4MB/s ± 4% -1.46% (p=0.009 n=24+30) JSONEncode-4 9.18MB/s ± 0% 9.18MB/s ± 0% ~ (p=0.308 n=16+24) JSONDecode-4 2.29MB/s ± 0% 2.29MB/s ± 0% ~ (all equal) GoParse-4 1.33MB/s ± 0% 1.33MB/s ± 0% ~ (all equal) RegexpMatchEasy0_32-4 30.9MB/s ± 0% 30.9MB/s ± 0% ~ (p=1.000 n=23+24) RegexpMatchEasy0_1K-4 255MB/s ± 3% 257MB/s ± 0% +0.92% (p=0.004 n=30+24) RegexpMatchEasy1_32-4 31.7MB/s ± 3% 31.6MB/s ± 2% ~ (p=0.603 n=30+30) RegexpMatchEasy1_1K-4 160MB/s ± 0% 160MB/s ± 0% ~ (p=0.435 n=24+23) RegexpMatchMedium_32-4 554kB/s ± 3% 560kB/s ± 0% +1.08% (p=0.004 n=30+24) RegexpMatchMedium_1K-4 1.85MB/s ± 0% 1.82MB/s ± 3% -1.48% (p=0.001 n=27+30) RegexpMatchHard_32-4 1.03MB/s ± 4% 1.05MB/s ± 1% +1.51% (p=0.027 n=30+23) RegexpMatchHard_1K-4 1.08MB/s ± 4% 1.10MB/s ± 0% +1.69% (p=0.002 n=30+25) Revcomp-4 33.0MB/s ± 4% 33.0MB/s ± 4% ~ (p=0.272 n=29+30) Template-4 2.21MB/s ± 0% 2.21MB/s ± 0% ~ (all equal) [Geo mean] 7.75MB/s 7.77MB/s +0.29% 3. There is little regression in the compilecmp benchmark. 
name old time/op new time/op delta Template 2.28s ± 3% 2.28s ± 4% ~ (p=0.739 n=10+10) Unicode 1.34s ± 4% 1.32s ± 3% ~ (p=0.113 n=10+9) GoTypes 8.10s ± 3% 8.18s ± 3% ~ (p=0.393 n=10+10) Compiler 39.0s ± 3% 39.2s ± 3% ~ (p=0.393 n=10+10) SSA 114s ± 3% 115s ± 2% ~ (p=0.631 n=10+10) Flate 1.41s ± 2% 1.42s ± 3% ~ (p=0.353 n=10+10) GoParser 1.81s ± 1% 1.83s ± 2% ~ (p=0.211 n=10+9) Reflect 5.06s ± 2% 5.06s ± 2% ~ (p=0.912 n=10+10) Tar 2.19s ± 3% 2.20s ± 3% ~ (p=0.247 n=10+10) XML 2.65s ± 2% 2.67s ± 5% ~ (p=0.796 n=10+10) [Geo mean] 4.92s 4.93s +0.27% name old user-time/op new user-time/op delta Template 2.81s ± 2% 2.81s ± 3% ~ (p=0.971 n=10+10) Unicode 1.70s ± 3% 1.67s ± 5% ~ (p=0.315 n=10+10) GoTypes 9.71s ± 1% 9.78s ± 1% +0.71% (p=0.023 n=10+10) Compiler 47.3s ± 1% 47.1s ± 3% ~ (p=0.579 n=10+10) SSA 143s ± 2% 143s ± 2% ~ (p=0.280 n=10+10) Flate 1.70s ± 3% 1.71s ± 3% ~ (p=0.481 n=10+10) GoParser 2.21s ± 3% 2.21s ± 1% ~ (p=0.549 n=10+9) Reflect 5.89s ± 1% 5.87s ± 2% ~ (p=0.739 n=10+10) Tar 2.66s ± 2% 2.63s ± 2% ~ (p=0.105 n=10+10) XML 3.16s ± 3% 3.18s ± 2% ~ (p=0.143 n=10+10) [Geo mean] 5.97s 5.97s -0.06% name old text-bytes new text-bytes delta HelloSize 637kB ± 0% 637kB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 9.46kB ± 0% 9.46kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 125kB ± 0% 125kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.24MB ± 0% 1.24MB ± 0% ~ (all equal) Change-Id: Ie27357d65c5ce9d07afdffebe1e2daadcaa3369f Reviewed-on: https://go-review.googlesource.com/97036 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-25 09:10:54 +00:00
{
fn: `
func $(x, y uint32) uint32 {
return x &^ y
}
`,
pos: []string{"\tBIC\t"},
neg: []string{"\tAND\t"},
},
{
fn: `
func $(x, y uint32) uint32 {
return x ^ ^y
}
`,
pos: []string{"\tEON\t"},
neg: []string{"\tXOR\t"},
},
{
fn: `
func $(x, y uint32) uint32 {
return x | ^y
}
`,
pos: []string{"\tORN\t"},
neg: []string{"\tORR\t"},
},
{
fn: `
func f34(a uint64) uint64 {
return a & ((1<<63)-1)
}
`,
pos: []string{"\tAND\t"},
},
{
fn: `
func f35(a uint64) uint64 {
return a & (1<<63)
}
`,
pos: []string{"\tAND\t"},
},
cmd/internal/obj/arm64, cmd/compile: improve offset folding on ARM64 ARM64 assembler backend only accepts loads and stores with small or aligned offset. The compiler therefore can only fold small or aligned offsets into loads and stores. For locals and args, their offsets to SP are not known until very late, and the compiler makes conservative decision not folding some of them. However, in most cases, the offset is indeed small or aligned, and can be folded into load and store (but actually not). This CL adds support of loads and stores with large and unaligned offsets. When the offset doesn't fit into the instruction, it uses two instructions and (for very large offset) the constant pool. This way, the compiler doesn't need to be conservative, and can simply fold the offset. To make it work, the assembler's optab matching rules need to be changed. Before, MOVD accepts C_UAUTO32K which matches multiple of 8 between 0 and 32K, and also C_UAUTO16K, which may not be multiple of 8 and does not fit into MOVD instruction. The assembler errors in the latter case. This change makes it only matches multiple of 8 (or offsets within ±256, which also fits in instruction), and uses the large-or-unaligned-offset rule for things doesn't fit (without error). Other sized move rules are changed similarly. Class C_UAUTO64K and C_UOREG64K are removed, as they are never used. In shared library, load/store of global is rewritten to using GOT and temp register, which conflicts with the use of temp register for assembling large offset. So the folding is disabled for globals in shared library mode. Reduce cmd/go binary size by 2%. 
name old time/op new time/op delta BinaryTree17-8 8.67s ± 0% 8.61s ± 0% -0.60% (p=0.000 n=9+10) Fannkuch11-8 6.24s ± 0% 6.19s ± 0% -0.83% (p=0.000 n=10+9) FmtFprintfEmpty-8 116ns ± 0% 116ns ± 0% ~ (all equal) FmtFprintfString-8 196ns ± 0% 192ns ± 0% -1.89% (p=0.000 n=10+10) FmtFprintfInt-8 199ns ± 0% 198ns ± 0% -0.35% (p=0.001 n=9+10) FmtFprintfIntInt-8 294ns ± 0% 293ns ± 0% -0.34% (p=0.000 n=8+8) FmtFprintfPrefixedInt-8 318ns ± 1% 318ns ± 1% ~ (p=1.000 n=10+10) FmtFprintfFloat-8 537ns ± 0% 531ns ± 0% -1.17% (p=0.000 n=9+10) FmtManyArgs-8 1.19µs ± 1% 1.18µs ± 1% -1.41% (p=0.001 n=10+10) GobDecode-8 17.2ms ± 1% 17.3ms ± 2% ~ (p=0.165 n=10+10) GobEncode-8 14.7ms ± 1% 14.7ms ± 2% ~ (p=0.631 n=10+10) Gzip-8 837ms ± 0% 836ms ± 0% -0.14% (p=0.006 n=9+10) Gunzip-8 141ms ± 0% 139ms ± 0% -1.24% (p=0.000 n=9+10) HTTPClientServer-8 256µs ± 1% 253µs ± 1% -1.35% (p=0.000 n=10+10) JSONEncode-8 40.1ms ± 1% 41.3ms ± 1% +3.06% (p=0.000 n=10+9) JSONDecode-8 157ms ± 1% 156ms ± 1% -0.83% (p=0.001 n=9+8) Mandelbrot200-8 8.94ms ± 0% 8.94ms ± 0% +0.02% (p=0.000 n=9+9) GoParse-8 8.69ms ± 0% 8.54ms ± 1% -1.69% (p=0.000 n=8+10) RegexpMatchEasy0_32-8 227ns ± 1% 228ns ± 1% +0.48% (p=0.016 n=10+9) RegexpMatchEasy0_1K-8 1.92µs ± 0% 1.63µs ± 0% -15.08% (p=0.000 n=10+9) RegexpMatchEasy1_32-8 256ns ± 0% 251ns ± 0% -2.19% (p=0.000 n=10+9) RegexpMatchEasy1_1K-8 2.38µs ± 0% 2.09µs ± 0% -12.49% (p=0.000 n=10+9) RegexpMatchMedium_32-8 352ns ± 0% 354ns ± 0% +0.39% (p=0.002 n=10+9) RegexpMatchMedium_1K-8 106µs ± 0% 106µs ± 0% -0.05% (p=0.005 n=10+9) RegexpMatchHard_32-8 5.92µs ± 0% 5.89µs ± 0% -0.40% (p=0.000 n=9+8) RegexpMatchHard_1K-8 180µs ± 0% 179µs ± 0% -0.14% (p=0.000 n=10+9) Revcomp-8 1.20s ± 0% 1.13s ± 0% -6.29% (p=0.000 n=9+8) Template-8 159ms ± 1% 154ms ± 1% -3.14% (p=0.000 n=9+10) TimeParse-8 800ns ± 3% 769ns ± 1% -3.91% (p=0.000 n=10+10) TimeFormat-8 826ns ± 2% 817ns ± 2% -1.04% (p=0.050 n=10+10) [Geo mean] 145µs 143µs -1.79% Change-Id: I5fc42087cee9b54ea414f8ef6d6d020b80eb5985 Reviewed-on: 
https://go-review.googlesource.com/42172 Run-TryBot: Cherry Zhang <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2017-04-28 18:02:00 -04:00
{
// make sure offsets are folded into load and store.
fn: `
cmd/internal/obj/arm64, cmd/compile: improve offset folding on ARM64 ARM64 assembler backend only accepts loads and stores with small or aligned offset. The compiler therefore can only fold small or aligned offsets into loads and stores. For locals and args, their offsets to SP are not known until very late, and the compiler makes conservative decision not folding some of them. However, in most cases, the offset is indeed small or aligned, and can be folded into load and store (but actually not). This CL adds support of loads and stores with large and unaligned offsets. When the offset doesn't fit into the instruction, it uses two instructions and (for very large offset) the constant pool. This way, the compiler doesn't need to be conservative, and can simply fold the offset. To make it work, the assembler's optab matching rules need to be changed. Before, MOVD accepts C_UAUTO32K which matches multiple of 8 between 0 and 32K, and also C_UAUTO16K, which may not be multiple of 8 and does not fit into MOVD instruction. The assembler errors in the latter case. This change makes it only matches multiple of 8 (or offsets within ±256, which also fits in instruction), and uses the large-or-unaligned-offset rule for things doesn't fit (without error). Other sized move rules are changed similarly. Class C_UAUTO64K and C_UOREG64K are removed, as they are never used. In shared library, load/store of global is rewritten to using GOT and temp register, which conflicts with the use of temp register for assembling large offset. So the folding is disabled for globals in shared library mode. Reduce cmd/go binary size by 2%. 
name old time/op new time/op delta BinaryTree17-8 8.67s ± 0% 8.61s ± 0% -0.60% (p=0.000 n=9+10) Fannkuch11-8 6.24s ± 0% 6.19s ± 0% -0.83% (p=0.000 n=10+9) FmtFprintfEmpty-8 116ns ± 0% 116ns ± 0% ~ (all equal) FmtFprintfString-8 196ns ± 0% 192ns ± 0% -1.89% (p=0.000 n=10+10) FmtFprintfInt-8 199ns ± 0% 198ns ± 0% -0.35% (p=0.001 n=9+10) FmtFprintfIntInt-8 294ns ± 0% 293ns ± 0% -0.34% (p=0.000 n=8+8) FmtFprintfPrefixedInt-8 318ns ± 1% 318ns ± 1% ~ (p=1.000 n=10+10) FmtFprintfFloat-8 537ns ± 0% 531ns ± 0% -1.17% (p=0.000 n=9+10) FmtManyArgs-8 1.19µs ± 1% 1.18µs ± 1% -1.41% (p=0.001 n=10+10) GobDecode-8 17.2ms ± 1% 17.3ms ± 2% ~ (p=0.165 n=10+10) GobEncode-8 14.7ms ± 1% 14.7ms ± 2% ~ (p=0.631 n=10+10) Gzip-8 837ms ± 0% 836ms ± 0% -0.14% (p=0.006 n=9+10) Gunzip-8 141ms ± 0% 139ms ± 0% -1.24% (p=0.000 n=9+10) HTTPClientServer-8 256µs ± 1% 253µs ± 1% -1.35% (p=0.000 n=10+10) JSONEncode-8 40.1ms ± 1% 41.3ms ± 1% +3.06% (p=0.000 n=10+9) JSONDecode-8 157ms ± 1% 156ms ± 1% -0.83% (p=0.001 n=9+8) Mandelbrot200-8 8.94ms ± 0% 8.94ms ± 0% +0.02% (p=0.000 n=9+9) GoParse-8 8.69ms ± 0% 8.54ms ± 1% -1.69% (p=0.000 n=8+10) RegexpMatchEasy0_32-8 227ns ± 1% 228ns ± 1% +0.48% (p=0.016 n=10+9) RegexpMatchEasy0_1K-8 1.92µs ± 0% 1.63µs ± 0% -15.08% (p=0.000 n=10+9) RegexpMatchEasy1_32-8 256ns ± 0% 251ns ± 0% -2.19% (p=0.000 n=10+9) RegexpMatchEasy1_1K-8 2.38µs ± 0% 2.09µs ± 0% -12.49% (p=0.000 n=10+9) RegexpMatchMedium_32-8 352ns ± 0% 354ns ± 0% +0.39% (p=0.002 n=10+9) RegexpMatchMedium_1K-8 106µs ± 0% 106µs ± 0% -0.05% (p=0.005 n=10+9) RegexpMatchHard_32-8 5.92µs ± 0% 5.89µs ± 0% -0.40% (p=0.000 n=9+8) RegexpMatchHard_1K-8 180µs ± 0% 179µs ± 0% -0.14% (p=0.000 n=10+9) Revcomp-8 1.20s ± 0% 1.13s ± 0% -6.29% (p=0.000 n=9+8) Template-8 159ms ± 1% 154ms ± 1% -3.14% (p=0.000 n=9+10) TimeParse-8 800ns ± 3% 769ns ± 1% -3.91% (p=0.000 n=10+10) TimeFormat-8 826ns ± 2% 817ns ± 2% -1.04% (p=0.050 n=10+10) [Geo mean] 145µs 143µs -1.79% Change-Id: I5fc42087cee9b54ea414f8ef6d6d020b80eb5985 Reviewed-on: 
https://go-review.googlesource.com/42172 Run-TryBot: Cherry Zhang <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2017-04-28 18:02:00 -04:00
func f36(_, a [20]byte) (b [20]byte) {
b = a
return
}
`,
pos: []string{"\tMOVD\t\"\"\\.a\\+[0-9]+\\(FP\\), R[0-9]+", "\tMOVD\tR[0-9]+, \"\"\\.b\\+[0-9]+\\(FP\\)"},
cmd/internal/obj/arm64, cmd/compile: improve offset folding on ARM64 ARM64 assembler backend only accepts loads and stores with small or aligned offset. The compiler therefore can only fold small or aligned offsets into loads and stores. For locals and args, their offsets to SP are not known until very late, and the compiler makes conservative decision not folding some of them. However, in most cases, the offset is indeed small or aligned, and can be folded into load and store (but actually not). This CL adds support of loads and stores with large and unaligned offsets. When the offset doesn't fit into the instruction, it uses two instructions and (for very large offset) the constant pool. This way, the compiler doesn't need to be conservative, and can simply fold the offset. To make it work, the assembler's optab matching rules need to be changed. Before, MOVD accepts C_UAUTO32K which matches multiple of 8 between 0 and 32K, and also C_UAUTO16K, which may not be multiple of 8 and does not fit into MOVD instruction. The assembler errors in the latter case. This change makes it only matches multiple of 8 (or offsets within ±256, which also fits in instruction), and uses the large-or-unaligned-offset rule for things doesn't fit (without error). Other sized move rules are changed similarly. Class C_UAUTO64K and C_UOREG64K are removed, as they are never used. In shared library, load/store of global is rewritten to using GOT and temp register, which conflicts with the use of temp register for assembling large offset. So the folding is disabled for globals in shared library mode. Reduce cmd/go binary size by 2%. 
name old time/op new time/op delta BinaryTree17-8 8.67s ± 0% 8.61s ± 0% -0.60% (p=0.000 n=9+10) Fannkuch11-8 6.24s ± 0% 6.19s ± 0% -0.83% (p=0.000 n=10+9) FmtFprintfEmpty-8 116ns ± 0% 116ns ± 0% ~ (all equal) FmtFprintfString-8 196ns ± 0% 192ns ± 0% -1.89% (p=0.000 n=10+10) FmtFprintfInt-8 199ns ± 0% 198ns ± 0% -0.35% (p=0.001 n=9+10) FmtFprintfIntInt-8 294ns ± 0% 293ns ± 0% -0.34% (p=0.000 n=8+8) FmtFprintfPrefixedInt-8 318ns ± 1% 318ns ± 1% ~ (p=1.000 n=10+10) FmtFprintfFloat-8 537ns ± 0% 531ns ± 0% -1.17% (p=0.000 n=9+10) FmtManyArgs-8 1.19µs ± 1% 1.18µs ± 1% -1.41% (p=0.001 n=10+10) GobDecode-8 17.2ms ± 1% 17.3ms ± 2% ~ (p=0.165 n=10+10) GobEncode-8 14.7ms ± 1% 14.7ms ± 2% ~ (p=0.631 n=10+10) Gzip-8 837ms ± 0% 836ms ± 0% -0.14% (p=0.006 n=9+10) Gunzip-8 141ms ± 0% 139ms ± 0% -1.24% (p=0.000 n=9+10) HTTPClientServer-8 256µs ± 1% 253µs ± 1% -1.35% (p=0.000 n=10+10) JSONEncode-8 40.1ms ± 1% 41.3ms ± 1% +3.06% (p=0.000 n=10+9) JSONDecode-8 157ms ± 1% 156ms ± 1% -0.83% (p=0.001 n=9+8) Mandelbrot200-8 8.94ms ± 0% 8.94ms ± 0% +0.02% (p=0.000 n=9+9) GoParse-8 8.69ms ± 0% 8.54ms ± 1% -1.69% (p=0.000 n=8+10) RegexpMatchEasy0_32-8 227ns ± 1% 228ns ± 1% +0.48% (p=0.016 n=10+9) RegexpMatchEasy0_1K-8 1.92µs ± 0% 1.63µs ± 0% -15.08% (p=0.000 n=10+9) RegexpMatchEasy1_32-8 256ns ± 0% 251ns ± 0% -2.19% (p=0.000 n=10+9) RegexpMatchEasy1_1K-8 2.38µs ± 0% 2.09µs ± 0% -12.49% (p=0.000 n=10+9) RegexpMatchMedium_32-8 352ns ± 0% 354ns ± 0% +0.39% (p=0.002 n=10+9) RegexpMatchMedium_1K-8 106µs ± 0% 106µs ± 0% -0.05% (p=0.005 n=10+9) RegexpMatchHard_32-8 5.92µs ± 0% 5.89µs ± 0% -0.40% (p=0.000 n=9+8) RegexpMatchHard_1K-8 180µs ± 0% 179µs ± 0% -0.14% (p=0.000 n=10+9) Revcomp-8 1.20s ± 0% 1.13s ± 0% -6.29% (p=0.000 n=9+8) Template-8 159ms ± 1% 154ms ± 1% -3.14% (p=0.000 n=9+10) TimeParse-8 800ns ± 3% 769ns ± 1% -3.91% (p=0.000 n=10+10) TimeFormat-8 826ns ± 2% 817ns ± 2% -1.04% (p=0.050 n=10+10) [Geo mean] 145µs 143µs -1.79% Change-Id: I5fc42087cee9b54ea414f8ef6d6d020b80eb5985 Reviewed-on: 
https://go-review.googlesource.com/42172 Run-TryBot: Cherry Zhang <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2017-04-28 18:02:00 -04:00
},
{
// check that stack store is optimized away
fn: `
func $() int {
var x int
return *(&x)
}
`,
pos: []string{"TEXT\t.*, [$]-8-8"},
},
{
// check that we don't emit comparisons for constant shift
fn: `
//go:nosplit
func $(x int) int {
return x << 17
}
`,
pos: []string{"LSL\t\\$17"},
neg: []string{"CMP"},
},
cmd/compile: generate tbz/tbnz when comparing against zero on arm64 The tbz/tbnz checks the sign bit to determine if the value is >= 0 or < 0. go1 benchmark results: name old speed new speed delta JSONEncode 94.4MB/s ± 1% 95.7MB/s ± 0% +1.36% (p=0.000 n=10+9) JSONDecode 19.7MB/s ± 1% 19.9MB/s ± 1% +1.08% (p=0.000 n=9+10) Gzip 45.5MB/s ± 0% 46.0MB/s ± 0% +1.06% (p=0.000 n=10+10) Revcomp 376MB/s ± 0% 379MB/s ± 0% +0.69% (p=0.000 n=10+10) RegexpMatchHard_1K 12.6MB/s ± 0% 12.7MB/s ± 0% +0.57% (p=0.000 n=10+8) RegexpMatchMedium_32 3.21MB/s ± 0% 3.22MB/s ± 0% +0.31% (p=0.000 n=9+10) RegexpMatchEasy1_1K 1.27GB/s ± 0% 1.27GB/s ± 0% +0.23% (p=0.000 n=9+9) RegexpMatchHard_32 11.4MB/s ± 0% 11.4MB/s ± 1% +0.19% (p=0.036 n=10+8) RegexpMatchEasy0_1K 1.77GB/s ± 0% 1.77GB/s ± 0% +0.13% (p=0.000 n=9+10) RegexpMatchMedium_1K 19.3MB/s ± 0% 19.3MB/s ± 0% +0.04% (p=0.008 n=10+8) RegexpMatchEasy0_32 131MB/s ± 0% 131MB/s ± 0% ~ (p=0.211 n=10+10) GobDecode 57.5MB/s ± 1% 57.6MB/s ± 2% ~ (p=0.469 n=10+10) GobEncode 58.6MB/s ± 1% 58.5MB/s ± 2% ~ (p=0.781 n=10+10) GoParse 9.40MB/s ± 0% 9.39MB/s ± 0% -0.19% (p=0.005 n=10+9) RegexpMatchEasy1_32 133MB/s ± 0% 133MB/s ± 0% -0.48% (p=0.000 n=10+10) Template 20.9MB/s ± 0% 20.6MB/s ± 0% -1.54% (p=0.000 n=8+10) Change-Id: I411efe44db35c3962445618d5a47c12e31b3925b Reviewed-on: https://go-review.googlesource.com/92715 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-02-07 15:37:33 -05:00
{
fn: `
func $(a int32, ptr *int) {
if a >= 0 {
*ptr = 0
}
}
`,
pos: []string{"TBNZ"},
},
{
fn: `
func $(a int64, ptr *int) {
if a >= 0 {
*ptr = 0
}
}
`,
pos: []string{"TBNZ"},
},
{
fn: `
func $(a int32, ptr *int) {
if a < 0 {
*ptr = 0
}
}
`,
pos: []string{"TBZ"},
},
{
fn: `
func $(a int64, ptr *int) {
if a < 0 {
*ptr = 0
}
}
`,
pos: []string{"TBZ"},
},
cmd/compile: improve absorb shifts optimization for arm64 Current absorb shifts optimization can generate dead Value nodes which increase use count of other live nodes. It will impact other optimizations (such as combined loads) which are enabled based on specific use count. This patch fixes the issue by decreasing the use count of nodes referenced by dead Value nodes generated by absorb shifts optimization. Performance impacts on go1 benchmarks (data collected on A57@2GHzx8): name old time/op new time/op delta BinaryTree17-8 6.28s ± 2% 6.24s ± 1% ~ (p=0.065 n=10+9) Fannkuch11-8 6.32s ± 0% 6.33s ± 0% +0.17% (p=0.000 n=10+10) FmtFprintfEmpty-8 98.9ns ± 0% 99.2ns ± 0% +0.34% (p=0.000 n=9+7) FmtFprintfString-8 183ns ± 1% 182ns ± 1% -1.01% (p=0.005 n=9+10) FmtFprintfInt-8 199ns ± 1% 202ns ± 1% +1.41% (p=0.000 n=10+9) FmtFprintfIntInt-8 272ns ± 1% 276ns ± 3% +1.36% (p=0.015 n=10+10) FmtFprintfPrefixedInt-8 367ns ± 1% 369ns ± 1% +0.68% (p=0.042 n=10+10) FmtFprintfFloat-8 491ns ± 1% 493ns ± 1% ~ (p=0.064 n=10+10) FmtManyArgs-8 1.31µs ± 1% 1.32µs ± 1% +0.39% (p=0.042 n=8+9) GobDecode-8 17.0ms ± 2% 16.2ms ± 2% -4.74% (p=0.000 n=10+10) GobEncode-8 13.7ms ± 2% 13.4ms ± 1% -2.40% (p=0.000 n=10+9) Gzip-8 844ms ± 0% 737ms ± 0% -12.70% (p=0.000 n=10+10) Gunzip-8 84.4ms ± 1% 83.9ms ± 0% -0.55% (p=0.000 n=10+8) HTTPClientServer-8 122µs ± 1% 124µs ± 1% +1.75% (p=0.000 n=10+9) JSONEncode-8 34.9ms ± 1% 32.4ms ± 0% -7.11% (p=0.000 n=10+9) JSONDecode-8 150ms ± 0% 146ms ± 1% -2.84% (p=0.000 n=7+10) Mandelbrot200-8 10.0ms ± 0% 10.0ms ± 0% ~ (p=0.529 n=10+10) GoParse-8 8.18ms ± 1% 8.03ms ± 0% -1.93% (p=0.000 n=10+10) RegexpMatchEasy0_32-8 209ns ± 0% 209ns ± 0% ~ (p=0.248 n=10+9) RegexpMatchEasy0_1K-8 789ns ± 1% 790ns ± 0% ~ (p=0.361 n=10+10) RegexpMatchEasy1_32-8 202ns ± 0% 202ns ± 1% ~ (p=0.137 n=8+10) RegexpMatchEasy1_1K-8 1.12µs ± 2% 1.12µs ± 1% ~ (p=0.810 n=10+10) RegexpMatchMedium_32-8 298ns ± 0% 298ns ± 0% ~ (p=0.443 n=10+9) RegexpMatchMedium_1K-8 83.0µs ± 5% 78.6µs ± 0% -5.37% 
(p=0.000 n=10+10) RegexpMatchHard_32-8 4.32µs ± 0% 4.26µs ± 0% -1.47% (p=0.000 n=10+10) RegexpMatchHard_1K-8 132µs ± 4% 126µs ± 0% -4.41% (p=0.000 n=10+9) Revcomp-8 1.11s ± 0% 1.11s ± 0% +0.14% (p=0.017 n=10+9) Template-8 155ms ± 1% 155ms ± 1% ~ (p=0.796 n=10+10) TimeParse-8 774ns ± 1% 785ns ± 1% +1.41% (p=0.001 n=10+10) TimeFormat-8 788ns ± 1% 806ns ± 1% +2.24% (p=0.000 n=10+9) name old speed new speed delta GobDecode-8 45.2MB/s ± 2% 47.5MB/s ± 2% +4.96% (p=0.000 n=10+10) GobEncode-8 56.0MB/s ± 2% 57.4MB/s ± 1% +2.44% (p=0.000 n=10+9) Gzip-8 23.0MB/s ± 0% 26.3MB/s ± 0% +14.55% (p=0.000 n=10+10) Gunzip-8 230MB/s ± 1% 231MB/s ± 0% +0.55% (p=0.000 n=10+8) JSONEncode-8 55.6MB/s ± 1% 59.9MB/s ± 0% +7.65% (p=0.000 n=10+9) JSONDecode-8 12.9MB/s ± 0% 13.3MB/s ± 1% +2.94% (p=0.000 n=7+10) GoParse-8 7.08MB/s ± 1% 7.22MB/s ± 0% +1.95% (p=0.000 n=10+10) RegexpMatchEasy0_32-8 153MB/s ± 0% 153MB/s ± 0% -0.16% (p=0.023 n=10+10) RegexpMatchEasy0_1K-8 1.30GB/s ± 1% 1.30GB/s ± 0% ~ (p=0.393 n=10+10) RegexpMatchEasy1_32-8 158MB/s ± 0% 158MB/s ± 0% ~ (p=0.684 n=10+10) RegexpMatchEasy1_1K-8 915MB/s ± 2% 918MB/s ± 1% ~ (p=0.796 n=10+10) RegexpMatchMedium_32-8 3.35MB/s ± 0% 3.35MB/s ± 0% ~ (p=1.000 n=10+9) RegexpMatchMedium_1K-8 12.3MB/s ± 5% 13.0MB/s ± 0% +5.56% (p=0.000 n=10+10) RegexpMatchHard_32-8 7.40MB/s ± 0% 7.51MB/s ± 0% +1.50% (p=0.000 n=10+10) RegexpMatchHard_1K-8 7.75MB/s ± 4% 8.10MB/s ± 0% +4.52% (p=0.000 n=10+8) Revcomp-8 229MB/s ± 0% 228MB/s ± 0% -0.14% (p=0.017 n=10+9) Template-8 12.5MB/s ± 1% 12.5MB/s ± 1% ~ (p=0.780 n=10+10) Change-Id: I103389f168eac79f6af44e8fef93acc2a7a4ac96 Reviewed-on: https://go-review.googlesource.com/88415 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-02-15 14:49:03 -05:00
// Load-combining tests.
{
fn: `
func $(s []byte) uint16 {
return uint16(s[0]) | uint16(s[1]) << 8
}
`,
pos: []string{"\tMOVHU\t\\(R[0-9]+\\)"},
neg: []string{"ORR\tR[0-9]+<<8\t"},
},
cmd/compile/internal/ssa: emit csel on arm64 Introduce a new SSA pass to generate CondSelect intstrutions, and add CondSelect lowering rules for arm64. In order to make the CSEL instruction easier to optimize, and to simplify the introduction of CSNEG, CSINC, and CSINV in the future, modify the CSEL instruction to accept a condition code in the aux field. Notably, this change makes the go1 Gzip benchmark more than 10% faster. Benchmarks on a Cavium ThunderX: name old time/op new time/op delta BinaryTree17-96 15.9s ± 6% 16.0s ± 4% ~ (p=0.968 n=10+9) Fannkuch11-96 7.17s ± 0% 7.00s ± 0% -2.43% (p=0.000 n=8+9) FmtFprintfEmpty-96 208ns ± 1% 207ns ± 0% ~ (p=0.152 n=10+8) FmtFprintfString-96 379ns ± 0% 375ns ± 0% -0.95% (p=0.000 n=10+9) FmtFprintfInt-96 385ns ± 0% 383ns ± 0% -0.52% (p=0.000 n=9+10) FmtFprintfIntInt-96 591ns ± 0% 586ns ± 0% -0.85% (p=0.006 n=7+9) FmtFprintfPrefixedInt-96 656ns ± 0% 667ns ± 0% +1.71% (p=0.000 n=10+10) FmtFprintfFloat-96 967ns ± 0% 984ns ± 0% +1.78% (p=0.000 n=10+10) FmtManyArgs-96 2.35µs ± 0% 2.25µs ± 0% -4.63% (p=0.000 n=9+8) GobDecode-96 31.0ms ± 0% 30.8ms ± 0% -0.36% (p=0.006 n=9+9) GobEncode-96 24.4ms ± 0% 24.5ms ± 0% +0.30% (p=0.000 n=9+9) Gzip-96 1.60s ± 0% 1.43s ± 0% -10.58% (p=0.000 n=9+10) Gunzip-96 167ms ± 0% 169ms ± 0% +0.83% (p=0.000 n=8+9) HTTPClientServer-96 311µs ± 1% 308µs ± 0% -0.75% (p=0.000 n=10+10) JSONEncode-96 65.0ms ± 0% 64.8ms ± 0% -0.25% (p=0.000 n=9+8) JSONDecode-96 262ms ± 1% 261ms ± 1% ~ (p=0.579 n=10+10) Mandelbrot200-96 18.0ms ± 0% 18.1ms ± 0% +0.17% (p=0.000 n=8+10) GoParse-96 14.0ms ± 0% 14.1ms ± 1% +0.42% (p=0.003 n=9+10) RegexpMatchEasy0_32-96 644ns ± 2% 645ns ± 2% ~ (p=0.836 n=10+10) RegexpMatchEasy0_1K-96 3.70µs ± 0% 3.49µs ± 0% -5.58% (p=0.000 n=10+10) RegexpMatchEasy1_32-96 662ns ± 2% 657ns ± 2% ~ (p=0.137 n=10+10) RegexpMatchEasy1_1K-96 4.47µs ± 0% 4.31µs ± 0% -3.48% (p=0.000 n=10+10) RegexpMatchMedium_32-96 844ns ± 2% 849ns ± 1% ~ (p=0.208 n=10+10) RegexpMatchMedium_1K-96 179µs ± 0% 182µs ± 0% +1.20% 
(p=0.000 n=10+10) RegexpMatchHard_32-96 10.0µs ± 0% 10.1µs ± 0% +0.48% (p=0.000 n=10+9) RegexpMatchHard_1K-96 297µs ± 0% 297µs ± 0% -0.14% (p=0.000 n=10+10) Revcomp-96 3.08s ± 0% 3.13s ± 0% +1.56% (p=0.000 n=9+9) Template-96 276ms ± 2% 275ms ± 1% ~ (p=0.393 n=10+10) TimeParse-96 1.37µs ± 0% 1.36µs ± 0% -0.53% (p=0.000 n=10+7) TimeFormat-96 1.40µs ± 0% 1.42µs ± 0% +0.97% (p=0.000 n=10+10) [Geo mean] 264µs 262µs -0.77% Change-Id: Ie54eee4b3092af53e6da3baa6d1755098f57f3a2 Reviewed-on: https://go-review.googlesource.com/55670 Run-TryBot: Philip Hofer <phofer@umich.edu> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com> Reviewed-by: Keith Randall <khr@golang.org>
2017-08-13 22:36:47 +00:00
{
// make sure that CSEL is emitted for conditional moves
fn: `
func f37(c int) int {
x := c + 4
if c < 0 {
x = 182
}
return x
}
`,
pos: []string{"\tCSEL\t"},
},
cmd/compile/internal/ssa: combine zero stores into larger stores on arm64 This reduces the go tool binary on arm64 by 12k. go1 results on Amberwing: name old time/op new time/op delta RegexpMatchEasy0_32 249ns ± 0% 249ns ± 0% ~ (p=0.087 n=10+10) RegexpMatchEasy0_1K 584ns ± 0% 584ns ± 0% ~ (all equal) RegexpMatchEasy1_32 246ns ± 0% 246ns ± 0% ~ (p=1.000 n=10+10) RegexpMatchEasy1_1K 806ns ± 0% 806ns ± 0% ~ (p=0.706 n=10+9) RegexpMatchMedium_32 314ns ± 0% 314ns ± 0% ~ (all equal) RegexpMatchMedium_1K 52.1µs ± 0% 52.1µs ± 0% ~ (p=0.245 n=10+8) RegexpMatchHard_32 2.75µs ± 1% 2.75µs ± 1% ~ (p=0.690 n=10+10) RegexpMatchHard_1K 78.9µs ± 0% 78.9µs ± 1% ~ (p=0.295 n=9+9) FmtFprintfEmpty 58.5ns ± 0% 58.5ns ± 0% ~ (all equal) FmtFprintfString 112ns ± 0% 112ns ± 0% ~ (all equal) FmtFprintfInt 117ns ± 0% 116ns ± 0% -0.85% (p=0.000 n=10+10) FmtFprintfIntInt 181ns ± 0% 181ns ± 0% ~ (all equal) FmtFprintfPrefixedInt 222ns ± 0% 224ns ± 0% +0.90% (p=0.000 n=9+10) FmtFprintfFloat 318ns ± 1% 322ns ± 0% ~ (p=0.059 n=10+8) FmtManyArgs 736ns ± 1% 735ns ± 0% ~ (p=0.206 n=9+9) Gzip 437ms ± 0% 436ms ± 0% -0.25% (p=0.000 n=10+10) HTTPClientServer 89.8µs ± 1% 90.2µs ± 2% ~ (p=0.393 n=10+10) JSONEncode 20.1ms ± 1% 20.2ms ± 1% ~ (p=0.065 n=9+10) JSONDecode 94.2ms ± 1% 93.9ms ± 1% -0.42% (p=0.043 n=10+10) GobDecode 12.7ms ± 1% 12.8ms ± 2% +0.94% (p=0.019 n=10+10) GobEncode 12.1ms ± 0% 12.1ms ± 0% ~ (p=0.052 n=10+10) Mandelbrot200 5.06ms ± 0% 5.05ms ± 0% -0.04% (p=0.000 n=9+10) TimeParse 450ns ± 3% 446ns ± 0% ~ (p=0.238 n=10+9) TimeFormat 485ns ± 1% 483ns ± 1% ~ (p=0.073 n=10+10) Template 90.4ms ± 0% 90.7ms ± 0% +0.29% (p=0.000 n=8+10) GoParse 6.01ms ± 0% 6.03ms ± 0% +0.35% (p=0.000 n=10+10) BinaryTree17 11.7s ± 0% 11.7s ± 0% ~ (p=0.481 n=10+10) Revcomp 669ms ± 0% 669ms ± 0% ~ (p=0.315 n=10+10) Fannkuch11 3.40s ± 0% 3.37s ± 0% -0.92% (p=0.000 n=10+10) [Geo mean] 67.9µs 67.9µs +0.02% name old speed new speed delta RegexpMatchEasy0_32 128MB/s ± 0% 128MB/s ± 0% -0.08% (p=0.003 n=8+10) 
RegexpMatchEasy0_1K 1.75GB/s ± 0% 1.75GB/s ± 0% ~ (p=0.642 n=8+10) RegexpMatchEasy1_32 130MB/s ± 0% 130MB/s ± 0% ~ (p=0.690 n=10+9) RegexpMatchEasy1_1K 1.27GB/s ± 0% 1.27GB/s ± 0% ~ (p=0.661 n=10+9) RegexpMatchMedium_32 3.18MB/s ± 0% 3.18MB/s ± 0% ~ (all equal) RegexpMatchMedium_1K 19.7MB/s ± 0% 19.6MB/s ± 0% ~ (p=0.190 n=10+9) RegexpMatchHard_32 11.6MB/s ± 0% 11.6MB/s ± 1% ~ (p=0.669 n=10+10) RegexpMatchHard_1K 13.0MB/s ± 0% 13.0MB/s ± 0% ~ (p=0.718 n=9+9) Gzip 44.4MB/s ± 0% 44.5MB/s ± 0% +0.24% (p=0.000 n=10+10) JSONEncode 96.5MB/s ± 1% 96.1MB/s ± 1% ~ (p=0.065 n=9+10) JSONDecode 20.6MB/s ± 1% 20.7MB/s ± 1% +0.42% (p=0.041 n=10+10) GobDecode 60.6MB/s ± 1% 60.0MB/s ± 2% -0.92% (p=0.016 n=10+10) GobEncode 63.4MB/s ± 0% 63.6MB/s ± 0% ~ (p=0.055 n=10+10) Template 21.5MB/s ± 0% 21.4MB/s ± 0% -0.30% (p=0.000 n=9+10) GoParse 9.64MB/s ± 0% 9.61MB/s ± 0% -0.36% (p=0.000 n=10+10) Revcomp 380MB/s ± 0% 380MB/s ± 0% ~ (p=0.323 n=10+10) [Geo mean] 56.0MB/s 55.9MB/s -0.07% Change-Id: Ia732fa57fbcf4767d72382516d9f16705d177736 Reviewed-on: https://go-review.googlesource.com/96435 Run-TryBot: Cherry Zhang <cherryyz@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-02-23 15:17:54 -05:00
// Check that zero stores are combined into larger stores.
{
fn: `
func $(b []byte) {
_ = b[1] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
}
`,
pos: []string{"MOVH\tZR"},
neg: []string{"MOVB"},
},
{
fn: `
func $(b []byte) {
_ = b[1] // early bounds check to guarantee safety of writes below
b[1] = 0
b[0] = 0
}
`,
pos: []string{"MOVH\tZR"},
neg: []string{"MOVB"},
},
{
fn: `
func $(b []byte) {
_ = b[3] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
b[2] = 0
b[3] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(b []byte) {
_ = b[3] // early bounds check to guarantee safety of writes below
b[2] = 0
b[3] = 0
b[1] = 0
b[0] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(h []uint16) {
_ = h[1] // early bounds check to guarantee safety of writes below
h[0] = 0
h[1] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(h []uint16) {
_ = h[1] // early bounds check to guarantee safety of writes below
h[1] = 0
h[0] = 0
}
`,
pos: []string{"MOVW\tZR"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(b []byte) {
_ = b[7] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
b[2] = 0
b[3] = 0
b[4] = 0
b[5] = 0
b[6] = 0
b[7] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(h []uint16) {
_ = h[3] // early bounds check to guarantee safety of writes below
h[0] = 0
h[1] = 0
h[2] = 0
h[3] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(h []uint16) {
_ = h[3] // early bounds check to guarantee safety of writes below
h[2] = 0
h[3] = 0
h[1] = 0
h[0] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(w []uint32) {
_ = w[1] // early bounds check to guarantee safety of writes below
w[0] = 0
w[1] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(w []uint32) {
_ = w[1] // early bounds check to guarantee safety of writes below
w[1] = 0
w[0] = 0
}
`,
pos: []string{"MOVD\tZR"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(b []byte) {
_ = b[15] // early bounds check to guarantee safety of writes below
b[0] = 0
b[1] = 0
b[2] = 0
b[3] = 0
b[4] = 0
b[5] = 0
b[6] = 0
b[7] = 0
b[8] = 0
b[9] = 0
b[10] = 0
b[11] = 0
b[12] = 0
b[13] = 0
b[15] = 0
b[14] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(h []uint16) {
_ = h[7] // early bounds check to guarantee safety of writes below
h[0] = 0
h[1] = 0
h[2] = 0
h[3] = 0
h[4] = 0
h[5] = 0
h[6] = 0
h[7] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(w []uint32) {
_ = w[3] // early bounds check to guarantee safety of writes below
w[0] = 0
w[1] = 0
w[2] = 0
w[3] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(w []uint32) {
_ = w[3] // early bounds check to guarantee safety of writes below
w[1] = 0
w[0] = 0
w[3] = 0
w[2] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(d []uint64) {
_ = d[1] // early bounds check to guarantee safety of writes below
d[0] = 0
d[1] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
{
fn: `
func $(d []uint64) {
_ = d[1] // early bounds check to guarantee safety of writes below
d[1] = 0
d[0] = 0
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH"},
},
cmd/compile: improve fractional word zeroing This change improves fractional word zeroing by using overlapping MOVDs for the fractions. Performance of go1 benchmarks on Amberwing was all noise: name old time/op new time/op delta RegexpMatchEasy0_32 247ns ± 0% 246ns ± 0% -0.40% (p=0.008 n=5+5) RegexpMatchEasy0_1K 581ns ± 0% 579ns ± 0% -0.34% (p=0.000 n=5+4) RegexpMatchEasy1_32 244ns ± 0% 242ns ± 0% ~ (p=0.079 n=4+5) RegexpMatchEasy1_1K 804ns ± 0% 805ns ± 0% ~ (p=0.238 n=5+4) RegexpMatchMedium_32 313ns ± 0% 311ns ± 0% -0.64% (p=0.008 n=5+5) RegexpMatchMedium_1K 52.2µs ± 0% 51.9µs ± 0% -0.52% (p=0.016 n=5+4) RegexpMatchHard_32 2.75µs ± 0% 2.74µs ± 0% ~ (p=0.603 n=5+5) RegexpMatchHard_1K 78.8µs ± 0% 78.9µs ± 0% +0.05% (p=0.008 n=5+5) FmtFprintfEmpty 58.6ns ± 0% 58.6ns ± 0% ~ (p=0.159 n=5+5) FmtFprintfString 118ns ± 0% 119ns ± 0% +0.85% (p=0.008 n=5+5) FmtFprintfInt 119ns ± 0% 123ns ± 0% +3.36% (p=0.016 n=5+4) FmtFprintfIntInt 192ns ± 0% 200ns ± 0% +4.17% (p=0.008 n=5+5) FmtFprintfPrefixedInt 224ns ± 0% 209ns ± 0% -6.70% (p=0.008 n=5+5) FmtFprintfFloat 335ns ± 0% 335ns ± 0% ~ (all equal) FmtManyArgs 775ns ± 0% 811ns ± 1% +4.67% (p=0.016 n=4+5) Gzip 437ms ± 0% 438ms ± 0% +0.19% (p=0.008 n=5+5) HTTPClientServer 88.7µs ± 1% 90.3µs ± 1% +1.75% (p=0.016 n=5+5) JSONEncode 20.1ms ± 1% 20.1ms ± 0% ~ (p=1.000 n=5+5) JSONDecode 94.7ms ± 1% 94.8ms ± 1% ~ (p=0.548 n=5+5) GobDecode 12.8ms ± 1% 12.8ms ± 1% ~ (p=0.548 n=5+5) GobEncode 12.1ms ± 0% 12.1ms ± 0% ~ (p=0.151 n=5+5) Mandelbrot200 5.37ms ± 0% 5.37ms ± 0% -0.03% (p=0.008 n=5+5) TimeParse 450ns ± 0% 451ns ± 1% ~ (p=0.635 n=4+5) TimeFormat 485ns ± 0% 484ns ± 0% ~ (p=0.508 n=5+5) Template 90.4ms ± 0% 90.2ms ± 0% -0.24% (p=0.016 n=5+5) GoParse 5.98ms ± 0% 5.98ms ± 0% ~ (p=1.000 n=5+5) BinaryTree17 11.8s ± 0% 11.8s ± 0% ~ (p=0.841 n=5+5) Revcomp 669ms ± 0% 669ms ± 0% ~ (p=0.310 n=5+5) Fannkuch11 3.28s ± 0% 3.34s ± 0% +1.64% (p=0.008 n=5+5) name old speed new speed delta RegexpMatchEasy0_32 129MB/s ± 0% 130MB/s ± 0% +0.30% (p=0.016 
n=4+5) RegexpMatchEasy0_1K 1.76GB/s ± 0% 1.77GB/s ± 0% +0.27% (p=0.016 n=5+4) RegexpMatchEasy1_32 131MB/s ± 0% 132MB/s ± 0% +0.71% (p=0.016 n=4+5) RegexpMatchEasy1_1K 1.27GB/s ± 0% 1.27GB/s ± 0% -0.17% (p=0.016 n=5+4) RegexpMatchMedium_32 3.19MB/s ± 0% 3.21MB/s ± 0% +0.63% (p=0.008 n=5+5) RegexpMatchMedium_1K 19.6MB/s ± 0% 19.7MB/s ± 0% +0.52% (p=0.016 n=5+4) RegexpMatchHard_32 11.7MB/s ± 0% 11.7MB/s ± 0% ~ (p=0.643 n=5+5) RegexpMatchHard_1K 13.0MB/s ± 0% 13.0MB/s ± 0% ~ (p=0.079 n=4+5) Gzip 44.4MB/s ± 0% 44.3MB/s ± 0% -0.19% (p=0.008 n=5+5) JSONEncode 96.3MB/s ± 1% 96.4MB/s ± 0% ~ (p=1.000 n=5+5) JSONDecode 20.5MB/s ± 1% 20.5MB/s ± 1% ~ (p=0.460 n=5+5) GobDecode 60.1MB/s ± 1% 59.9MB/s ± 1% ~ (p=0.548 n=5+5) GobEncode 63.5MB/s ± 0% 63.7MB/s ± 0% ~ (p=0.135 n=5+5) Template 21.5MB/s ± 0% 21.5MB/s ± 0% +0.24% (p=0.016 n=5+5) GoParse 9.68MB/s ± 0% 9.69MB/s ± 0% ~ (p=0.786 n=5+5) Revcomp 380MB/s ± 0% 380MB/s ± 0% ~ (p=0.310 n=5+5) Change-Id: I596eee6421cdbad1a0189cdb9fe0628bba534eaf Reviewed-on: https://go-review.googlesource.com/96775 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-02-23 13:28:48 -05:00
{
fn: `
func $(a *[39]byte) {
*a = [39]byte{}
}
`,
pos: []string{"MOVD"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
{
fn: `
func $(a *[30]byte) {
*a = [30]byte{}
}
`,
pos: []string{"STP"},
neg: []string{"MOVB", "MOVH", "MOVW"},
},
}
// linuxMIPSTests is the table of code generation tests for a MIPS target
// (presumably linux/mips, going by the name; the code that consumes this
// table is outside this view). Each entry pairs a function to compile with
// regexps that must match the generated assembly.
var linuxMIPSTests = []*asmTest{
{
// Check that the stack store is optimized away: x is only ever read back
// through its own address, so no store to a stack slot should be needed.
// NOTE(review): the pattern pins "$-4-4" on the TEXT line; presumably this
// is the frame/argument size annotation of a function with no stack frame
// on a 32-bit target. Confirm against the assembly listing format.
fn: `
func $() int {
var x int
return *(&x)
}
`,
pos: []string{"TEXT\t.*, [$]-4-4"},
},
}
// linuxMIPS64Tests is the table of code generation tests for a 64-bit MIPS
// target (presumably linux/mips64, going by the name; the code that consumes
// this table is outside this view).
var linuxMIPS64Tests = []*asmTest{
{
// Check that we don't emit comparisons for a constant shift: the shift
// count 17 is known at compile time, so the assembly must contain an SLLV
// by the immediate $17 and must not contain SGT.
// NOTE(review): SGT is presumably the comparison a variable shift count
// would need for its range check. Confirm against the MIPS64 backend.
fn: `
func $(x int) int {
return x << 17
}
`,
pos: []string{"SLLV\t\\$17"},
neg: []string{"SGT"},
},
}
// linuxPPC64LETests is the table of code generation tests for a little-endian
// PPC64 target (presumably linux/ppc64le, going by the name; the code that
// consumes this table is outside this view).
var linuxPPC64LETests = []*asmTest{
{
// Check that the stack store is optimized away: x is only ever read back
// through its own address, so no store to a stack slot should be needed.
// NOTE(review): the pattern pins "$0-8" on the TEXT line; presumably this
// means a zero-byte frame with 8 bytes of arguments/results. Confirm
// against the assembly listing format.
fn: `
func $() int {
var x int
return *(&x)
}
`,
pos: []string{"TEXT\t.*, [$]0-8"},
},
}
// plan9AMD64Tests is the table of code generation tests for Plan 9 on amd64
// (going by the name and the comment below; the code that consumes this table
// is outside this view).
var plan9AMD64Tests = []*asmTest{
// We should make sure that the compiler doesn't generate floating point
// instructions for non-float operations on Plan 9, because floating point
// operations are not allowed in the note handler.
//
// Note that both entries below only carry positive patterns: they require
// MOVQ instructions to be present, but no neg pattern asserts the absence of
// any particular floating point instruction.

// Array zeroing: the 16-byte result must be cleared with MOVQ $0 stores.
{
fn: `
func $() [16]byte {
var a [16]byte
return a
}
`,
pos: []string{"\tMOVQ\t\\$0, \"\""},
},
// Array copy: the 16-byte argument must be loaded from a's stack slot into
// AX or CX with MOVQ and stored to b's stack slot with MOVQ.
{
fn: `
func $(a [16]byte) (b [16]byte) {
b = a
return
}
`,
pos: []string{"\tMOVQ\t\"\"\\.a\\+[0-9]+\\(SP\\), (AX|CX)", "\tMOVQ\t(AX|CX), \"\"\\.b\\+[0-9]+\\(SP\\)"},
},
}
// TestLineNumber checks that the assembly listing generated by the compiler
// has line numbers; see issue #16214.
//
// It writes issue16214src to a file in a fresh temporary directory, compiles
// it with "go tool compile -S", and reports an error if the combined
// stdout/stderr of the compiler contains the text "unknown line number".
// The temporary directory, including the object file, is removed when the
// test returns.
func TestLineNumber(t *testing.T) {
	testenv.MustHaveGoBuild(t)

	dir, err := ioutil.TempDir("", "TestLineNumber")
	if err != nil {
		t.Fatalf("could not create directory: %v", err)
	}
	defer os.RemoveAll(dir)

	src := filepath.Join(dir, "x.go")
	if err := ioutil.WriteFile(src, []byte(issue16214src), 0644); err != nil {
		t.Fatalf("could not write file: %v", err)
	}

	// Send the object file into dir as well so that it is cleaned up with it.
	cmd := exec.Command(testenv.GoToolPath(t), "tool", "compile", "-S", "-o", filepath.Join(dir, "out.o"), src)
	out, err := cmd.CombinedOutput()
	if err != nil {
		// err alone is usually just the exit status; the compiler's own
		// diagnostics are in out, so report them too.
		t.Fatalf("fail to run go tool compile: %v\n%s", err, out)
	}

	if strings.Contains(string(out), "unknown line number") {
		t.Errorf("line number missing in assembly:\n%s", out)
	}
}
// issue16214src is the Go program that TestLineNumber writes to disk and
// compiles with -S. The raw string must contain nothing but valid Go source,
// because it is handed to the compiler verbatim.
var issue16214src = `
package main
func Mod32(x uint32) uint32 {
return x % 3 // frontend rewrites it as HMUL with 2863311531, the LITERAL node has unknown Pos
}
`