// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gc

import (
	"bytes"
	"fmt"
	"internal/testenv"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
	"testing"
)

// This file contains code generation tests.
//
// Each test is defined in a variable of type asmTest. Tests are
// architecture-specific, and they are grouped in arrays of tests, one
// for each architecture.
//
// Each asmTest consists of a function to compile, an array of
// positive regexps that must match the generated assembly and
// an array of negative regexps that must not match the generated assembly.
// For example, the following amd64 test
//
//	{
//		fn: `
//		func f0(x int) int {
//			return x * 64
//		}
//		`,
//		pos: []string{"\tSHLQ\t[$]6,"},
//		neg: []string{"MULQ"}
//	}
//
// verifies that the code the compiler generates for a multiplication
// by 64 contains a 'SHLQ' instruction and does not contain a MULQ.
//
// Since all the tests for a given architecture are dumped in the same
// file, the function names must be unique. As a workaround for this
// restriction, the test harness supports the use of a '$' placeholder
// for function names. The func f0 above can also be written as
//
//	{
//		fn: `
//		func $(x int) int {
//			return x * 64
//		}
//		`,
//		pos: []string{"\tSHLQ\t[$]6,"},
//		neg: []string{"MULQ"}
//	}
//
// Each '$'-function will be given a unique name of the form f<N>_<arch>,
// where <N> is the test index in the test array, and <arch> is the
// test's architecture.
//
// It is allowed to mix named and unnamed functions in the same test
// array; the named functions will retain their original names.

// TestAssembly checks to make sure the assembly generated for
// functions contains certain expected instructions.
func TestAssembly(t *testing.T) {
	testenv.MustHaveGoBuild(t)
	if runtime.GOOS == "windows" {
		// TODO: remove if we can get "go tool compile -S" to work on windows.
		t.Skipf("skipping test: recursive windows compile not working")
	}
	dir, err := ioutil.TempDir("", "TestAssembly")
	if err != nil {
		t.Fatalf("could not create directory: %v", err)
	}
	defer os.RemoveAll(dir)

	nameRegexp := regexp.MustCompile("func \\w+")
	t.Run("platform", func(t *testing.T) {
		for _, ats := range allAsmTests {
			ats := ats
			t.Run(ats.os+"/"+ats.arch, func(tt *testing.T) {
				tt.Parallel()

				asm := ats.compileToAsm(tt, dir)

				for i, at := range ats.tests {
					var funcName string
					if strings.Contains(at.fn, "func $") {
						funcName = fmt.Sprintf("f%d_%s", i, ats.arch)
					} else {
						funcName = nameRegexp.FindString(at.fn)[len("func "):]
					}
					fa := funcAsm(tt, asm, funcName)
					if fa != "" {
						at.verifyAsm(tt, fa)
					}
				}
			})
		}
	})
}
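
// Each platform gets its own subtest named os/arch, so a single target can
// be exercised in isolation. As an illustration (assuming this file's usual
// location under cmd/compile/internal/gc and standard -run matching):
//
//	go test cmd/compile/internal/gc -run 'TestAssembly/platform/linux/amd64'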

var nextTextRegexp = regexp.MustCompile(`\n\S`)

// funcAsm returns the assembly listing for the given function name.
func funcAsm(t *testing.T, asm string, funcName string) string {
	if i := strings.Index(asm, fmt.Sprintf("TEXT\t\"\".%s(SB)", funcName)); i >= 0 {
		asm = asm[i:]
	} else {
		t.Errorf("could not find assembly for function %v", funcName)
		return ""
	}

	// Find the next line that doesn't begin with whitespace.
	loc := nextTextRegexp.FindStringIndex(asm)
	if loc != nil {
		asm = asm[:loc[0]]
	}

	return asm
}
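
// A note on the slicing in funcAsm: in the -S listings produced by the
// toolchains this test targets, instruction lines are indented while each
// function's header starts in column one. Cutting at the first line after
// the matched TEXT directive that begins with a non-whitespace character
// (what nextTextRegexp finds) therefore isolates a single function.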

type asmTest struct {
	// function to compile
	fn string
	// regular expressions that must match the generated assembly
	pos []string
	// regular expressions that must not match the generated assembly
	neg []string
}

func (at asmTest) verifyAsm(t *testing.T, fa string) {
	for _, r := range at.pos {
		if b, err := regexp.MatchString(r, fa); !b || err != nil {
			t.Errorf("expected:%s\ngo:%s\nasm:%s\n", r, at.fn, fa)
		}
	}
	for _, r := range at.neg {
		if b, err := regexp.MatchString(r, fa); b || err != nil {
			t.Errorf("not expected:%s\ngo:%s\nasm:%s\n", r, at.fn, fa)
		}
	}
}

type asmTests struct {
	arch    string
	os      string
	imports []string
	tests   []*asmTest
}

func (ats *asmTests) generateCode() []byte {
	var buf bytes.Buffer
	fmt.Fprintln(&buf, "package main")
	for _, s := range ats.imports {
		fmt.Fprintf(&buf, "import %q\n", s)
	}

	for i, t := range ats.tests {
		function := strings.Replace(t.fn, "func $", fmt.Sprintf("func f%d_%s", i, ats.arch), 1)
		fmt.Fprintln(&buf, function)
	}

	return buf.Bytes()
}
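
// As an illustration (a hypothetical test, not one from the tables below),
// a tests slice containing a single '$' function together with a
// "math/bits" import would generate source along these lines:
//
//	package main
//	import "math/bits"
//	func f0_amd64(x uint64) int {
//		return bits.OnesCount64(x)
//	}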

// compileToAsm compiles a test file for the given architecture and OS and
// returns the generated assembly. dir is a scratch directory.
func (ats *asmTests) compileToAsm(t *testing.T, dir string) string {
	// create test directory
	testDir := filepath.Join(dir, fmt.Sprintf("%s_%s", ats.arch, ats.os))
	err := os.Mkdir(testDir, 0700)
	if err != nil {
		t.Fatalf("could not create directory: %v", err)
	}

	// Create source.
	src := filepath.Join(testDir, "test.go")
	err = ioutil.WriteFile(src, ats.generateCode(), 0600)
	if err != nil {
		t.Fatalf("error writing code: %v", err)
	}

	// First, install any dependencies we need. This builds the required export data
	// for any packages that are imported.
	for _, i := range ats.imports {
		out := filepath.Join(testDir, i+".a")

		if s := ats.runGo(t, "build", "-o", out, "-gcflags=-dolinkobj=false", i); s != "" {
			t.Fatalf("Stdout = %s\nWant empty", s)
		}
	}

	// Now, compile the individual file for which we want to see the generated assembly.
	asm := ats.runGo(t, "tool", "compile", "-I", testDir, "-S", "-o", filepath.Join(testDir, "out.o"), src)
	return asm
}
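
// The two runGo calls above are roughly equivalent to running the following
// commands with GOARCH and GOOS overridden to the test's target (paths shown
// are placeholders):
//
//	go build -o <testDir>/<pkg>.a -gcflags=-dolinkobj=false <pkg>
//	go tool compile -I <testDir> -S -o <testDir>/out.o <testDir>/test.go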

// runGo runs the go command with the given args and returns its stdout.
// The command is run with GOARCH and GOOS set to ats.arch and ats.os, respectively.
func (ats *asmTests) runGo(t *testing.T, args ...string) string {
	var stdout, stderr bytes.Buffer
	cmd := exec.Command(testenv.GoToolPath(t), args...)
	cmd.Env = append(os.Environ(), "GOARCH="+ats.arch, "GOOS="+ats.os)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		t.Fatalf("error running cmd: %v\nstdout:\n%sstderr:\n%s\n", err, stdout.String(), stderr.String())
	}

	if s := stderr.String(); s != "" {
		t.Fatalf("Stderr = %s\nWant empty", s)
	}

	return stdout.String()
}

var allAsmTests = []*asmTests{
	{
		arch:    "amd64",
		os:      "linux",
		imports: []string{"encoding/binary", "math", "math/bits", "unsafe", "runtime"},
		tests:   linuxAMD64Tests,
	},
	{
		arch:    "386",
		os:      "linux",
		imports: []string{"encoding/binary"},
		tests:   linux386Tests,
	},
	{
		arch:    "s390x",
		os:      "linux",
		imports: []string{"encoding/binary", "math", "math/bits"},
		tests:   linuxS390XTests,
	},
	{
		arch:    "arm",
		os:      "linux",
		imports: []string{"math/bits", "runtime"},
		tests:   linuxARMTests,
	},
	{
		arch:    "arm64",
		os:      "linux",
		imports: []string{"encoding/binary", "math", "math/bits"},
		tests:   linuxARM64Tests,
	},
	{
		arch:    "mips",
		os:      "linux",
		imports: []string{"math/bits"},
		tests:   linuxMIPSTests,
	},
	{
		arch:  "mips64",
		os:    "linux",
		tests: linuxMIPS64Tests,
	},
	{
		arch:    "ppc64le",
		os:      "linux",
		imports: []string{"encoding/binary", "math", "math/bits"},
		tests:   linuxPPC64LETests,
	},
	{
		arch:  "amd64",
		os:    "plan9",
		tests: plan9AMD64Tests,
	},
}

var linuxAMD64Tests = []*asmTest{
	// multiplication by powers of two
	{
		fn: `
		func $(n int) int {
			return n * 64
		}
		`,
		pos: []string{"\tSHLQ\t\\$6,"},
		neg: []string{"IMULQ"},
	},
	{
		fn: `
		func $(n int) int {
			return -128*n
		}
		`,
		pos: []string{"SHLQ"},
		neg: []string{"IMULQ"},
	},
	{
		fn: `
		func $(x int) int {
			return x * 96
		}
		`,
		pos: []string{"\tSHLQ\t\\$5,", "\tLEAQ\t\\(.*\\)\\(.*\\*2\\),"},
	},
	// Load-combining tests.
	{
		fn: `
		func f2(b []byte) uint64 {
			return binary.LittleEndian.Uint64(b)
		}
		`,
		pos: []string{"\tMOVQ\t\\(.*\\),"},
	},
	{
		fn: `
		func f3(b []byte, i int) uint64 {
			return binary.LittleEndian.Uint64(b[i:])
		}
		`,
		pos: []string{"\tMOVQ\t\\(.*\\)\\(.*\\*1\\),"},
	},
	{
		fn: `
		func f4(b []byte) uint32 {
			return binary.LittleEndian.Uint32(b)
		}
		`,
		pos: []string{"\tMOVL\t\\(.*\\),"},
	},
	{
		fn: `
		func f5(b []byte, i int) uint32 {
			return binary.LittleEndian.Uint32(b[i:])
		}
		`,
		pos: []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
	},
	{
		fn: `
		func f6(b []byte) uint64 {
			return binary.BigEndian.Uint64(b)
		}
		`,
		pos: []string{"\tBSWAPQ\t"},
	},
	{
		fn: `
		func f7(b []byte, i int) uint64 {
			return binary.BigEndian.Uint64(b[i:])
		}
		`,
		pos: []string{"\tBSWAPQ\t"},
	},
	{
		fn: `
		func f8(b []byte, v uint64) {
			binary.BigEndian.PutUint64(b, v)
		}
		`,
		pos: []string{"\tBSWAPQ\t"},
	},
	{
		fn: `
		func f9(b []byte, i int, v uint64) {
			binary.BigEndian.PutUint64(b[i:], v)
		}
		`,
		pos: []string{"\tBSWAPQ\t"},
	},
	{
		fn: `
		func f10(b []byte) uint32 {
			return binary.BigEndian.Uint32(b)
		}
		`,
		pos: []string{"\tBSWAPL\t"},
	},
	{
		fn: `
		func f11(b []byte, i int) uint32 {
			return binary.BigEndian.Uint32(b[i:])
		}
		`,
		pos: []string{"\tBSWAPL\t"},
	},
	{
		fn: `
		func f12(b []byte, v uint32) {
			binary.BigEndian.PutUint32(b, v)
		}
		`,
		pos: []string{"\tBSWAPL\t"},
	},
	{
		fn: `
		func f13(b []byte, i int, v uint32) {
			binary.BigEndian.PutUint32(b[i:], v)
		}
		`,
		pos: []string{"\tBSWAPL\t"},
	},
	{
		fn: `
		func f14(b []byte) uint16 {
			return binary.BigEndian.Uint16(b)
		}
		`,
		pos: []string{"\tROLW\t\\$8,"},
	},
	{
		fn: `
		func f15(b []byte, i int) uint16 {
			return binary.BigEndian.Uint16(b[i:])
		}
		`,
		pos: []string{"\tROLW\t\\$8,"},
	},
	{
		fn: `
		func f16(b []byte, v uint16) {
			binary.BigEndian.PutUint16(b, v)
		}
		`,
		pos: []string{"\tROLW\t\\$8,"},
	},
	{
		fn: `
		func f17(b []byte, i int, v uint16) {
			binary.BigEndian.PutUint16(b[i:], v)
		}
		`,
		pos: []string{"\tROLW\t\\$8,"},
	},
	// Structure zeroing. See issue #18370.
	{
		fn: `
		type T1 struct {
			a, b, c int
		}
		func $(t *T1) {
			*t = T1{}
		}
		`,
		pos: []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)"},
	},
	// SSA-able composite literal initialization. Issue 18872.
	{
		fn: `
		type T18872 struct {
			a, b, c, d int
		}

		func f18872(p *T18872) {
			*p = T18872{1, 2, 3, 4}
		}
		`,
		pos: []string{"\tMOVQ\t[$]1", "\tMOVQ\t[$]2", "\tMOVQ\t[$]3", "\tMOVQ\t[$]4"},
	},
	// Also test struct containing pointers (this was special because of write barriers).
	{
		fn: `
		type T2 struct {
			a, b, c *int
		}
		func f19(t *T2) {
			*t = T2{}
		}
		`,
		pos: []string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)", "\tCALL\truntime\\.gcWriteBarrier\\(SB\\)"},
	},
	// Rotate tests
	{
		fn: `
		func f20(x uint64) uint64 {
			return x<<7 | x>>57
		}
		`,
		pos: []string{"\tROLQ\t[$]7,"},
	},
	{
		fn: `
		func f21(x uint64) uint64 {
			return x<<7 + x>>57
		}
		`,
		pos: []string{"\tROLQ\t[$]7,"},
	},
	{
		fn: `
		func f22(x uint64) uint64 {
			return x<<7 ^ x>>57
		}
		`,
		pos: []string{"\tROLQ\t[$]7,"},
	},
	{
		fn: `
		func f23(x uint32) uint32 {
			return x<<7 + x>>25
		}
		`,
		pos: []string{"\tROLL\t[$]7,"},
	},
	{
		fn: `
		func f24(x uint32) uint32 {
			return x<<7 | x>>25
		}
		`,
		pos: []string{"\tROLL\t[$]7,"},
	},
	{
		fn: `
		func f25(x uint32) uint32 {
			return x<<7 ^ x>>25
		}
		`,
		pos: []string{"\tROLL\t[$]7,"},
	},
	{
		fn: `
		func f26(x uint16) uint16 {
			return x<<7 + x>>9
		}
		`,
		pos: []string{"\tROLW\t[$]7,"},
	},
	{
		fn: `
		func f27(x uint16) uint16 {
			return x<<7 | x>>9
		}
		`,
		pos: []string{"\tROLW\t[$]7,"},
	},
	{
		fn: `
		func f28(x uint16) uint16 {
			return x<<7 ^ x>>9
		}
		`,
		pos: []string{"\tROLW\t[$]7,"},
	},
	{
		fn: `
		func f29(x uint8) uint8 {
			return x<<7 + x>>1
		}
		`,
		pos: []string{"\tROLB\t[$]7,"},
	},
	{
		fn: `
		func f30(x uint8) uint8 {
			return x<<7 | x>>1
		}
		`,
		pos: []string{"\tROLB\t[$]7,"},
	},
	{
		fn: `
		func f31(x uint8) uint8 {
			return x<<7 ^ x>>1
		}
		`,
		pos: []string{"\tROLB\t[$]7,"},
	},
	// Rotate after inlining (see issue 18254).
	{
		fn: `
		func f32(x uint32) uint32 {
			return g(x, 7)
		}
		func g(x uint32, k uint) uint32 {
			return x<<k | x>>(32-k)
		}
		`,
		pos: []string{"\tROLL\t[$]7,"},
	},
	{
		fn: `
		func f33(m map[int]int) int {
			return m[5]
		}
		`,
		pos: []string{"\tMOVQ\t[$]5,"},
	},
	// Direct use of constants in fast map access calls. Issue 19015.
	{
		fn: `
		func f34(m map[int]int) bool {
			_, ok := m[5]
			return ok
		}
		`,
		pos: []string{"\tMOVQ\t[$]5,"},
	},
	{
		fn: `
		func f35(m map[string]int) int {
			return m["abc"]
		}
		`,
		pos: []string{"\"abc\""},
	},
	{
		fn: `
		func f36(m map[string]int) bool {
			_, ok := m["abc"]
			return ok
		}
		`,
		pos: []string{"\"abc\""},
	},
	// Bit test ops on amd64, issue 18943.
	{
		fn: `
		func f37(a, b uint64) int {
			if a&(1<<(b&63)) != 0 {
				return 1
			}
			return -1
		}
		`,
		pos: []string{"\tBTQ\t"},
	},
	{
		fn: `
		func f38(a, b uint64) bool {
			return a&(1<<(b&63)) != 0
		}
		`,
		pos: []string{"\tBTQ\t"},
	},
	{
		fn: `
		func f39(a uint64) int {
			if a&(1<<60) != 0 {
				return 1
			}
			return -1
		}
		`,
		pos: []string{"\tBTQ\t\\$60"},
	},
	{
		fn: `
		func f40(a uint64) bool {
			return a&(1<<60) != 0
		}
		`,
		pos: []string{"\tBTQ\t\\$60"},
	},
	// Intrinsic tests for math/bits
	{
		fn: `
		func f41(a uint64) int {
			return bits.TrailingZeros64(a)
		}
		`,
		pos: []string{"\tBSFQ\t", "\tMOVL\t\\$64,", "\tCMOVQEQ\t"},
	},
	{
		fn: `
		func f42(a uint32) int {
			return bits.TrailingZeros32(a)
		}
		`,
		pos: []string{"\tBSFQ\t", "\tORQ\t[^$]", "\tMOVQ\t\\$4294967296,"},
	},
	{
		fn: `
		func f43(a uint16) int {
			return bits.TrailingZeros16(a)
		}
		`,
		pos: []string{"\tBSFQ\t", "\tORQ\t\\$65536,"},
	},
	{
		fn: `
		func f44(a uint8) int {
			return bits.TrailingZeros8(a)
		}
		`,
		pos: []string{"\tBSFQ\t", "\tORQ\t\\$256,"},
	},
	{
		fn: `
		func f45(a uint64) uint64 {
			return bits.ReverseBytes64(a)
		}
		`,
		pos: []string{"\tBSWAPQ\t"},
	},
	{
		fn: `
		func f46(a uint32) uint32 {
			return bits.ReverseBytes32(a)
		}
		`,
		pos: []string{"\tBSWAPL\t"},
	},
	{
		fn: `
		func f47(a uint16) uint16 {
			return bits.ReverseBytes16(a)
		}
		`,
		pos: []string{"\tROLW\t\\$8,"},
	},
	{
		fn: `
		func f48(a uint64) int {
			return bits.Len64(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	{
		fn: `
		func f49(a uint32) int {
			return bits.Len32(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	{
		fn: `
		func f50(a uint16) int {
			return bits.Len16(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	/* see ssa.go
	{
		fn:`
		func f51(a uint8) int {
			return bits.Len8(a)
		}
		`,
		pos:[]string{"\tBSRQ\t"},
	},
	*/
	{
		fn: `
		func f52(a uint) int {
			return bits.Len(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	{
		fn: `
		func f53(a uint64) int {
			return bits.LeadingZeros64(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	{
		fn: `
		func f54(a uint32) int {
			return bits.LeadingZeros32(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	{
		fn: `
		func f55(a uint16) int {
			return bits.LeadingZeros16(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	/* see ssa.go
	{
		fn:`
		func f56(a uint8) int {
			return bits.LeadingZeros8(a)
		}
		`,
		pos:[]string{"\tBSRQ\t"},
	},
	*/
	{
		fn: `
		func f57(a uint) int {
			return bits.LeadingZeros(a)
		}
		`,
		pos: []string{"\tBSRQ\t"},
	},
	{
		fn: `
		func pop1(x uint64) int {
			return bits.OnesCount64(x)
		}`,
		pos: []string{"\tPOPCNTQ\t", "support_popcnt"},
	},
	{
		fn: `
		func pop2(x uint32) int {
			return bits.OnesCount32(x)
		}`,
		pos: []string{"\tPOPCNTL\t", "support_popcnt"},
	},
	{
		fn: `
		func pop3(x uint16) int {
			return bits.OnesCount16(x)
		}`,
		pos: []string{"\tPOPCNTL\t", "support_popcnt"},
	},
	{
		fn: `
		func pop4(x uint) int {
			return bits.OnesCount(x)
		}`,
		pos: []string{"\tPOPCNTQ\t", "support_popcnt"},
	},
|
cmd/compile: combine x*n + y*n into (x+y)*n
There are a few cases where this can be useful. Apart from the obvious
(and silly)
100*n + 200*n
where we generate one IMUL instead of two, consider:
15*n + 31*n
Currently, the compiler strength-reduces both imuls, generating:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 MOVQ AX, CX
0x0008 00008 SHLQ $4, AX
0x000c 00012 SUBQ CX, AX
0x000f 00015 MOVQ CX, DX
0x0012 00018 SHLQ $5, CX
0x0016 00022 SUBQ DX, CX
0x0019 00025 ADDQ CX, AX
0x001c 00028 MOVQ AX, "".~r1+16(SP)
0x0021 00033 RET
But combining the imuls is both faster and shorter:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $46, AX
0x0009 00009 MOVQ AX, "".~r1+16(SP)
0x000e 00014 RET
even without strength-reduction.
Moreover, consider:
5*n + 7*(n+1) + 11*(n+2)
We already have a rule that rewrites 7(n+1) into 7n+7, so the
generated code (without imuls merging) looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 LEAQ (AX)(AX*4), CX
0x0009 00009 MOVQ AX, DX
0x000c 00012 NEGQ AX
0x000f 00015 LEAQ (AX)(DX*8), AX
0x0013 00019 ADDQ CX, AX
0x0016 00022 LEAQ (DX)(CX*2), CX
0x001a 00026 LEAQ 29(AX)(CX*1), AX
0x001f 00031 MOVQ AX, "".~r1+16(SP)
But with imuls merging, the 5n, 7n and 11n factors get merged, and the
generated code looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $23, AX
0x0009 00009 ADDQ $29, AX
0x000d 00013 MOVQ AX, "".~r1+16(SP)
0x0012 00018 RET
Which is both faster and shorter; that's also the exact same code that
clang and the intel c compiler generate for the above expression.
Change-Id: Ib4d5503f05d2f2efe31a1be14e2fe6cac33730a9
Reviewed-on: https://go-review.googlesource.com/55143
Reviewed-by: Keith Randall <khr@golang.org>
2017-08-14 11:44:09 +02:00
|
|
|
// multiplication merging tests
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
cmd/compile: combine x*n + y*n into (x+y)*n
There are a few cases where this can be useful. Apart from the obvious
(and silly)
100*n + 200*n
where we generate one IMUL instead of two, consider:
15*n + 31*n
Currently, the compiler strength-reduces both imuls, generating:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 MOVQ AX, CX
0x0008 00008 SHLQ $4, AX
0x000c 00012 SUBQ CX, AX
0x000f 00015 MOVQ CX, DX
0x0012 00018 SHLQ $5, CX
0x0016 00022 SUBQ DX, CX
0x0019 00025 ADDQ CX, AX
0x001c 00028 MOVQ AX, "".~r1+16(SP)
0x0021 00033 RET
But combining the imuls is both faster and shorter:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $46, AX
0x0009 00009 MOVQ AX, "".~r1+16(SP)
0x000e 00014 RET
even without strength-reduction.
Moreover, consider:
5*n + 7*(n+1) + 11*(n+2)
We already have a rule that rewrites 7(n+1) into 7n+7, so the
generated code (without imuls merging) looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 LEAQ (AX)(AX*4), CX
0x0009 00009 MOVQ AX, DX
0x000c 00012 NEGQ AX
0x000f 00015 LEAQ (AX)(DX*8), AX
0x0013 00019 ADDQ CX, AX
0x0016 00022 LEAQ (DX)(CX*2), CX
0x001a 00026 LEAQ 29(AX)(CX*1), AX
0x001f 00031 MOVQ AX, "".~r1+16(SP)
But with imuls merging, the 5n, 7n and 11n factors get merged, and the
generated code looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $23, AX
0x0009 00009 ADDQ $29, AX
0x000d 00013 MOVQ AX, "".~r1+16(SP)
0x0012 00018 RET
Which is both faster and shorter; that's also the exact same code that
clang and the intel c compiler generate for the above expression.
Change-Id: Ib4d5503f05d2f2efe31a1be14e2fe6cac33730a9
Reviewed-on: https://go-review.googlesource.com/55143
Reviewed-by: Keith Randall <khr@golang.org>
2017-08-14 11:44:09 +02:00
|
|
|
func mul1(n int) int {
|
|
|
|
|
return 15*n + 31*n
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tIMULQ\t[$]46"}, // 46*n
|
2017-08-14 11:44:09 +02:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-14 11:44:09 +02:00
|
|
|
func mul2(n int) int {
|
|
|
|
|
return 5*n + 7*(n+1) + 11*(n+2)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tIMULQ\t[$]23", "\tADDQ\t[$]29"}, // 23*n + 29
|
2017-08-14 11:44:09 +02:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-14 11:44:09 +02:00
|
|
|
func mul3(a, n int) int {
|
|
|
|
|
return a*n + 19*n
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tADDQ\t[$]19", "\tIMULQ"}, // (a+19)*n
|
2017-08-14 11:44:09 +02:00
|
|
|
},
|
2017-08-17 15:06:42 +08:00
|
|
|
{
|
2017-09-03 08:52:34 -07:00
|
|
|
fn: `
|
2017-08-17 15:06:42 +08:00
|
|
|
func mul4(n int) int {
|
|
|
|
|
return 23*n - 9*n
|
|
|
|
|
}`,
|
2017-09-03 08:52:34 -07:00
|
|
|
pos: []string{"\tIMULQ\t[$]14"}, // 14*n
|
2017-08-17 15:06:42 +08:00
|
|
|
},
|
|
|
|
|
{
|
2017-09-03 08:52:34 -07:00
|
|
|
fn: `
|
2017-08-17 15:06:42 +08:00
|
|
|
func mul5(a, n int) int {
|
|
|
|
|
return a*n - 19*n
|
|
|
|
|
}`,
|
2017-09-03 08:52:34 -07:00
|
|
|
pos: []string{"\tADDQ\t[$]-19", "\tIMULQ"}, // (a-19)*n
|
2017-08-17 15:06:42 +08:00
|
|
|
},
|
2017-08-14 11:44:09 +02:00
|
|
|
|
2017-03-18 11:16:30 -07:00
|
|
|
// see issue 19595.
|
|
|
|
|
// We want to merge load+op in f58, but not in f59.
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-18 11:16:30 -07:00
|
|
|
func f58(p, q *int) {
|
|
|
|
|
x := *p
|
|
|
|
|
*q += x
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tADDQ\t\\("},
|
2017-03-18 11:16:30 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-18 11:16:30 -07:00
|
|
|
func f59(p, q *int) {
|
|
|
|
|
x := *p
|
|
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
|
*q += x
|
|
|
|
|
}
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tADDQ\t[A-Z]"},
|
2017-03-18 11:16:30 -07:00
|
|
|
},
|
2017-04-03 10:17:48 -07:00
|
|
|
// Floating-point strength reduction
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-03 10:17:48 -07:00
|
|
|
func f60(f float64) float64 {
|
|
|
|
|
return f * 2.0
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tADDSD\t"},
|
2017-04-03 10:17:48 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-03 10:17:48 -07:00
|
|
|
func f62(f float64) float64 {
|
|
|
|
|
return f / 16.0
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMULSD\t"},
|
2017-04-03 10:17:48 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-03 10:17:48 -07:00
|
|
|
func f63(f float64) float64 {
|
|
|
|
|
return f / 0.125
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMULSD\t"},
|
2017-04-03 10:17:48 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-03 10:17:48 -07:00
|
|
|
func f64(f float64) float64 {
|
|
|
|
|
return f / 0.5
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tADDSD\t"},
|
2017-04-03 10:17:48 -07:00
|
|
|
},
|
2017-03-28 15:30:31 -05:00
|
|
|
// Check that compare to constant string uses 2/4/8 byte compares
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-28 15:30:31 -05:00
|
|
|
func f65(a string) bool {
|
|
|
|
|
return a == "xx"
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tCMPW\t[A-Z]"},
|
2017-03-28 15:30:31 -05:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-28 15:30:31 -05:00
|
|
|
func f66(a string) bool {
|
|
|
|
|
return a == "xxxx"
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tCMPL\t[A-Z]"},
|
2017-03-28 15:30:31 -05:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-28 15:30:31 -05:00
|
|
|
func f67(a string) bool {
|
|
|
|
|
return a == "xxxxxxxx"
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tCMPQ\t[A-Z]"},
|
2017-03-28 15:30:31 -05:00
|
|
|
},
|
2017-03-29 10:04:17 -07:00
|
|
|
// Non-constant rotate
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot64l(x uint64, y int) uint64 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 63)
|
|
|
|
|
return x << z | x >> (64-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tROLQ\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot64r(x uint64, y int) uint64 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 63)
|
|
|
|
|
return x >> z | x << (64-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tRORQ\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot32l(x uint32, y int) uint32 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 31)
|
|
|
|
|
return x << z | x >> (32-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tROLL\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot32r(x uint32, y int) uint32 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 31)
|
|
|
|
|
return x >> z | x << (32-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tRORL\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot16l(x uint16, y int) uint16 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 15)
|
|
|
|
|
return x << z | x >> (16-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tROLW\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot16r(x uint16, y int) uint16 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 15)
|
|
|
|
|
return x >> z | x << (16-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tRORW\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot8l(x uint8, y int) uint8 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 7)
|
|
|
|
|
return x << z | x >> (8-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tROLB\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `func rot8r(x uint8, y int) uint8 {
|
2017-03-29 10:04:17 -07:00
|
|
|
z := uint(y & 7)
|
|
|
|
|
return x >> z | x << (8-z)
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tRORB\t"},
|
2017-03-29 10:04:17 -07:00
|
|
|
},
|
2017-04-14 13:53:40 -05:00
|
|
|
// Check that array compare uses 2/4/8 byte compares
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-14 13:53:40 -05:00
|
|
|
func f68(a,b [2]byte) bool {
|
|
|
|
|
return a == b
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tCMPW\t[A-Z]"},
|
2017-04-14 13:53:40 -05:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-14 13:53:40 -05:00
|
|
|
func f69(a,b [3]uint16) bool {
|
|
|
|
|
return a == b
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tCMPL\t[A-Z]"},
|
2017-04-14 13:53:40 -05:00
|
|
|
},
|
2018-02-06 09:44:34 -08:00
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(a,b [3]int16) bool {
|
|
|
|
|
return a == b
|
|
|
|
|
}`,
|
|
|
|
|
pos: []string{"\tCMPL\t[A-Z]"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(a,b [12]int8) bool {
|
|
|
|
|
return a == b
|
|
|
|
|
}`,
|
|
|
|
|
pos: []string{"\tCMPQ\t[A-Z]", "\tCMPL\t[A-Z]"},
|
|
|
|
|
},
|
2017-04-14 13:53:40 -05:00
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-14 13:53:40 -05:00
|
|
|
func f70(a,b [15]byte) bool {
|
|
|
|
|
return a == b
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tCMPQ\t[A-Z]"},
|
2017-04-14 13:53:40 -05:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-04-14 13:53:40 -05:00
|
|
|
func f71(a,b unsafe.Pointer) bool { // This was a TODO in mapaccess1_faststr
|
|
|
|
|
return *((*[4]byte)(a)) != *((*[4]byte)(b))
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tCMPL\t[A-Z]"},
|
2017-04-14 13:53:40 -05:00
|
|
|
},
|
2017-07-18 08:35:00 -04:00
|
|
|
{
|
|
|
|
|
// make sure assembly output has matching offset and base register.
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-07-18 08:35:00 -04:00
|
|
|
func f72(a, b int) int {
|
2017-10-24 21:57:51 -07:00
|
|
|
runtime.GC() // use some frame
|
2017-07-18 08:35:00 -04:00
|
|
|
return b
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-10-24 21:57:51 -07:00
|
|
|
pos: []string{"b\\+24\\(SP\\)"},
|
2017-08-18 14:03:33 -05:00
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
// check load combining
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-18 14:03:33 -05:00
|
|
|
func f73(a, b byte) (byte,byte) {
|
|
|
|
|
return f73(f73(a,b))
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVW\t"},
|
2017-08-18 14:03:33 -05:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-18 14:03:33 -05:00
|
|
|
func f74(a, b uint16) (uint16,uint16) {
|
|
|
|
|
return f74(f74(a,b))
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVL\t"},
|
2017-08-18 14:03:33 -05:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-18 14:03:33 -05:00
|
|
|
func f75(a, b uint32) (uint32,uint32) {
|
|
|
|
|
return f75(f75(a,b))
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVQ\t"},
|
2017-08-18 14:03:33 -05:00
|
|
|
},
|
|
|
|
|
// Make sure we don't put pointers in SSE registers across safe points.
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-18 14:03:33 -05:00
|
|
|
func $(p, q *[2]*int) {
|
|
|
|
|
a, b := p[0], p[1]
|
|
|
|
|
runtime.GC()
|
|
|
|
|
q[0], q[1] = a, b
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
neg: []string{"MOVUPS"},
|
2017-07-18 08:35:00 -04:00
|
|
|
},
|
2017-03-29 14:01:41 -04:00
|
|
|
{
|
|
|
|
|
// check that stack store is optimized away
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-29 14:01:41 -04:00
|
|
|
func $() int {
|
|
|
|
|
var x int
|
|
|
|
|
return *(&x)
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"TEXT\t.*, [$]0-8"},
|
2017-03-29 14:01:41 -04:00
|
|
|
},
|
cmd/compile,math: improve code generation for math.Abs
Implement int reg <-> fp reg moves on amd64.
If we see a load to int reg followed by an int->fp move, then we can just
load to the fp reg instead. Same for stores.
math.Abs is now:
MOVQ "".x+8(SP), AX
SHLQ $1, AX
SHRQ $1, AX
MOVQ AX, "".~r1+16(SP)
math.Copysign is now:
MOVQ "".x+8(SP), AX
SHLQ $1, AX
SHRQ $1, AX
MOVQ "".y+16(SP), CX
SHRQ $63, CX
SHLQ $63, CX
ORQ CX, AX
MOVQ AX, "".~r2+24(SP)
math.Float64bits is now:
MOVSD "".x+8(SP), X0
MOVSD X0, "".~r1+16(SP)
(it would be nicer to use a non-SSE reg for this, nothing is perfect)
And due to the fix for #21440, the inlined versions of these improve as well.
name old time/op new time/op delta
Abs 1.38ns ± 5% 0.89ns ±10% -35.54% (p=0.000 n=10+10)
Copysign 1.56ns ± 7% 1.35ns ± 6% -13.77% (p=0.000 n=9+10)
Fixes #13095
Change-Id: Ibd7f2792412a6668608780b0688a77062e1f1499
Reviewed-on: https://go-review.googlesource.com/58732
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2017-08-24 13:19:40 -07:00
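As a hedged illustration (the helper below is ours, not part of the test
table), the SHLQ $1 / SHRQ $1 pair in the math.Abs output simply clears the
IEEE-754 sign bit once the float's bits sit in an integer register:

package main

import (
	"fmt"
	"math"
)

// absViaBits mirrors what the generated code does: move the float bits into
// an integer, shift left then right by one to drop the sign bit, move back.
func absViaBits(x float64) float64 {
	b := math.Float64bits(x)
	b = (b << 1) >> 1 // clear the top (sign) bit
	return math.Float64frombits(b)
}

func main() {
	fmt.Println(absViaBits(-1.5)) // 1.5
}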
|
|
|
// math.Abs using integer registers
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-24 13:19:40 -07:00
|
|
|
func $(x float64) float64 {
|
|
|
|
|
return math.Abs(x)
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,"},
|
2017-08-24 13:19:40 -07:00
|
|
|
},
|
|
|
|
|
// math.Copysign using integer registers
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-24 13:19:40 -07:00
|
|
|
func $(x, y float64) float64 {
|
|
|
|
|
return math.Copysign(x, y)
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,", "\tSHRQ\t[$]63,", "\tSHLQ\t[$]63,", "\tORQ\t"},
|
2017-08-24 13:19:40 -07:00
|
|
|
},
|
|
|
|
|
// int <-> fp moves
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-24 13:19:40 -07:00
|
|
|
func $(x float64) uint64 {
|
|
|
|
|
return math.Float64bits(x+1) + 1
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVQ\tX.*, [^X].*"},
|
2017-08-24 13:19:40 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-24 13:19:40 -07:00
|
|
|
func $(x float32) uint32 {
|
|
|
|
|
return math.Float32bits(x+1) + 1
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVL\tX.*, [^X].*"},
|
2017-08-24 13:19:40 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-24 13:19:40 -07:00
|
|
|
func $(x uint64) float64 {
|
|
|
|
|
return math.Float64frombits(x+1) + 1
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVQ\t[^X].*, X.*"},
|
2017-08-24 13:19:40 -07:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-24 13:19:40 -07:00
|
|
|
func $(x uint32) float32 {
|
|
|
|
|
return math.Float32frombits(x+1) + 1
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVL\t[^X].*, X.*"},
|
2017-08-24 13:19:40 -07:00
|
|
|
},
|
2017-10-03 14:12:00 -05:00
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(x uint32) bool {
|
|
|
|
|
return x > 4
|
|
|
|
|
}
|
|
|
|
|
`,
|
2018-02-18 20:02:17 +01:00
|
|
|
pos: []string{"\tSETHI\t.*\\(SP\\)"},
|
2017-10-03 14:12:00 -05:00
|
|
|
},
|
cmd/compile: optimize signed non-negative div/mod by a power of 2
This CL optimizes the assembly generated when len() or cap() is divided
by a power-of-2 constant:
func lenDiv(s []int) int {
return len(s) / 16
}
amd64 assembly before the CL:
MOVQ "".s+16(SP), AX
MOVQ AX, CX
SARQ $63, AX
SHRQ $60, AX
ADDQ CX, AX
SARQ $4, AX
MOVQ AX, "".~r1+32(SP)
RET
amd64 assembly after the CL:
MOVQ "".s+16(SP), AX
SHRQ $4, AX
MOVQ AX, "".~r1+32(SP)
RET
The CL relies on the fact that the results of len() and cap()
cannot be negative.
Trigger stats for the added SSA rules on linux/amd64 when running
make.bash:
46 Div64
12 Mod64
The added SSA rules may trigger on more cases in the future,
once SSA values are populated with information about their
lower bounds.
For instance:
func f(i int16) int16 {
if i < 3 {
return -1
}
// Lower bound of i is 3 here -> i is non-negative,
// so unsigned arithmetic may be used here.
return i % 16
}
Change-Id: I8bc6be5a03e71157ced533c01416451ff6f1a7f0
Reviewed-on: https://go-review.googlesource.com/65530
Reviewed-by: Keith Randall <khr@golang.org>
2017-09-23 00:34:37 +03:00
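A small runnable sketch of the property the rules rely on (lenDiv is taken
from the message above; the main wrapper is illustrative): len results are
never negative, so the signed division can be compiled as a plain right shift:

package main

import "fmt"

// lenDiv is the example from the commit message: len(s) is always >= 0,
// so len(s) / 16 can become len(s) >> 4 with no sign correction.
func lenDiv(s []int) int {
	return len(s) / 16
}

func main() {
	fmt.Println(lenDiv(make([]int, 40))) // prints 2
}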
|
|
|
// Check that len() and cap() div by a constant power of two
|
|
|
|
|
// are compiled into SHRQ.
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(a []int) int {
|
|
|
|
|
return len(a) / 1024
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"\tSHRQ\t\\$10,"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(s string) int {
|
|
|
|
|
return len(s) / (4097 >> 1)
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"\tSHRQ\t\\$11,"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(a []int) int {
|
|
|
|
|
return cap(a) / ((1 << 11) + 2048)
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"\tSHRQ\t\\$12,"},
|
|
|
|
|
},
|
|
|
|
|
// Check that len() and cap() mod by a constant power of two
|
|
|
|
|
// are compiled into ANDQ.
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(a []int) int {
|
|
|
|
|
return len(a) % 1024
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"\tANDQ\t\\$1023,"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(s string) int {
|
|
|
|
|
return len(s) % (4097 >> 1)
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"\tANDQ\t\\$2047,"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(a []int) int {
|
|
|
|
|
return cap(a) % ((1 << 11) + 2048)
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"\tANDQ\t\\$4095,"},
|
|
|
|
|
},
|
2017-08-09 14:00:38 -05:00
|
|
|
{
|
|
|
|
|
// Test that small memmove was replaced with direct movs
|
|
|
|
|
fn: `
|
|
|
|
|
func $() {
|
|
|
|
|
x := [...]byte{1, 2, 3, 4, 5, 6, 7}
|
|
|
|
|
copy(x[1:], x[:])
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
neg: []string{"memmove"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
// Same as above but with different size
|
|
|
|
|
fn: `
|
|
|
|
|
func $() {
|
|
|
|
|
x := [...]byte{1, 2, 3, 4}
|
|
|
|
|
copy(x[1:], x[:])
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
neg: []string{"memmove"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
// Same as above but with different size
|
|
|
|
|
fn: `
|
|
|
|
|
func $() {
|
|
|
|
|
x := [...]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
|
|
|
|
|
copy(x[1:], x[:])
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
neg: []string{"memmove"},
|
|
|
|
|
},
|
2017-11-13 19:03:31 -08:00
|
|
|
// Nil checks before calling interface methods
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
type I interface {
|
|
|
|
|
foo000()
|
|
|
|
|
foo001()
|
|
|
|
|
foo002()
|
|
|
|
|
foo003()
|
|
|
|
|
foo004()
|
|
|
|
|
foo005()
|
|
|
|
|
foo006()
|
|
|
|
|
foo007()
|
|
|
|
|
foo008()
|
|
|
|
|
foo009()
|
|
|
|
|
foo010()
|
|
|
|
|
foo011()
|
|
|
|
|
foo012()
|
|
|
|
|
foo013()
|
|
|
|
|
foo014()
|
|
|
|
|
foo015()
|
|
|
|
|
foo016()
|
|
|
|
|
foo017()
|
|
|
|
|
foo018()
|
|
|
|
|
foo019()
|
|
|
|
|
foo020()
|
|
|
|
|
foo021()
|
|
|
|
|
foo022()
|
|
|
|
|
foo023()
|
|
|
|
|
foo024()
|
|
|
|
|
foo025()
|
|
|
|
|
foo026()
|
|
|
|
|
foo027()
|
|
|
|
|
foo028()
|
|
|
|
|
foo029()
|
|
|
|
|
foo030()
|
|
|
|
|
foo031()
|
|
|
|
|
foo032()
|
|
|
|
|
foo033()
|
|
|
|
|
foo034()
|
|
|
|
|
foo035()
|
|
|
|
|
foo036()
|
|
|
|
|
foo037()
|
|
|
|
|
foo038()
|
|
|
|
|
foo039()
|
|
|
|
|
foo040()
|
|
|
|
|
foo041()
|
|
|
|
|
foo042()
|
|
|
|
|
foo043()
|
|
|
|
|
foo044()
|
|
|
|
|
foo045()
|
|
|
|
|
foo046()
|
|
|
|
|
foo047()
|
|
|
|
|
foo048()
|
|
|
|
|
foo049()
|
|
|
|
|
foo050()
|
|
|
|
|
foo051()
|
|
|
|
|
foo052()
|
|
|
|
|
foo053()
|
|
|
|
|
foo054()
|
|
|
|
|
foo055()
|
|
|
|
|
foo056()
|
|
|
|
|
foo057()
|
|
|
|
|
foo058()
|
|
|
|
|
foo059()
|
|
|
|
|
foo060()
|
|
|
|
|
foo061()
|
|
|
|
|
foo062()
|
|
|
|
|
foo063()
|
|
|
|
|
foo064()
|
|
|
|
|
foo065()
|
|
|
|
|
foo066()
|
|
|
|
|
foo067()
|
|
|
|
|
foo068()
|
|
|
|
|
foo069()
|
|
|
|
|
foo070()
|
|
|
|
|
foo071()
|
|
|
|
|
foo072()
|
|
|
|
|
foo073()
|
|
|
|
|
foo074()
|
|
|
|
|
foo075()
|
|
|
|
|
foo076()
|
|
|
|
|
foo077()
|
|
|
|
|
foo078()
|
|
|
|
|
foo079()
|
|
|
|
|
foo080()
|
|
|
|
|
foo081()
|
|
|
|
|
foo082()
|
|
|
|
|
foo083()
|
|
|
|
|
foo084()
|
|
|
|
|
foo085()
|
|
|
|
|
foo086()
|
|
|
|
|
foo087()
|
|
|
|
|
foo088()
|
|
|
|
|
foo089()
|
|
|
|
|
foo090()
|
|
|
|
|
foo091()
|
|
|
|
|
foo092()
|
|
|
|
|
foo093()
|
|
|
|
|
foo094()
|
|
|
|
|
foo095()
|
|
|
|
|
foo096()
|
|
|
|
|
foo097()
|
|
|
|
|
foo098()
|
|
|
|
|
foo099()
|
|
|
|
|
foo100()
|
|
|
|
|
foo101()
|
|
|
|
|
foo102()
|
|
|
|
|
foo103()
|
|
|
|
|
foo104()
|
|
|
|
|
foo105()
|
|
|
|
|
foo106()
|
|
|
|
|
foo107()
|
|
|
|
|
foo108()
|
|
|
|
|
foo109()
|
|
|
|
|
foo110()
|
|
|
|
|
foo111()
|
|
|
|
|
foo112()
|
|
|
|
|
foo113()
|
|
|
|
|
foo114()
|
|
|
|
|
foo115()
|
|
|
|
|
foo116()
|
|
|
|
|
foo117()
|
|
|
|
|
foo118()
|
|
|
|
|
foo119()
|
|
|
|
|
foo120()
|
|
|
|
|
foo121()
|
|
|
|
|
foo122()
|
|
|
|
|
foo123()
|
|
|
|
|
foo124()
|
|
|
|
|
foo125()
|
|
|
|
|
foo126()
|
|
|
|
|
foo127()
|
|
|
|
|
foo128()
|
|
|
|
|
foo129()
|
|
|
|
|
foo130()
|
|
|
|
|
foo131()
|
|
|
|
|
foo132()
|
|
|
|
|
foo133()
|
|
|
|
|
foo134()
|
|
|
|
|
foo135()
|
|
|
|
|
foo136()
|
|
|
|
|
foo137()
|
|
|
|
|
foo138()
|
|
|
|
|
foo139()
|
|
|
|
|
foo140()
|
|
|
|
|
foo141()
|
|
|
|
|
foo142()
|
|
|
|
|
foo143()
|
|
|
|
|
foo144()
|
|
|
|
|
foo145()
|
|
|
|
|
foo146()
|
|
|
|
|
foo147()
|
|
|
|
|
foo148()
|
|
|
|
|
foo149()
|
|
|
|
|
foo150()
|
|
|
|
|
foo151()
|
|
|
|
|
foo152()
|
|
|
|
|
foo153()
|
|
|
|
|
foo154()
|
|
|
|
|
foo155()
|
|
|
|
|
foo156()
|
|
|
|
|
foo157()
|
|
|
|
|
foo158()
|
|
|
|
|
foo159()
|
|
|
|
|
foo160()
|
|
|
|
|
foo161()
|
|
|
|
|
foo162()
|
|
|
|
|
foo163()
|
|
|
|
|
foo164()
|
|
|
|
|
foo165()
|
|
|
|
|
foo166()
|
|
|
|
|
foo167()
|
|
|
|
|
foo168()
|
|
|
|
|
foo169()
|
|
|
|
|
foo170()
|
|
|
|
|
foo171()
|
|
|
|
|
foo172()
|
|
|
|
|
foo173()
|
|
|
|
|
foo174()
|
|
|
|
|
foo175()
|
|
|
|
|
foo176()
|
|
|
|
|
foo177()
|
|
|
|
|
foo178()
|
|
|
|
|
foo179()
|
|
|
|
|
foo180()
|
|
|
|
|
foo181()
|
|
|
|
|
foo182()
|
|
|
|
|
foo183()
|
|
|
|
|
foo184()
|
|
|
|
|
foo185()
|
|
|
|
|
foo186()
|
|
|
|
|
foo187()
|
|
|
|
|
foo188()
|
|
|
|
|
foo189()
|
|
|
|
|
foo190()
|
|
|
|
|
foo191()
|
|
|
|
|
foo192()
|
|
|
|
|
foo193()
|
|
|
|
|
foo194()
|
|
|
|
|
foo195()
|
|
|
|
|
foo196()
|
|
|
|
|
foo197()
|
|
|
|
|
foo198()
|
|
|
|
|
foo199()
|
|
|
|
|
foo200()
|
|
|
|
|
foo201()
|
|
|
|
|
foo202()
|
|
|
|
|
foo203()
|
|
|
|
|
foo204()
|
|
|
|
|
foo205()
|
|
|
|
|
foo206()
|
|
|
|
|
foo207()
|
|
|
|
|
foo208()
|
|
|
|
|
foo209()
|
|
|
|
|
foo210()
|
|
|
|
|
foo211()
|
|
|
|
|
foo212()
|
|
|
|
|
foo213()
|
|
|
|
|
foo214()
|
|
|
|
|
foo215()
|
|
|
|
|
foo216()
|
|
|
|
|
foo217()
|
|
|
|
|
foo218()
|
|
|
|
|
foo219()
|
|
|
|
|
foo220()
|
|
|
|
|
foo221()
|
|
|
|
|
foo222()
|
|
|
|
|
foo223()
|
|
|
|
|
foo224()
|
|
|
|
|
foo225()
|
|
|
|
|
foo226()
|
|
|
|
|
foo227()
|
|
|
|
|
foo228()
|
|
|
|
|
foo229()
|
|
|
|
|
foo230()
|
|
|
|
|
foo231()
|
|
|
|
|
foo232()
|
|
|
|
|
foo233()
|
|
|
|
|
foo234()
|
|
|
|
|
foo235()
|
|
|
|
|
foo236()
|
|
|
|
|
foo237()
|
|
|
|
|
foo238()
|
|
|
|
|
foo239()
|
|
|
|
|
foo240()
|
|
|
|
|
foo241()
|
|
|
|
|
foo242()
|
|
|
|
|
foo243()
|
|
|
|
|
foo244()
|
|
|
|
|
foo245()
|
|
|
|
|
foo246()
|
|
|
|
|
foo247()
|
|
|
|
|
foo248()
|
|
|
|
|
foo249()
|
|
|
|
|
foo250()
|
|
|
|
|
foo251()
|
|
|
|
|
foo252()
|
|
|
|
|
foo253()
|
|
|
|
|
foo254()
|
|
|
|
|
foo255()
|
|
|
|
|
foo256()
|
|
|
|
|
foo257()
|
|
|
|
|
foo258()
|
|
|
|
|
foo259()
|
|
|
|
|
foo260()
|
|
|
|
|
foo261()
|
|
|
|
|
foo262()
|
|
|
|
|
foo263()
|
|
|
|
|
foo264()
|
|
|
|
|
foo265()
|
|
|
|
|
foo266()
|
|
|
|
|
foo267()
|
|
|
|
|
foo268()
|
|
|
|
|
foo269()
|
|
|
|
|
foo270()
|
|
|
|
|
foo271()
|
|
|
|
|
foo272()
|
|
|
|
|
foo273()
|
|
|
|
|
foo274()
|
|
|
|
|
foo275()
|
|
|
|
|
foo276()
|
|
|
|
|
foo277()
|
|
|
|
|
foo278()
|
|
|
|
|
foo279()
|
|
|
|
|
foo280()
|
|
|
|
|
foo281()
|
|
|
|
|
foo282()
|
|
|
|
|
foo283()
|
|
|
|
|
foo284()
|
|
|
|
|
foo285()
|
|
|
|
|
foo286()
|
|
|
|
|
foo287()
|
|
|
|
|
foo288()
|
|
|
|
|
foo289()
|
|
|
|
|
foo290()
|
|
|
|
|
foo291()
|
|
|
|
|
foo292()
|
|
|
|
|
foo293()
|
|
|
|
|
foo294()
|
|
|
|
|
foo295()
|
|
|
|
|
foo296()
|
|
|
|
|
foo297()
|
|
|
|
|
foo298()
|
|
|
|
|
foo299()
|
|
|
|
|
foo300()
|
|
|
|
|
foo301()
|
|
|
|
|
foo302()
|
|
|
|
|
foo303()
|
|
|
|
|
foo304()
|
|
|
|
|
foo305()
|
|
|
|
|
foo306()
|
|
|
|
|
foo307()
|
|
|
|
|
foo308()
|
|
|
|
|
foo309()
|
|
|
|
|
foo310()
|
|
|
|
|
foo311()
|
|
|
|
|
foo312()
|
|
|
|
|
foo313()
|
|
|
|
|
foo314()
|
|
|
|
|
foo315()
|
|
|
|
|
foo316()
|
|
|
|
|
foo317()
|
|
|
|
|
foo318()
|
|
|
|
|
foo319()
|
|
|
|
|
foo320()
|
|
|
|
|
foo321()
|
|
|
|
|
foo322()
|
|
|
|
|
foo323()
|
|
|
|
|
foo324()
|
|
|
|
|
foo325()
|
|
|
|
|
foo326()
|
|
|
|
|
foo327()
|
|
|
|
|
foo328()
|
|
|
|
|
foo329()
|
|
|
|
|
foo330()
|
|
|
|
|
foo331()
|
|
|
|
|
foo332()
|
|
|
|
|
foo333()
|
|
|
|
|
foo334()
|
|
|
|
|
foo335()
|
|
|
|
|
foo336()
|
|
|
|
|
foo337()
|
|
|
|
|
foo338()
|
|
|
|
|
foo339()
|
|
|
|
|
foo340()
|
|
|
|
|
foo341()
|
|
|
|
|
foo342()
|
|
|
|
|
foo343()
|
|
|
|
|
foo344()
|
|
|
|
|
foo345()
|
|
|
|
|
foo346()
|
|
|
|
|
foo347()
|
|
|
|
|
foo348()
|
|
|
|
|
foo349()
|
|
|
|
|
foo350()
|
|
|
|
|
foo351()
|
|
|
|
|
foo352()
|
|
|
|
|
foo353()
|
|
|
|
|
foo354()
|
|
|
|
|
foo355()
|
|
|
|
|
foo356()
|
|
|
|
|
foo357()
|
|
|
|
|
foo358()
|
|
|
|
|
foo359()
|
|
|
|
|
foo360()
|
|
|
|
|
foo361()
|
|
|
|
|
foo362()
|
|
|
|
|
foo363()
|
|
|
|
|
foo364()
|
|
|
|
|
foo365()
|
|
|
|
|
foo366()
|
|
|
|
|
foo367()
|
|
|
|
|
foo368()
|
|
|
|
|
foo369()
|
|
|
|
|
foo370()
|
|
|
|
|
foo371()
|
|
|
|
|
foo372()
|
|
|
|
|
foo373()
|
|
|
|
|
foo374()
|
|
|
|
|
foo375()
|
|
|
|
|
foo376()
|
|
|
|
|
foo377()
|
|
|
|
|
foo378()
|
|
|
|
|
foo379()
|
|
|
|
|
foo380()
|
|
|
|
|
foo381()
|
|
|
|
|
foo382()
|
|
|
|
|
foo383()
|
|
|
|
|
foo384()
|
|
|
|
|
foo385()
|
|
|
|
|
foo386()
|
|
|
|
|
foo387()
|
|
|
|
|
foo388()
|
|
|
|
|
foo389()
|
|
|
|
|
foo390()
|
|
|
|
|
foo391()
|
|
|
|
|
foo392()
|
|
|
|
|
foo393()
|
|
|
|
|
foo394()
|
|
|
|
|
foo395()
|
|
|
|
|
foo396()
|
|
|
|
|
foo397()
|
|
|
|
|
foo398()
|
|
|
|
|
foo399()
|
|
|
|
|
foo400()
|
|
|
|
|
foo401()
|
|
|
|
|
foo402()
|
|
|
|
|
foo403()
|
|
|
|
|
foo404()
|
|
|
|
|
foo405()
|
|
|
|
|
foo406()
|
|
|
|
|
foo407()
|
|
|
|
|
foo408()
|
|
|
|
|
foo409()
|
|
|
|
|
foo410()
|
|
|
|
|
foo411()
|
|
|
|
|
foo412()
|
|
|
|
|
foo413()
|
|
|
|
|
foo414()
|
|
|
|
|
foo415()
|
|
|
|
|
foo416()
|
|
|
|
|
foo417()
|
|
|
|
|
foo418()
|
|
|
|
|
foo419()
|
|
|
|
|
foo420()
|
|
|
|
|
foo421()
|
|
|
|
|
foo422()
|
|
|
|
|
foo423()
|
|
|
|
|
foo424()
|
|
|
|
|
foo425()
|
|
|
|
|
foo426()
|
|
|
|
|
foo427()
|
|
|
|
|
foo428()
|
|
|
|
|
foo429()
|
|
|
|
|
foo430()
|
|
|
|
|
foo431()
|
|
|
|
|
foo432()
|
|
|
|
|
foo433()
|
|
|
|
|
foo434()
|
|
|
|
|
foo435()
|
|
|
|
|
foo436()
|
|
|
|
|
foo437()
|
|
|
|
|
foo438()
|
|
|
|
|
foo439()
|
|
|
|
|
foo440()
|
|
|
|
|
foo441()
|
|
|
|
|
foo442()
|
|
|
|
|
foo443()
|
|
|
|
|
foo444()
|
|
|
|
|
foo445()
|
|
|
|
|
foo446()
|
|
|
|
|
foo447()
|
|
|
|
|
foo448()
|
|
|
|
|
foo449()
|
|
|
|
|
foo450()
|
|
|
|
|
foo451()
|
|
|
|
|
foo452()
|
|
|
|
|
foo453()
|
|
|
|
|
foo454()
|
|
|
|
|
foo455()
|
|
|
|
|
foo456()
|
|
|
|
|
foo457()
|
|
|
|
|
foo458()
|
|
|
|
|
foo459()
|
|
|
|
|
foo460()
|
|
|
|
|
foo461()
|
|
|
|
|
foo462()
|
|
|
|
|
foo463()
|
|
|
|
|
foo464()
|
|
|
|
|
foo465()
|
|
|
|
|
foo466()
|
|
|
|
|
foo467()
|
|
|
|
|
foo468()
|
|
|
|
|
foo469()
|
|
|
|
|
foo470()
|
|
|
|
|
foo471()
|
|
|
|
|
foo472()
|
|
|
|
|
foo473()
|
|
|
|
|
foo474()
|
|
|
|
|
foo475()
|
|
|
|
|
foo476()
|
|
|
|
|
foo477()
|
|
|
|
|
foo478()
|
|
|
|
|
foo479()
|
|
|
|
|
foo480()
|
|
|
|
|
foo481()
|
|
|
|
|
foo482()
|
|
|
|
|
foo483()
|
|
|
|
|
foo484()
|
|
|
|
|
foo485()
|
|
|
|
|
foo486()
|
|
|
|
|
foo487()
|
|
|
|
|
foo488()
|
|
|
|
|
foo489()
|
|
|
|
|
foo490()
|
|
|
|
|
foo491()
|
|
|
|
|
foo492()
|
|
|
|
|
foo493()
|
|
|
|
|
foo494()
|
|
|
|
|
foo495()
|
|
|
|
|
foo496()
|
|
|
|
|
foo497()
|
|
|
|
|
foo498()
|
|
|
|
|
foo499()
|
|
|
|
|
foo500()
|
|
|
|
|
foo501()
|
|
|
|
|
foo502()
|
|
|
|
|
foo503()
|
|
|
|
|
foo504()
|
|
|
|
|
foo505()
|
|
|
|
|
foo506()
|
|
|
|
|
foo507()
|
|
|
|
|
foo508()
|
|
|
|
|
foo509()
|
|
|
|
|
foo510()
|
|
|
|
|
foo511()
|
|
|
|
|
}
|
|
|
|
|
func $(i I) {
|
|
|
|
|
i.foo511()
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"TESTB"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(i I) {
|
|
|
|
|
i.foo001()
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
neg: []string{"TESTB"},
|
|
|
|
|
},
|
2017-02-20 17:17:28 +01:00
|
|
|
}
|
2016-12-08 16:17:20 -08:00
|
|
|
|
2017-02-20 17:17:28 +01:00
|
|
|
var linux386Tests = []*asmTest{
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-02-20 17:17:28 +01:00
|
|
|
func f0(b []byte) uint32 {
|
|
|
|
|
return binary.LittleEndian.Uint32(b)
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVL\t\\(.*\\),"},
|
2016-12-08 16:17:20 -08:00
|
|
|
},
|
2017-02-20 17:17:28 +01:00
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-02-20 17:17:28 +01:00
|
|
|
func f1(b []byte, i int) uint32 {
|
|
|
|
|
return binary.LittleEndian.Uint32(b[i:])
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
|
2016-12-08 16:17:20 -08:00
|
|
|
},
|
2017-08-17 21:20:25 +02:00
|
|
|
|
2017-10-05 16:05:03 +02:00
|
|
|
// multiplication by powers of two
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(n int) int {
|
|
|
|
|
return 32*n
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"SHLL"},
|
|
|
|
|
neg: []string{"IMULL"},
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
fn: `
|
|
|
|
|
func $(n int) int {
|
|
|
|
|
return -64*n
|
|
|
|
|
}
|
|
|
|
|
`,
|
|
|
|
|
pos: []string{"SHLL"},
|
|
|
|
|
neg: []string{"IMULL"},
|
|
|
|
|
},
|
|
|
|
|
|
2017-08-14 11:44:09 +02:00
|
|
|
// multiplication merging tests
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-17 21:20:25 +02:00
|
|
|
func $(n int) int {
|
2017-08-14 11:44:09 +02:00
|
|
|
return 9*n + 14*n
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tIMULL\t[$]23"}, // 23*n
|
2017-08-14 11:44:09 +02:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-08-17 21:20:25 +02:00
|
|
|
func $(a, n int) int {
|
cmd/compile: combine x*n + y*n into (x+y)*n
There are a few cases where this can be useful. Apart from the obvious
(and silly)
100*n + 200*n
where we generate one IMUL instead of two, consider:
15*n + 31*n
Currently, the compiler strength-reduces both imuls, generating:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 MOVQ AX, CX
0x0008 00008 SHLQ $4, AX
0x000c 00012 SUBQ CX, AX
0x000f 00015 MOVQ CX, DX
0x0012 00018 SHLQ $5, CX
0x0016 00022 SUBQ DX, CX
0x0019 00025 ADDQ CX, AX
0x001c 00028 MOVQ AX, "".~r1+16(SP)
0x0021 00033 RET
But combining the imuls is both faster and shorter:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $46, AX
0x0009 00009 MOVQ AX, "".~r1+16(SP)
0x000e 00014 RET
even without strength-reduction.
Moreover, consider:
5*n + 7*(n+1) + 11*(n+2)
We already have a rule that rewrites 7(n+1) into 7n+7, so the
generated code (without imuls merging) looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 LEAQ (AX)(AX*4), CX
0x0009 00009 MOVQ AX, DX
0x000c 00012 NEGQ AX
0x000f 00015 LEAQ (AX)(DX*8), AX
0x0013 00019 ADDQ CX, AX
0x0016 00022 LEAQ (DX)(CX*2), CX
0x001a 00026 LEAQ 29(AX)(CX*1), AX
0x001f 00031 MOVQ AX, "".~r1+16(SP)
But with imuls merging, the 5n, 7n and 11n factors get merged, and the
generated code looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $23, AX
0x0009 00009 ADDQ $29, AX
0x000d 00013 MOVQ AX, "".~r1+16(SP)
0x0012 00018 RET
Which is both faster and shorter; that's also the exact same code that
clang and the intel c compiler generate for the above expression.
Change-Id: Ib4d5503f05d2f2efe31a1be14e2fe6cac33730a9
Reviewed-on: https://go-review.googlesource.com/55143
Reviewed-by: Keith Randall <khr@golang.org>
2017-08-14 11:44:09 +02:00
|
|
|
return 19*a + a*n
|
|
|
|
|
}`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tADDL\t[$]19", "\tIMULL"}, // (n+19)*a
|
cmd/compile: combine x*n + y*n into (x+y)*n
There are a few cases where this can be useful. Apart from the obvious
(and silly)
100*n + 200*n
where we generate one IMUL instead of two, consider:
15*n + 31*n
Currently, the compiler strength-reduces both imuls, generating:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 MOVQ AX, CX
0x0008 00008 SHLQ $4, AX
0x000c 00012 SUBQ CX, AX
0x000f 00015 MOVQ CX, DX
0x0012 00018 SHLQ $5, CX
0x0016 00022 SUBQ DX, CX
0x0019 00025 ADDQ CX, AX
0x001c 00028 MOVQ AX, "".~r1+16(SP)
0x0021 00033 RET
But combining the imuls is both faster and shorter:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $46, AX
0x0009 00009 MOVQ AX, "".~r1+16(SP)
0x000e 00014 RET
even without strength-reduction.
Moreover, consider:
5*n + 7*(n+1) + 11*(n+2)
We already have a rule that rewrites 7(n+1) into 7n+7, so the
generated code (without imuls merging) looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 LEAQ (AX)(AX*4), CX
0x0009 00009 MOVQ AX, DX
0x000c 00012 NEGQ AX
0x000f 00015 LEAQ (AX)(DX*8), AX
0x0013 00019 ADDQ CX, AX
0x0016 00022 LEAQ (DX)(CX*2), CX
0x001a 00026 LEAQ 29(AX)(CX*1), AX
0x001f 00031 MOVQ AX, "".~r1+16(SP)
But with imuls merging, the 5n, 7n and 11n factors get merged, and the
generated code looks like this:
0x0000 00000 MOVQ "".n+8(SP), AX
0x0005 00005 IMULQ $23, AX
0x0009 00009 ADDQ $29, AX
0x000d 00013 MOVQ AX, "".~r1+16(SP)
0x0012 00018 RET
Which is both faster and shorter; that's also the exact same code that
clang and the intel c compiler generate for the above expression.
Change-Id: Ib4d5503f05d2f2efe31a1be14e2fe6cac33730a9
Reviewed-on: https://go-review.googlesource.com/55143
Reviewed-by: Keith Randall <khr@golang.org>
2017-08-14 11:44:09 +02:00
|
|
|
},
|
2017-03-29 14:01:41 -04:00
|
|
|
{
|
|
|
|
|
// check that stack store is optimized away
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-29 14:01:41 -04:00
|
|
|
func $() int {
|
|
|
|
|
var x int
|
|
|
|
|
return *(&x)
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"TEXT\t.*, [$]0-4"},
|
2017-03-29 14:01:41 -04:00
|
|
|
},
|
2017-08-17 15:06:42 +08:00
|
|
|
{
|
2017-09-03 08:52:34 -07:00
|
|
|
fn: `
|
2017-08-17 15:06:42 +08:00
|
|
|
func mul3(n int) int {
|
|
|
|
|
return 23*n - 9*n
|
|
|
|
|
}`,
|
2017-09-03 08:52:34 -07:00
|
|
|
pos: []string{"\tIMULL\t[$]14"}, // 14*n
|
2017-08-17 15:06:42 +08:00
|
|
|
},
|
|
|
|
|
{
|
2017-09-03 08:52:34 -07:00
|
|
|
fn: `
|
2017-08-17 15:06:42 +08:00
|
|
|
func mul4(a, n int) int {
|
|
|
|
|
return n*a - a*19
|
|
|
|
|
}`,
|
2017-09-03 08:52:34 -07:00
|
|
|
pos: []string{"\tADDL\t[$]-19", "\tIMULL"}, // (n-19)*a
|
2017-08-17 15:06:42 +08:00
|
|
|
},
|
cmd/compile: optimize signed non-negative div/mod by a power of 2
This CL optimizes assembly for len() or cap() division
by a power of 2 constants:
func lenDiv(s []int) int {
return len(s) / 16
}
amd64 assembly before the CL:
MOVQ "".s+16(SP), AX
MOVQ AX, CX
SARQ $63, AX
SHRQ $60, AX
ADDQ CX, AX
SARQ $4, AX
MOVQ AX, "".~r1+32(SP)
RET
amd64 assembly after the CL:
MOVQ "".s+16(SP), AX
SHRQ $4, AX
MOVQ AX, "".~r1+32(SP)
RET
The CL relies on the fact that len() and cap() result cannot
be negative.
Trigger stats for the added SSA rules on linux/amd64 when running
make.bash:
46 Div64
12 Mod64
The added SSA rules may trigger on more cases in the future
when SSA values will be populated with the info on their
lower bounds.
For instance:
func f(i int16) int16 {
if i < 3 {
return -1
}
// Lower bound of i is 3 here -> i is non-negative,
// so unsigned arithmetics may be used here.
return i % 16
}
Change-Id: I8bc6be5a03e71157ced533c01416451ff6f1a7f0
Reviewed-on: https://go-review.googlesource.com/65530
Reviewed-by: Keith Randall <khr@golang.org>
2017-09-23 00:34:37 +03:00
|
|
|
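	// len and cap results are never negative, so the compiler may treat a
	// signed division or modulus by a constant power of two as unsigned and
	// lower it to a single shift or mask, e.g. len(a)/1024 becomes len(a)>>10.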
	// Check that len() and cap() div by a constant power of two
	// are compiled into SHRL.
	{
		fn: `
		func $(a []int) int {
			return len(a) / 1024
		}
		`,
		pos: []string{"\tSHRL\t\\$10,"},
	},
	{
		fn: `
		func $(s string) int {
			return len(s) / (4097 >> 1)
		}
		`,
		pos: []string{"\tSHRL\t\\$11,"},
	},
	{
		fn: `
		func $(a []int) int {
			return cap(a) / ((1 << 11) + 2048)
		}
		`,
		pos: []string{"\tSHRL\t\\$12,"},
	},
	// Check that len() and cap() mod by a constant power of two
	// are compiled into ANDL.
	{
		fn: `
		func $(a []int) int {
			return len(a) % 1024
		}
		`,
		pos: []string{"\tANDL\t\\$1023,"},
	},
	{
		fn: `
		func $(s string) int {
			return len(s) % (4097 >> 1)
		}
		`,
		pos: []string{"\tANDL\t\\$2047,"},
	},
	{
		fn: `
		func $(a []int) int {
			return cap(a) % ((1 << 11) + 2048)
		}
		`,
		pos: []string{"\tANDL\t\\$4095,"},
	},
	{
		// Test that small memmove was replaced with direct movs
		fn: `
		func $() {
			x := [...]byte{1, 2, 3, 4, 5, 6, 7}
			copy(x[1:], x[:])
		}
		`,
		neg: []string{"memmove"},
	},
	{
		// Same as above but with different size
		fn: `
		func $() {
			x := [...]byte{1, 2, 3, 4}
			copy(x[1:], x[:])
		}
		`,
		neg: []string{"memmove"},
	},
}

var linuxS390XTests = []*asmTest{
	{
		fn: `
		func f0(b []byte) uint32 {
			return binary.LittleEndian.Uint32(b)
		}
		`,
		pos: []string{"\tMOVWBR\t\\(.*\\),"},
	},
	{
		fn: `
		func f1(b []byte, i int) uint32 {
			return binary.LittleEndian.Uint32(b[i:])
		}
		`,
		pos: []string{"\tMOVWBR\t\\(.*\\)\\(.*\\*1\\),"},
	},
	{
		fn: `
		func f2(b []byte) uint64 {
			return binary.LittleEndian.Uint64(b)
		}
		`,
		pos: []string{"\tMOVDBR\t\\(.*\\),"},
	},
	{
		fn: `
		func f3(b []byte, i int) uint64 {
			return binary.LittleEndian.Uint64(b[i:])
		}
		`,
		pos: []string{"\tMOVDBR\t\\(.*\\)\\(.*\\*1\\),"},
	},
	{
		fn: `
		func f4(b []byte) uint32 {
			return binary.BigEndian.Uint32(b)
		}
		`,
		pos: []string{"\tMOVWZ\t\\(.*\\),"},
	},
	{
		fn: `
		func f5(b []byte, i int) uint32 {
			return binary.BigEndian.Uint32(b[i:])
		}
		`,
		pos: []string{"\tMOVWZ\t\\(.*\\)\\(.*\\*1\\),"},
	},
	{
		fn: `
		func f6(b []byte) uint64 {
			return binary.BigEndian.Uint64(b)
		}
		`,
		pos: []string{"\tMOVD\t\\(.*\\),"},
	},
	{
		fn: `
		func f7(b []byte, i int) uint64 {
			return binary.BigEndian.Uint64(b[i:])
		}
		`,
		pos: []string{"\tMOVD\t\\(.*\\)\\(.*\\*1\\),"},
	},
	{
		fn: `
		func f8(x uint64) uint64 {
			return x<<7 + x>>57
		}
		`,
		pos: []string{"\tRLLG\t[$]7,"},
	},
	{
		fn: `
		func f9(x uint64) uint64 {
			return x<<7 | x>>57
		}
		`,
		pos: []string{"\tRLLG\t[$]7,"},
	},
	{
		fn: `
		func f10(x uint64) uint64 {
			return x<<7 ^ x>>57
		}
		`,
		pos: []string{"\tRLLG\t[$]7,"},
	},
	{
		fn: `
		func f11(x uint32) uint32 {
			return x<<7 + x>>25
		}
		`,
		pos: []string{"\tRLL\t[$]7,"},
	},
	{
		fn: `
		func f12(x uint32) uint32 {
			return x<<7 | x>>25
		}
		`,
		pos: []string{"\tRLL\t[$]7,"},
	},
	{
		fn: `
		func f13(x uint32) uint32 {
			return x<<7 ^ x>>25
		}
		`,
		pos: []string{"\tRLL\t[$]7,"},
	},
	// Fused multiply-add/sub instructions.
	{
		fn: `
		func f14(x, y, z float64) float64 {
			return x * y + z
		}
		`,
		pos: []string{"\tFMADD\t"},
	},
	{
		fn: `
		func f15(x, y, z float64) float64 {
			return x * y - z
		}
		`,
		pos: []string{"\tFMSUB\t"},
	},
	{
		fn: `
		func f16(x, y, z float32) float32 {
			return x * y + z
		}
		`,
		pos: []string{"\tFMADDS\t"},
	},
	{
		fn: `
		func f17(x, y, z float32) float32 {
			return x * y - z
		}
		`,
		pos: []string{"\tFMSUBS\t"},
	},
	// Intrinsic tests for math/bits
	{
		fn: `
		func f18(a uint64) int {
			return bits.TrailingZeros64(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f19(a uint32) int {
			return bits.TrailingZeros32(a)
		}
		`,
		pos: []string{"\tFLOGR\t", "\tMOVWZ\t"},
	},
	{
		fn: `
		func f20(a uint16) int {
			return bits.TrailingZeros16(a)
		}
		`,
		pos: []string{"\tFLOGR\t", "\tOR\t\\$65536,"},
	},
	{
		fn: `
		func f21(a uint8) int {
			return bits.TrailingZeros8(a)
		}
		`,
		pos: []string{"\tFLOGR\t", "\tOR\t\\$256,"},
	},
	// Intrinsic tests for math/bits
	{
		fn: `
		func f22(a uint64) uint64 {
			return bits.ReverseBytes64(a)
		}
		`,
		pos: []string{"\tMOVDBR\t"},
	},
	{
		fn: `
		func f23(a uint32) uint32 {
			return bits.ReverseBytes32(a)
		}
		`,
		pos: []string{"\tMOVWBR\t"},
	},
	{
		fn: `
		func f24(a uint64) int {
			return bits.Len64(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f25(a uint32) int {
			return bits.Len32(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f26(a uint16) int {
			return bits.Len16(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f27(a uint8) int {
			return bits.Len8(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f28(a uint) int {
			return bits.Len(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f29(a uint64) int {
			return bits.LeadingZeros64(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f30(a uint32) int {
			return bits.LeadingZeros32(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f31(a uint16) int {
			return bits.LeadingZeros16(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f32(a uint8) int {
			return bits.LeadingZeros8(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	{
		fn: `
		func f33(a uint) int {
			return bits.LeadingZeros(a)
		}
		`,
		pos: []string{"\tFLOGR\t"},
	},
	// Intrinsic tests for math.
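	// Each rounding function below should be intrinsified into a single
	// FIDBR; the mask operand selects the rounding mode (the tests expect
	// $6 for Ceil, $7 for Floor, $1 for Round, $5 for Trunc, $4 for RoundToEven).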
	{
		fn: `
		func ceil(x float64) float64 {
			return math.Ceil(x)
		}
		`,
		pos: []string{"\tFIDBR\t[$]6"},
	},
	{
		fn: `
		func floor(x float64) float64 {
			return math.Floor(x)
		}
		`,
		pos: []string{"\tFIDBR\t[$]7"},
	},
	{
		fn: `
		func round(x float64) float64 {
			return math.Round(x)
		}
		`,
		pos: []string{"\tFIDBR\t[$]1"},
	},
	{
		fn: `
		func trunc(x float64) float64 {
			return math.Trunc(x)
		}
		`,
		pos: []string{"\tFIDBR\t[$]5"},
	},
	{
		fn: `
		func roundToEven(x float64) float64 {
			return math.RoundToEven(x)
		}
		`,
		pos: []string{"\tFIDBR\t[$]4"},
	},
	{
		// check that stack store is optimized away
		fn: `
		func $() int {
			var x int
			return *(&x)
		}
		`,
		pos: []string{"TEXT\t.*, [$]0-8"},
	},
	// Constant propagation through raw bits conversions.
	{
		// uint32 constant converted to float32 constant
		fn: `
		func $(x float32) float32 {
			if x > math.Float32frombits(0x3f800000) {
				return -x
			}
			return x
		}
		`,
		pos: []string{"\tFMOVS\t[$]f32.3f800000\\(SB\\)"},
	},
	{
		// float32 constant converted to uint32 constant
		fn: `
		func $(x uint32) uint32 {
			if x > math.Float32bits(1) {
				return -x
			}
			return x
		}
		`,
		neg: []string{"\tFMOVS\t"},
	},
	// Constant propagation through float comparisons.
	{
		fn: `
		func $() bool {
			return 0.5 == float64(uint32(1)) ||
				1.5 > float64(uint64(1<<63)) ||
				math.NaN() == math.NaN()
		}
		`,
		pos: []string{"\tMOV(B|BZ|D)\t[$]0,"},
		neg: []string{"\tFCMPU\t", "\tMOV(B|BZ|D)\t[$]1,"},
	},
	{
		fn: `
		func $() bool {
			return float32(0.5) <= float32(int64(1)) &&
				float32(1.5) >= float32(int32(-1<<31)) &&
				float32(math.NaN()) != float32(math.NaN())
		}
		`,
		pos: []string{"\tMOV(B|BZ|D)\t[$]1,"},
		neg: []string{"\tCEBR\t", "\tMOV(B|BZ|D)\t[$]0,"},
	},
	// math tests
	{
		fn: `
		func $(x float64) float64 {
			return math.Abs(x)
		}
		`,
		pos: []string{"\tLPDFR\t"},
		neg: []string{"\tMOVD\t"}, // no integer loads/stores
	},
	{
		fn: `
		func $(x float32) float32 {
			return float32(math.Abs(float64(x)))
		}
		`,
		pos: []string{"\tLPDFR\t"},
		neg: []string{"\tLDEBR\t", "\tLEDBR\t"}, // no float64 conversion
	},
	{
		fn: `
		func $(x float64) float64 {
			return math.Float64frombits(math.Float64bits(x)|1<<63)
		}
		`,
		pos: []string{"\tLNDFR\t"},
		neg: []string{"\tMOVD\t"}, // no integer loads/stores
	},
	{
		fn: `
		func $(x float64) float64 {
			return -math.Abs(x)
		}
		`,
		pos: []string{"\tLNDFR\t"},
		neg: []string{"\tMOVD\t"}, // no integer loads/stores
	},
	{
		fn: `
		func $(x, y float64) float64 {
			return math.Copysign(x, y)
		}
		`,
		pos: []string{"\tCPSDR\t"},
		neg: []string{"\tMOVD\t"}, // no integer loads/stores
	},
	{
		fn: `
		func $(x float64) float64 {
			return math.Copysign(x, -1)
		}
		`,
		pos: []string{"\tLNDFR\t"},
		neg: []string{"\tMOVD\t"}, // no integer loads/stores
	},
	{
		fn: `
		func $(x float64) float64 {
			return math.Copysign(-1, x)
		}
		`,
		pos: []string{"\tCPSDR\t"},
		neg: []string{"\tMOVD\t"}, // no integer loads/stores
	},
}

var linuxARMTests = []*asmTest{
	// multiplication by powers of two
	{
		fn: `
		func $(n int) int {
			return 16*n
		}
		`,
		pos: []string{"\tSLL\t[$]4"},
		neg: []string{"\tMUL\t"},
	},
	{
		fn: `
		func $(n int) int {
			return -32*n
		}
		`,
		pos: []string{"\tSLL\t[$]5"},
		neg: []string{"\tMUL\t"},
	},

	{
		fn: `
		func f0(x uint32) uint32 {
			return x<<7 + x>>25
		}
		`,
		pos: []string{"\tMOVW\tR[0-9]+@>25,"},
	},
	{
		fn: `
		func f1(x uint32) uint32 {
			return x<<7 | x>>25
		}
		`,
		pos: []string{"\tMOVW\tR[0-9]+@>25,"},
	},
	{
		fn: `
		func f2(x uint32) uint32 {
			return x<<7 ^ x>>25
		}
		`,
		pos: []string{"\tMOVW\tR[0-9]+@>25,"},
	},
	{
		fn: `
		func f3(a uint64) int {
			return bits.Len64(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f4(a uint32) int {
			return bits.Len32(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f5(a uint16) int {
			return bits.Len16(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f6(a uint8) int {
			return bits.Len8(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f7(a uint) int {
			return bits.Len(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f8(a uint64) int {
			return bits.LeadingZeros64(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f9(a uint32) int {
			return bits.LeadingZeros32(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f10(a uint16) int {
			return bits.LeadingZeros16(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f11(a uint8) int {
			return bits.LeadingZeros8(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f12(a uint) int {
			return bits.LeadingZeros(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		// make sure assembly output has matching offset and base register.
		fn: `
		func f13(a, b int) int {
			runtime.GC() // use some frame
			return b
		}
		`,
		pos: []string{"b\\+4\\(FP\\)"},
	},
	{
		// check that stack store is optimized away
		fn: `
		func $() int {
			var x int
			return *(&x)
		}
		`,
		pos: []string{"TEXT\t.*, [$]-4-4"},
	},
}

var linuxARM64Tests = []*asmTest{
	// multiplication by powers of two
	{
		fn: `
		func $(n int) int {
			return 64*n
		}
		`,
		pos: []string{"\tLSL\t[$]6"},
		neg: []string{"\tMUL\t"},
	},
	{
		fn: `
		func $(n int) int {
			return -128*n
		}
		`,
		pos: []string{"\tLSL\t[$]7"},
		neg: []string{"\tMUL\t"},
	},

	{
		fn: `
		func f0(x uint64) uint64 {
			return x<<7 + x>>57
		}
		`,
		pos: []string{"\tROR\t[$]57,"},
	},
	{
		fn: `
		func f1(x uint64) uint64 {
			return x<<7 | x>>57
		}
		`,
		pos: []string{"\tROR\t[$]57,"},
	},
	{
		fn: `
		func f2(x uint64) uint64 {
			return x<<7 ^ x>>57
		}
		`,
		pos: []string{"\tROR\t[$]57,"},
	},
	{
		fn: `
		func f3(x uint32) uint32 {
			return x<<7 + x>>25
		}
		`,
		pos: []string{"\tRORW\t[$]25,"},
	},
	{
		fn: `
		func f4(x uint32) uint32 {
			return x<<7 | x>>25
		}
		`,
		pos: []string{"\tRORW\t[$]25,"},
	},
	{
		fn: `
		func f5(x uint32) uint32 {
			return x<<7 ^ x>>25
		}
		`,
		pos: []string{"\tRORW\t[$]25,"},
	},
	{
		fn: `
		func f22(a uint64) uint64 {
			return bits.ReverseBytes64(a)
		}
		`,
		pos: []string{"\tREV\t"},
	},
	{
		fn: `
		func f23(a uint32) uint32 {
			return bits.ReverseBytes32(a)
		}
		`,
		pos: []string{"\tREVW\t"},
	},
	{
		fn: `
		func f24(a uint64) int {
			return bits.Len64(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f25(a uint32) int {
			return bits.Len32(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f26(a uint16) int {
			return bits.Len16(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f27(a uint8) int {
			return bits.Len8(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f28(a uint) int {
			return bits.Len(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f29(a uint64) int {
			return bits.LeadingZeros64(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f30(a uint32) int {
			return bits.LeadingZeros32(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f31(a uint16) int {
			return bits.LeadingZeros16(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f32(a uint8) int {
			return bits.LeadingZeros8(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f33(a uint) int {
			return bits.LeadingZeros(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f34(a uint64) uint64 {
			return a & ((1<<63)-1)
		}
		`,
		pos: []string{"\tAND\t"},
	},
	{
		fn: `
		func f35(a uint64) uint64 {
			return a & (1<<63)
		}
		`,
		pos: []string{"\tAND\t"},
	},
	{
		// make sure offsets are folded into load and store.
		fn: `
		func f36(_, a [20]byte) (b [20]byte) {
			b = a
			return
		}
		`,
		pos: []string{"\tMOVD\t\"\"\\.a\\+[0-9]+\\(FP\\), R[0-9]+", "\tMOVD\tR[0-9]+, \"\"\\.b\\+[0-9]+\\(FP\\)"},
	},
	{
		// check that stack store is optimized away
		fn: `
		func $() int {
			var x int
			return *(&x)
		}
		`,
		pos: []string{"TEXT\t.*, [$]-8-8"},
	},
	{
		// check that we don't emit comparisons for constant shift
		fn: `
		//go:nosplit
		func $(x int) int {
			return x << 17
		}
		`,
		pos: []string{"LSL\t\\$17"},
		neg: []string{"CMP"},
	},
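	// Comparing a signed value against zero only needs its sign bit, so the
	// branches below should compile to a single test-bit-and-branch
	// (TBNZ/TBZ) on that bit rather than a CMP plus conditional branch.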
	{
		fn: `
		func $(a int32, ptr *int) {
			if a >= 0 {
				*ptr = 0
			}
		}
		`,
		pos: []string{"TBNZ"},
	},
	{
		fn: `
		func $(a int64, ptr *int) {
			if a >= 0 {
				*ptr = 0
			}
		}
		`,
		pos: []string{"TBNZ"},
	},
	{
		fn: `
		func $(a int32, ptr *int) {
			if a < 0 {
				*ptr = 0
			}
		}
		`,
		pos: []string{"TBZ"},
	},
	{
		fn: `
		func $(a int64, ptr *int) {
			if a < 0 {
				*ptr = 0
			}
		}
		`,
		pos: []string{"TBZ"},
	},
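	// The math/bits.OnesCount* calls should be intrinsified: the bits are
	// counted per byte with a vector CNT (VCNT) and the byte counts are then
	// summed across the vector with VUADDLV.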
	{
		fn: `
		func $(x uint64) int {
			return bits.OnesCount64(x)
		}
		`,
		pos: []string{"\tVCNT\t", "\tVUADDLV\t"},
	},
	{
		fn: `
		func $(x uint32) int {
			return bits.OnesCount32(x)
		}
		`,
		pos: []string{"\tVCNT\t", "\tVUADDLV\t"},
	},
	{
		fn: `
		func $(x uint16) int {
			return bits.OnesCount16(x)
		}
		`,
		pos: []string{"\tVCNT\t", "\tVUADDLV\t"},
	},
	// Load-combining tests.
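	// Byte-by-byte loads assembled via encoding/binary should be combined
	// into a single wide load (MOVD, MOVWU or MOVHU); the big-endian cases
	// additionally expect a REV/REVW byte reversal of the loaded value.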
	{
		fn: `
		func $(b []byte) uint64 {
			return binary.LittleEndian.Uint64(b)
		}
		`,
		pos: []string{"\tMOVD\t\\(R[0-9]+\\)"},
	},
	{
		fn: `
		func $(b []byte, i int) uint64 {
			return binary.LittleEndian.Uint64(b[i:])
		}
		`,
		pos: []string{"\tMOVD\t\\(R[0-9]+\\)"},
	},
	{
		fn: `
		func $(b []byte) uint32 {
			return binary.LittleEndian.Uint32(b)
		}
		`,
		pos: []string{"\tMOVWU\t\\(R[0-9]+\\)"},
	},
	{
		fn: `
		func $(b []byte, i int) uint32 {
			return binary.LittleEndian.Uint32(b[i:])
		}
		`,
		pos: []string{"\tMOVWU\t\\(R[0-9]+\\)"},
	},
	{
		fn: `
		func $(b []byte) uint64 {
			return binary.BigEndian.Uint64(b)
		}
		`,
		pos: []string{"\tREV\t"},
	},
	{
		fn: `
		func $(b []byte, i int) uint64 {
			return binary.BigEndian.Uint64(b[i:])
		}
		`,
		pos: []string{"\tREV\t"},
	},
	{
		fn: `
		func $(b []byte) uint32 {
			return binary.BigEndian.Uint32(b)
		}
		`,
		pos: []string{"\tREVW\t"},
	},
	{
		fn: `
		func $(b []byte, i int) uint32 {
			return binary.BigEndian.Uint32(b[i:])
		}
		`,
		pos: []string{"\tREVW\t"},
	},
	{
		fn: `
		func $(s []byte) uint16 {
			return uint16(s[0]) | uint16(s[1]) << 8
		}
		`,
		pos: []string{"\tMOVHU\t\\(R[0-9]+\\)"},
		neg: []string{"ORR\tR[0-9]+<<8\t"},
	},
	// Intrinsic tests for math.
	{
		fn: `
		func sqrt(x float64) float64 {
			return math.Sqrt(x)
		}
		`,
		pos: []string{"FSQRTD"},
	},
	{
		fn: `
		func ceil(x float64) float64 {
			return math.Ceil(x)
		}
		`,
		pos: []string{"FRINTPD"},
	},
	{
		fn: `
		func floor(x float64) float64 {
			return math.Floor(x)
		}
		`,
		pos: []string{"FRINTMD"},
	},
	{
		fn: `
		func round(x float64) float64 {
			return math.Round(x)
		}
		`,
		pos: []string{"FRINTAD"},
	},
	{
		fn: `
		func trunc(x float64) float64 {
			return math.Trunc(x)
		}
		`,
		pos: []string{"FRINTZD"},
	},
	{
		// make sure that CSEL is emitted for conditional moves
		fn: `
		func f37(c int) int {
			x := c + 4
			if c < 0 {
				x = 182
			}
			return x
		}
		`,
		pos: []string{"\tCSEL\t"},
	},
}

var linuxMIPSTests = []*asmTest{
	{
		fn: `
		func f0(a uint64) int {
			return bits.Len64(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f1(a uint32) int {
			return bits.Len32(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f2(a uint16) int {
			return bits.Len16(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f3(a uint8) int {
			return bits.Len8(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f4(a uint) int {
			return bits.Len(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f5(a uint64) int {
			return bits.LeadingZeros64(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f6(a uint32) int {
			return bits.LeadingZeros32(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f7(a uint16) int {
			return bits.LeadingZeros16(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f8(a uint8) int {
			return bits.LeadingZeros8(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		fn: `
		func f9(a uint) int {
			return bits.LeadingZeros(a)
		}
		`,
		pos: []string{"\tCLZ\t"},
	},
	{
		// check that stack store is optimized away
		fn: `
		func $() int {
			var x int
			return *(&x)
		}
		`,
		pos: []string{"TEXT\t.*, [$]-4-4"},
	},
}

var linuxMIPS64Tests = []*asmTest{
	{
		// check that we don't emit comparisons for constant shift
		fn: `
		func $(x int) int {
			return x << 17
		}
		`,
		pos: []string{"SLLV\t\\$17"},
		neg: []string{"SGT"},
	},
}

var linuxPPC64LETests = []*asmTest{
|
|
|
|
|
// Fused multiply-add/sub instructions.
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-13 14:39:17 -04:00
|
|
|
func f0(x, y, z float64) float64 {
|
|
|
|
|
return x * y + z
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tFMADD\t"},
|
2017-03-13 14:39:17 -04:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-13 14:39:17 -04:00
|
|
|
func f1(x, y, z float64) float64 {
|
|
|
|
|
return x * y - z
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tFMSUB\t"},
|
2017-03-13 14:39:17 -04:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-13 14:39:17 -04:00
|
|
|
func f2(x, y, z float32) float32 {
|
|
|
|
|
return x * y + z
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tFMADDS\t"},
|
2017-03-13 14:39:17 -04:00
|
|
|
},
|
|
|
|
|
{
|
2017-08-28 09:55:18 -07:00
|
|
|
fn: `
|
2017-03-13 14:39:17 -04:00
|
|
|
func f3(x, y, z float32) float32 {
|
|
|
|
|
return x * y - z
|
|
|
|
|
}
|
|
|
|
|
`,
|
2017-08-28 09:55:18 -07:00
|
|
|
pos: []string{"\tFMSUBS\t"},
|
2017-03-13 14:39:17 -04:00
|
|
|
},
|
2017-04-18 17:05:31 -04:00
|
|
|
	{
		fn: `
		func f4(x uint32) uint32 {
			return x<<7 | x>>25
		}
		`,
		pos: []string{"\tROTLW\t"},
	},
	{
		fn: `
		func f5(x uint32) uint32 {
			return x<<7 + x>>25
		}
		`,
		pos: []string{"\tROTLW\t"},
	},
	{
		fn: `
		func f6(x uint32) uint32 {
			return x<<7 ^ x>>25
		}
		`,
		pos: []string{"\tROTLW\t"},
	},
	{
		fn: `
		func f7(x uint64) uint64 {
			return x<<7 | x>>57
		}
		`,
		pos: []string{"\tROTL\t"},
	},
	{
		fn: `
		func f8(x uint64) uint64 {
			return x<<7 + x>>57
		}
		`,
		pos: []string{"\tROTL\t"},
	},
	{
		fn: `
		func f9(x uint64) uint64 {
			return x<<7 ^ x>>57
		}
		`,
		pos: []string{"\tROTL\t"},
	},
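	// math/bits rotations with constant counts should lower to the same
	// single rotate instructions.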
	{
		fn: `
		func f10(a uint32) uint32 {
			return bits.RotateLeft32(a, 9)
		}
		`,
		pos: []string{"\tROTLW\t"},
	},
	{
		fn: `
		func f11(a uint64) uint64 {
			return bits.RotateLeft64(a, 37)
		}
		`,
		pos: []string{"\tROTL\t"},
	},
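	// math.Copysign and math.Abs should compile to the FCPSGN and FABS
	// instructions.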
	{
		fn: `
		func f12(a, b float64) float64 {
			return math.Copysign(a, b)
		}
		`,
		pos: []string{"\tFCPSGN\t"},
	},
	{
		fn: `
		func f13(a float64) float64 {
			return math.Abs(a)
		}
		`,
		pos: []string{"\tFABS\t"},
	},
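	// ppc64le is little-endian, so encoding/binary little-endian loads and
	// stores should become single wide memory operations (MOVHZ/MOVWZ/MOVD
	// and MOVH/MOVW/MOVD) rather than byte-by-byte sequences.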
	{
		fn: `
		func f14(b []byte) uint16 {
			return binary.LittleEndian.Uint16(b)
		}
		`,
		pos: []string{"\tMOVHZ\t"},
	},
	{
		fn: `
		func f15(b []byte) uint32 {
			return binary.LittleEndian.Uint32(b)
		}
		`,
		pos: []string{"\tMOVWZ\t"},
	},
	{
		fn: `
		func f16(b []byte) uint64 {
			return binary.LittleEndian.Uint64(b)
		}
		`,
		pos: []string{"\tMOVD\t"},
		neg: []string{"MOVBZ", "MOVHZ", "MOVWZ"},
	},
	{
		fn: `
		func f17(b []byte, v uint16) {
			binary.LittleEndian.PutUint16(b, v)
		}
		`,
		pos: []string{"\tMOVH\t"},
	},
	{
		fn: `
		func f18(b []byte, v uint32) {
			binary.LittleEndian.PutUint32(b, v)
		}
		`,
		pos: []string{"\tMOVW\t"},
	},
	{
		fn: `
		func f19(b []byte, v uint64) {
			binary.LittleEndian.PutUint64(b, v)
		}
		`,
		pos: []string{"\tMOVD\t"},
		neg: []string{"MOVB", "MOVH", "MOVW"},
	},
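	// In the frameless function below, the TEXT directive should report a
	// frame size of 0 ([$]0-8): no stack slot is left for x, and 8 is the
	// size of the int result.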
	{
		// check that stack store is optimized away
		fn: `
		func $() int {
			var x int
			return *(&x)
		}
		`,
		pos: []string{"TEXT\t.*, [$]0-8"},
	},
	// Constant propagation through raw bits conversions.
	{
		// uint32 constant converted to float32 constant
		fn: `
		func $(x float32) float32 {
			if x > math.Float32frombits(0x3f800000) {
				return -x
			}
			return x
		}
		`,
		pos: []string{"\tFMOVS\t[$]f32.3f800000\\(SB\\)"},
	},
	{
		// float32 constant converted to uint32 constant
		fn: `
		func $(x uint32) uint32 {
			if x > math.Float32bits(1) {
				return -x
			}
			return x
		}
		`,
		neg: []string{"\tFMOVS\t"},
	},
}

var plan9AMD64Tests = []*asmTest{
	// We should make sure that the compiler doesn't generate floating point
	// instructions for non-float operations on Plan 9, because floating point
	// operations are not allowed in the note handler.

	// Array zeroing.
	{
		fn: `
		func $() [16]byte {
			var a [16]byte
			return a
		}
		`,
		pos: []string{"\tMOVQ\t\\$0, \"\""},
	},
	// Array copy.
	{
		fn: `
		func $(a [16]byte) (b [16]byte) {
			b = a
			return
		}
		`,
		pos: []string{"\tMOVQ\t\"\"\\.a\\+[0-9]+\\(SP\\), (AX|CX)", "\tMOVQ\t(AX|CX), \"\"\\.b\\+[0-9]+\\(SP\\)"},
	},
}

// TestLineNumber checks that the generated assembly includes line numbers.
// See issue #16214.
func TestLineNumber(t *testing.T) {
	testenv.MustHaveGoBuild(t)
	dir, err := ioutil.TempDir("", "TestLineNumber")
	if err != nil {
		t.Fatalf("could not create directory: %v", err)
	}
	defer os.RemoveAll(dir)

	src := filepath.Join(dir, "x.go")
	err = ioutil.WriteFile(src, []byte(issue16214src), 0644)
	if err != nil {
		t.Fatalf("could not write file: %v", err)
	}

	cmd := exec.Command(testenv.GoToolPath(t), "tool", "compile", "-S", "-o", filepath.Join(dir, "out.o"), src)
	out, err := cmd.CombinedOutput()
	if err != nil {
		t.Fatalf("failed to run go tool compile: %v", err)
	}

	if strings.Contains(string(out), "unknown line number") {
		t.Errorf("line number missing in assembly:\n%s", out)
	}
}

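// The constant 2863311531 (0xAAAAAAAB) mentioned below is the fixed-point
// reciprocal the compiler uses to rewrite x % 3 as a multiplication.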
var issue16214src = `
package main

func Mod32(x uint32) uint32 {
	return x % 3 // frontend rewrites it as HMUL with 2863311531, the LITERAL node has unknown Pos
}
`