go/src/cmd/internal/obj/arm/obj5.go

896 lines
21 KiB
Go
Raw Normal View History

// Derived from Inferno utils/5c/swt.c
// https://bitbucket.org/inferno-os/inferno-os/src/default/utils/5c/swt.c
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
// Portions Copyright © 1997-1999 Vita Nuova Limited
// Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
// Portions Copyright © 2004,2006 Bruce Ellis
// Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
// Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
// Portions Copyright © 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
package arm
import (
"cmd/internal/obj"
"cmd/internal/sys"
"fmt"
"math"
)
var progedit_tlsfallback *obj.LSym
func progedit(ctxt *obj.Link, p *obj.Prog) {
p.From.Class = 0
p.To.Class = 0
// Rewrite B/BL to symbol as TYPE_BRANCH.
switch p.As {
case AB,
ABL,
obj.ADUFFZERO,
obj.ADUFFCOPY:
if p.To.Type == obj.TYPE_MEM && (p.To.Name == obj.NAME_EXTERN || p.To.Name == obj.NAME_STATIC) && p.To.Sym != nil {
p.To.Type = obj.TYPE_BRANCH
}
}
// Replace TLS register fetches on older ARM processors.
switch p.As {
// Treat MRC 15, 0, <reg>, C13, C0, 3 specially.
case AMRC:
if p.To.Offset&0xffff0fff == 0xee1d0f70 {
// Because the instruction might be rewritten to a BL which returns in R0
// the register must be zero.
if p.To.Offset&0xf000 != 0 {
ctxt.Diag("%v: TLS MRC instruction must write to R0 as it might get translated into a BL instruction", p.Line())
}
if obj.GOARM < 7 {
// Replace it with BL runtime.read_tls_fallback(SB) for ARM CPUs that lack the tls extension.
if progedit_tlsfallback == nil {
progedit_tlsfallback = obj.Linklookup(ctxt, "runtime.read_tls_fallback", 0)
}
// MOVW LR, R11
p.As = AMOVW
p.From.Type = obj.TYPE_REG
p.From.Reg = REGLINK
p.To.Type = obj.TYPE_REG
p.To.Reg = REGTMP
// BL runtime.read_tls_fallback(SB)
p = obj.Appendp(ctxt, p)
p.As = ABL
p.To.Type = obj.TYPE_BRANCH
p.To.Sym = progedit_tlsfallback
p.To.Offset = 0
// MOVW R11, LR
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.From.Type = obj.TYPE_REG
p.From.Reg = REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = REGLINK
break
}
}
// Otherwise, MRC/MCR instructions need no further treatment.
p.As = AWORD
}
// Rewrite float constants to values stored in memory.
switch p.As {
case AMOVF:
if p.From.Type == obj.TYPE_FCONST && chipfloat5(ctxt, p.From.Val.(float64)) < 0 && (chipzero5(ctxt, p.From.Val.(float64)) < 0 || p.Scond&C_SCOND != C_SCOND_NONE) {
f32 := float32(p.From.Val.(float64))
i32 := math.Float32bits(f32)
literal := fmt.Sprintf("$f32.%08x", i32)
s := obj.Linklookup(ctxt, literal, 0)
p.From.Type = obj.TYPE_MEM
p.From.Sym = s
p.From.Name = obj.NAME_EXTERN
p.From.Offset = 0
}
case AMOVD:
if p.From.Type == obj.TYPE_FCONST && chipfloat5(ctxt, p.From.Val.(float64)) < 0 && (chipzero5(ctxt, p.From.Val.(float64)) < 0 || p.Scond&C_SCOND != C_SCOND_NONE) {
i64 := math.Float64bits(p.From.Val.(float64))
literal := fmt.Sprintf("$f64.%016x", i64)
s := obj.Linklookup(ctxt, literal, 0)
p.From.Type = obj.TYPE_MEM
p.From.Sym = s
p.From.Name = obj.NAME_EXTERN
p.From.Offset = 0
}
}
if ctxt.Flag_dynlink {
rewriteToUseGot(ctxt, p)
}
}
// Rewrite p, if necessary, to access global data via the global offset table.
func rewriteToUseGot(ctxt *obj.Link, p *obj.Prog) {
if p.As == obj.ADUFFCOPY || p.As == obj.ADUFFZERO {
// ADUFFxxx $offset
// becomes
// MOVW runtime.duffxxx@GOT, R9
// ADD $offset, R9
// CALL (R9)
var sym *obj.LSym
if p.As == obj.ADUFFZERO {
sym = obj.Linklookup(ctxt, "runtime.duffzero", 0)
} else {
sym = obj.Linklookup(ctxt, "runtime.duffcopy", 0)
}
offset := p.To.Offset
p.As = AMOVW
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_GOTREF
p.From.Sym = sym
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R9
p.To.Name = obj.NAME_NONE
p.To.Offset = 0
p.To.Sym = nil
p1 := obj.Appendp(ctxt, p)
p1.As = AADD
p1.From.Type = obj.TYPE_CONST
p1.From.Offset = offset
p1.To.Type = obj.TYPE_REG
p1.To.Reg = REG_R9
p2 := obj.Appendp(ctxt, p1)
p2.As = obj.ACALL
p2.To.Type = obj.TYPE_MEM
p2.To.Reg = REG_R9
return
}
// We only care about global data: NAME_EXTERN means a global
// symbol in the Go sense, and p.Sym.Local is true for a few
// internally defined symbols.
if p.From.Type == obj.TYPE_ADDR && p.From.Name == obj.NAME_EXTERN && !p.From.Sym.Local() {
// MOVW $sym, Rx becomes MOVW sym@GOT, Rx
// MOVW $sym+<off>, Rx becomes MOVW sym@GOT, Rx; ADD <off>, Rx
if p.As != AMOVW {
ctxt.Diag("do not know how to handle TYPE_ADDR in %v with -dynlink", p)
}
if p.To.Type != obj.TYPE_REG {
ctxt.Diag("do not know how to handle LEAQ-type insn to non-register in %v with -dynlink", p)
}
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_GOTREF
if p.From.Offset != 0 {
q := obj.Appendp(ctxt, p)
q.As = AADD
q.From.Type = obj.TYPE_CONST
q.From.Offset = p.From.Offset
q.To = p.To
p.From.Offset = 0
}
}
if p.From3 != nil && p.From3.Name == obj.NAME_EXTERN {
ctxt.Diag("don't know how to handle %v with -dynlink", p)
}
var source *obj.Addr
// MOVx sym, Ry becomes MOVW sym@GOT, R9; MOVx (R9), Ry
// MOVx Ry, sym becomes MOVW sym@GOT, R9; MOVx Ry, (R9)
// An addition may be inserted between the two MOVs if there is an offset.
if p.From.Name == obj.NAME_EXTERN && !p.From.Sym.Local() {
if p.To.Name == obj.NAME_EXTERN && !p.To.Sym.Local() {
ctxt.Diag("cannot handle NAME_EXTERN on both sides in %v with -dynlink", p)
}
source = &p.From
} else if p.To.Name == obj.NAME_EXTERN && !p.To.Sym.Local() {
source = &p.To
} else {
return
}
if p.As == obj.ATEXT || p.As == obj.AFUNCDATA || p.As == obj.ACALL || p.As == obj.ARET || p.As == obj.AJMP {
return
}
if source.Sym.Type == obj.STLSBSS {
return
}
if source.Type != obj.TYPE_MEM {
ctxt.Diag("don't know how to handle %v with -dynlink", p)
}
p1 := obj.Appendp(ctxt, p)
p2 := obj.Appendp(ctxt, p1)
p1.As = AMOVW
p1.From.Type = obj.TYPE_MEM
p1.From.Sym = source.Sym
p1.From.Name = obj.NAME_GOTREF
p1.To.Type = obj.TYPE_REG
p1.To.Reg = REG_R9
p2.As = p.As
p2.From = p.From
p2.To = p.To
if p.From.Name == obj.NAME_EXTERN {
p2.From.Reg = REG_R9
p2.From.Name = obj.NAME_NONE
p2.From.Sym = nil
} else if p.To.Name == obj.NAME_EXTERN {
p2.To.Reg = REG_R9
p2.To.Name = obj.NAME_NONE
p2.To.Sym = nil
} else {
return
}
obj.Nopout(p)
}
// Prog.mark
const (
FOLL = 1 << 0
LABEL = 1 << 1
LEAF = 1 << 2
)
func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
autosize := int32(0)
ctxt.Cursym = cursym
if cursym.Text == nil || cursym.Text.Link == nil {
return
}
softfloat(ctxt, cursym)
p := cursym.Text
autoffset := int32(p.To.Offset)
if autoffset < 0 {
autoffset = 0
}
cursym.Locals = autoffset
cursym.Args = p.To.Val.(int32)
/*
* find leaf subroutines
* strip NOPs
* expand RET
* expand BECOME pseudo
*/
var q1 *obj.Prog
var q *obj.Prog
for p := cursym.Text; p != nil; p = p.Link {
switch p.As {
case obj.ATEXT:
p.Mark |= LEAF
case obj.ARET:
break
case ADIV, ADIVU, AMOD, AMODU:
q = p
if ctxt.Sym_div == nil {
initdiv(ctxt)
}
cursym.Text.Mark &^= LEAF
continue
case obj.ANOP:
q1 = p.Link
q.Link = q1 /* q is non-nop */
if q1 != nil {
q1.Mark |= p.Mark
}
continue
case ABL,
ABX,
obj.ADUFFZERO,
obj.ADUFFCOPY:
cursym.Text.Mark &^= LEAF
fallthrough
case AB,
ABEQ,
ABNE,
ABCS,
ABHS,
ABCC,
ABLO,
ABMI,
ABPL,
ABVS,
ABVC,
ABHI,
ABLS,
ABGE,
ABLT,
ABGT,
ABLE:
q1 = p.Pcond
if q1 != nil {
for q1.As == obj.ANOP {
q1 = q1.Link
p.Pcond = q1
}
}
}
q = p
}
var q2 *obj.Prog
for p := cursym.Text; p != nil; p = p.Link {
o := p.As
switch o {
case obj.ATEXT:
autosize = int32(p.To.Offset + 4)
if autosize <= 4 {
if cursym.Text.Mark&LEAF != 0 {
p.To.Offset = -4
autosize = 0
}
}
if autosize == 0 && cursym.Text.Mark&LEAF == 0 {
if ctxt.Debugvlog {
ctxt.Logf("save suppressed in: %s\n", cursym.Name)
}
cursym.Text.Mark |= LEAF
}
if cursym.Text.Mark&LEAF != 0 {
cursym.Set(obj.AttrLeaf, true)
if autosize == 0 {
break
}
}
if p.From3.Offset&obj.NOSPLIT == 0 {
p = stacksplit(ctxt, p, autosize) // emit split check
}
// MOVW.W R14,$-autosize(SP)
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.Scond |= C_WBIT
p.From.Type = obj.TYPE_REG
p.From.Reg = REGLINK
p.To.Type = obj.TYPE_MEM
p.To.Offset = int64(-autosize)
p.To.Reg = REGSP
p.Spadj = autosize
if cursym.Text.From3.Offset&obj.WRAPPER != 0 {
// if(g->panic != nil && g->panic->argp == FP) g->panic->argp = bottom-of-frame
//
// MOVW g_panic(g), R1
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// CMP $0, R1
// B.NE checkargp
// end:
// NOP
// ... function ...
// checkargp:
// MOVW panic_argp(R1), R2
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// ADD $(autosize+4), R13, R3
// CMP R2, R3
// B.NE end
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// ADD $4, R13, R4
// MOVW R4, panic_argp(R1)
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// B end
//
// The NOP is needed to give the jumps somewhere to land.
// It is a liblink NOP, not an ARM NOP: it encodes to 0 instruction bytes.
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.From.Type = obj.TYPE_MEM
p.From.Reg = REGG
p.From.Offset = 4 * int64(ctxt.Arch.PtrSize) // G.panic
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R1
p = obj.Appendp(ctxt, p)
p.As = ACMP
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0
p.Reg = REG_R1
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// B.NE checkargp
bne := obj.Appendp(ctxt, p)
bne.As = ABNE
bne.To.Type = obj.TYPE_BRANCH
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// end: NOP
end := obj.Appendp(ctxt, bne)
end.As = obj.ANOP
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// find end of function
var last *obj.Prog
for last = end; last.Link != nil; last = last.Link {
}
// MOVW panic_argp(R1), R2
mov := obj.Appendp(ctxt, last)
mov.As = AMOVW
mov.From.Type = obj.TYPE_MEM
mov.From.Reg = REG_R1
mov.From.Offset = 0 // Panic.argp
mov.To.Type = obj.TYPE_REG
mov.To.Reg = REG_R2
// B.NE branch target is MOVW above
bne.Pcond = mov
// ADD $(autosize+4), R13, R3
p = obj.Appendp(ctxt, mov)
p.As = AADD
p.From.Type = obj.TYPE_CONST
p.From.Offset = int64(autosize) + 4
p.Reg = REG_R13
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R3
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// CMP R2, R3
p = obj.Appendp(ctxt, p)
p.As = ACMP
p.From.Type = obj.TYPE_REG
p.From.Reg = REG_R2
p.Reg = REG_R3
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// B.NE end
p = obj.Appendp(ctxt, p)
p.As = ABNE
p.To.Type = obj.TYPE_BRANCH
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
p.Pcond = end
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// ADD $4, R13, R4
p = obj.Appendp(ctxt, p)
p.As = AADD
p.From.Type = obj.TYPE_CONST
p.From.Offset = 4
p.Reg = REG_R13
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R4
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// MOVW R4, panic_argp(R1)
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.From.Type = obj.TYPE_REG
p.From.Reg = REG_R4
p.To.Type = obj.TYPE_MEM
p.To.Reg = REG_R1
p.To.Offset = 0 // Panic.argp
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// B end
p = obj.Appendp(ctxt, p)
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
p.As = AB
p.To.Type = obj.TYPE_BRANCH
p.Pcond = end
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// reset for subsequent passes
p = end
}
case obj.ARET:
nocache(p)
if cursym.Text.Mark&LEAF != 0 {
if autosize == 0 {
p.As = AB
p.From = obj.Addr{}
if p.To.Sym != nil { // retjmp
p.To.Type = obj.TYPE_BRANCH
} else {
p.To.Type = obj.TYPE_MEM
p.To.Offset = 0
p.To.Reg = REGLINK
}
break
}
}
p.As = AMOVW
p.Scond |= C_PBIT
p.From.Type = obj.TYPE_MEM
p.From.Offset = int64(autosize)
p.From.Reg = REGSP
p.To.Type = obj.TYPE_REG
p.To.Reg = REGPC
// If there are instructions following
// this ARET, they come from a branch
// with the same stackframe, so no spadj.
if p.To.Sym != nil { // retjmp
p.To.Reg = REGLINK
q2 = obj.Appendp(ctxt, p)
q2.As = AB
q2.To.Type = obj.TYPE_BRANCH
q2.To.Sym = p.To.Sym
p.To.Sym = nil
p = q2
}
case AADD:
if p.From.Type == obj.TYPE_CONST && p.From.Reg == 0 && p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP {
p.Spadj = int32(-p.From.Offset)
}
case ASUB:
if p.From.Type == obj.TYPE_CONST && p.From.Reg == 0 && p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP {
p.Spadj = int32(p.From.Offset)
}
case ADIV, ADIVU, AMOD, AMODU:
if cursym.Text.From3.Offset&obj.NOSPLIT != 0 {
ctxt.Diag("cannot divide in NOSPLIT function")
}
if ctxt.Debugdivmod {
break
}
if p.From.Type != obj.TYPE_REG {
break
}
if p.To.Type != obj.TYPE_REG {
break
}
// Make copy because we overwrite p below.
q1 := *p
if q1.Reg == REGTMP || q1.Reg == 0 && q1.To.Reg == REGTMP {
ctxt.Diag("div already using REGTMP: %v", p)
}
/* MOV m(g),REGTMP */
p.As = AMOVW
p.Pos = q1.Pos
p.From.Type = obj.TYPE_MEM
p.From.Reg = REGG
p.From.Offset = 6 * 4 // offset of g.m
p.Reg = 0
p.To.Type = obj.TYPE_REG
p.To.Reg = REGTMP
/* MOV a,m_divmod(REGTMP) */
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.Pos = q1.Pos
p.From.Type = obj.TYPE_REG
p.From.Reg = q1.From.Reg
p.To.Type = obj.TYPE_MEM
p.To.Reg = REGTMP
p.To.Offset = 8 * 4 // offset of m.divmod
cmd/link: insert trampolines for too-far jumps on ARM ARM direct CALL/JMP instruction has 24 bit offset, which can only encodes jumps within +/-32M. When the target is too far, the top bits get truncated and the program jumps wild. This CL detects too-far jumps and automatically insert trampolines, currently only internal linking on ARM. It is necessary to make the following changes to the linker: - Resolve direct jump relocs when assigning addresses to functions. this allows trampoline insertion without moving all code that already laid down. - Lay down packages in dependency order, so that when resolving a inter-package direct jump reloc, the target address is already known. Intra-package jumps are assumed never too far. - a linker flag -debugtramp is added for debugging trampolines: "-debugtramp=1 -v" prints trampoline debug message "-debugtramp=2" forces all inter-package jump to use trampolines (currently ARM only) "-debugtramp=2 -v" does both - Some data structures are changed for bookkeeping. On ARM, pseudo DIV/DIVU/MOD/MODU instructions now clobber R8 (unfortunate). In the standard library there is no ARM assembly code that uses these instructions, and the compiler no longer emits them (CL 29390). all.bash passes with -debugtramp=2, except a disassembly test (this is unavoidable as we changed the instruction). TBD: debug info of trampolines? Fixes #17028. Change-Id: Idcce347ea7e0af77c4079041a160b2f6e114b474 Reviewed-on: https://go-review.googlesource.com/29397 Reviewed-by: David Crawshaw <crawshaw@golang.org> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-09-14 14:47:12 -04:00
/* MOV b, R8 */
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.Pos = q1.Pos
p.From.Type = obj.TYPE_REG
p.From.Reg = q1.Reg
if q1.Reg == 0 {
p.From.Reg = q1.To.Reg
}
p.To.Type = obj.TYPE_REG
cmd/link: insert trampolines for too-far jumps on ARM ARM direct CALL/JMP instruction has 24 bit offset, which can only encodes jumps within +/-32M. When the target is too far, the top bits get truncated and the program jumps wild. This CL detects too-far jumps and automatically insert trampolines, currently only internal linking on ARM. It is necessary to make the following changes to the linker: - Resolve direct jump relocs when assigning addresses to functions. this allows trampoline insertion without moving all code that already laid down. - Lay down packages in dependency order, so that when resolving a inter-package direct jump reloc, the target address is already known. Intra-package jumps are assumed never too far. - a linker flag -debugtramp is added for debugging trampolines: "-debugtramp=1 -v" prints trampoline debug message "-debugtramp=2" forces all inter-package jump to use trampolines (currently ARM only) "-debugtramp=2 -v" does both - Some data structures are changed for bookkeeping. On ARM, pseudo DIV/DIVU/MOD/MODU instructions now clobber R8 (unfortunate). In the standard library there is no ARM assembly code that uses these instructions, and the compiler no longer emits them (CL 29390). all.bash passes with -debugtramp=2, except a disassembly test (this is unavoidable as we changed the instruction). TBD: debug info of trampolines? Fixes #17028. Change-Id: Idcce347ea7e0af77c4079041a160b2f6e114b474 Reviewed-on: https://go-review.googlesource.com/29397 Reviewed-by: David Crawshaw <crawshaw@golang.org> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-09-14 14:47:12 -04:00
p.To.Reg = REG_R8
p.To.Offset = 0
/* CALL appropriate */
p = obj.Appendp(ctxt, p)
p.As = ABL
p.Pos = q1.Pos
p.To.Type = obj.TYPE_BRANCH
switch o {
case ADIV:
p.To.Sym = ctxt.Sym_div
case ADIVU:
p.To.Sym = ctxt.Sym_divu
case AMOD:
p.To.Sym = ctxt.Sym_mod
case AMODU:
p.To.Sym = ctxt.Sym_modu
}
/* MOV REGTMP, b */
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.Pos = q1.Pos
p.From.Type = obj.TYPE_REG
p.From.Reg = REGTMP
p.From.Offset = 0
p.To.Type = obj.TYPE_REG
p.To.Reg = q1.To.Reg
case AMOVW:
if (p.Scond&C_WBIT != 0) && p.To.Type == obj.TYPE_MEM && p.To.Reg == REGSP {
p.Spadj = int32(-p.To.Offset)
}
if (p.Scond&C_PBIT != 0) && p.From.Type == obj.TYPE_MEM && p.From.Reg == REGSP && p.To.Reg != REGPC {
p.Spadj = int32(-p.From.Offset)
}
if p.From.Type == obj.TYPE_ADDR && p.From.Reg == REGSP && p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP {
p.Spadj = int32(-p.From.Offset)
}
}
}
}
func isfloatreg(a *obj.Addr) bool {
return a.Type == obj.TYPE_REG && REG_F0 <= a.Reg && a.Reg <= REG_F15
}
func softfloat(ctxt *obj.Link, cursym *obj.LSym) {
if obj.GOARM > 5 {
return
}
symsfloat := obj.Linklookup(ctxt, "_sfloat", 0)
wasfloat := 0
for p := cursym.Text; p != nil; p = p.Link {
if p.Pcond != nil {
p.Pcond.Mark |= LABEL
}
}
var next *obj.Prog
for p := cursym.Text; p != nil; p = p.Link {
switch p.As {
case AMOVW:
if isfloatreg(&p.To) || isfloatreg(&p.From) {
goto soft
}
goto notsoft
case AMOVWD,
AMOVWF,
AMOVDW,
AMOVFW,
AMOVFD,
AMOVDF,
AMOVF,
AMOVD,
ACMPF,
ACMPD,
AADDF,
AADDD,
ASUBF,
ASUBD,
AMULF,
AMULD,
ADIVF,
ADIVD,
ASQRTF,
ASQRTD,
AABSF,
AABSD,
ANEGF,
ANEGD:
goto soft
default:
goto notsoft
}
soft:
if wasfloat == 0 || (p.Mark&LABEL != 0) {
next = ctxt.NewProg()
*next = *p
// BL _sfloat(SB)
*p = obj.Prog{}
p.Ctxt = ctxt
p.Link = next
p.As = ABL
p.To.Type = obj.TYPE_BRANCH
p.To.Sym = symsfloat
p.Pos = next.Pos
p = next
wasfloat = 1
}
continue
notsoft:
wasfloat = 0
}
}
func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// MOVW g_stackguard(g), R1
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.From.Type = obj.TYPE_MEM
p.From.Reg = REGG
p.From.Offset = 2 * int64(ctxt.Arch.PtrSize) // G.stackguard0
if ctxt.Cursym.CFunc() {
p.From.Offset = 3 * int64(ctxt.Arch.PtrSize) // G.stackguard1
}
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R1
if framesize <= obj.StackSmall {
// small stack: SP < stackguard
// CMP stackguard, SP
p = obj.Appendp(ctxt, p)
p.As = ACMP
p.From.Type = obj.TYPE_REG
p.From.Reg = REG_R1
p.Reg = REGSP
} else if framesize <= obj.StackBig {
// large stack: SP-framesize < stackguard-StackSmall
// MOVW $-framesize(SP), R2
// CMP stackguard, R2
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.From.Type = obj.TYPE_ADDR
p.From.Reg = REGSP
p.From.Offset = int64(-framesize)
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R2
p = obj.Appendp(ctxt, p)
p.As = ACMP
p.From.Type = obj.TYPE_REG
p.From.Reg = REG_R1
p.Reg = REG_R2
} else {
// Such a large stack we need to protect against wraparound
// if SP is close to zero.
// SP-stackguard+StackGuard < framesize + (StackGuard-StackSmall)
// The +StackGuard on both sides is required to keep the left side positive:
// SP is allowed to be slightly below stackguard. See stack.h.
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// CMP $StackPreempt, R1
// MOVW.NE $StackGuard(SP), R2
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// SUB.NE R1, R2
// MOVW.NE $(framesize+(StackGuard-StackSmall)), R3
cmd/internal/obj/arm: improve static branch prediction for wrapper prologue This is a follow-up to CL 36893. Move the unlikely branch in the wrapper prologue to the end of the function, where it has minimal impact on the instruction cache. Static branch prediction is also less likely to choose a forward branch. Updates #19042 sort benchmarks: name old time/op new time/op delta SearchWrappers-4 1.44µs ± 0% 1.45µs ± 0% +1.15% (p=0.000 n=9+10) SortString1K-4 1.02ms ± 0% 1.04ms ± 0% +2.39% (p=0.000 n=10+10) SortString1K_Slice-4 960µs ± 0% 989µs ± 0% +2.95% (p=0.000 n=9+10) StableString1K-4 218µs ± 0% 213µs ± 0% -2.13% (p=0.000 n=10+10) SortInt1K-4 541µs ± 0% 543µs ± 0% +0.30% (p=0.003 n=9+10) StableInt1K-4 760µs ± 1% 763µs ± 1% +0.38% (p=0.011 n=10+10) StableInt1K_Slice-4 840µs ± 1% 779µs ± 0% -7.31% (p=0.000 n=9+10) SortInt64K-4 55.2ms ± 0% 55.4ms ± 1% +0.34% (p=0.012 n=10+8) SortInt64K_Slice-4 56.2ms ± 0% 55.6ms ± 1% -1.16% (p=0.000 n=10+10) StableInt64K-4 70.9ms ± 1% 71.0ms ± 0% ~ (p=0.315 n=10+7) Sort1e2-4 250µs ± 0% 249µs ± 1% ~ (p=0.315 n=9+10) Stable1e2-4 600µs ± 0% 594µs ± 0% -1.09% (p=0.000 n=9+10) Sort1e4-4 51.2ms ± 0% 51.4ms ± 1% +0.40% (p=0.001 n=9+10) Stable1e4-4 204ms ± 1% 199ms ± 1% -2.27% (p=0.000 n=10+10) Sort1e6-4 8.42s ± 0% 8.44s ± 0% +0.28% (p=0.000 n=8+9) Stable1e6-4 43.3s ± 0% 42.5s ± 1% -1.89% (p=0.000 n=9+9) Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e Reviewed-on: https://go-review.googlesource.com/37611 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-02-28 14:35:37 -08:00
// CMP.NE R3, R2
p = obj.Appendp(ctxt, p)
p.As = ACMP
p.From.Type = obj.TYPE_CONST
p.From.Offset = int64(uint32(obj.StackPreempt & (1<<32 - 1)))
p.Reg = REG_R1
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.From.Type = obj.TYPE_ADDR
p.From.Reg = REGSP
p.From.Offset = obj.StackGuard
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R2
p.Scond = C_SCOND_NE
p = obj.Appendp(ctxt, p)
p.As = ASUB
p.From.Type = obj.TYPE_REG
p.From.Reg = REG_R1
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R2
p.Scond = C_SCOND_NE
p = obj.Appendp(ctxt, p)
p.As = AMOVW
p.From.Type = obj.TYPE_ADDR
p.From.Offset = int64(framesize) + (obj.StackGuard - obj.StackSmall)
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R3
p.Scond = C_SCOND_NE
p = obj.Appendp(ctxt, p)
p.As = ACMP
p.From.Type = obj.TYPE_REG
p.From.Reg = REG_R3
p.Reg = REG_R2
p.Scond = C_SCOND_NE
}
runtime, cmd/internal/obj/arm: improve arm function prologue When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-28 15:18:35 -07:00
// BLS call-to-morestack
bls := obj.Appendp(ctxt, p)
bls.As = ABLS
bls.To.Type = obj.TYPE_BRANCH
runtime, cmd/internal/obj/arm: improve arm function prologue When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-28 15:18:35 -07:00
var last *obj.Prog
for last = ctxt.Cursym.Text; last.Link != nil; last = last.Link {
}
// Now we are at the end of the function, but logically
// we are still in function prologue. We need to fix the
// SP data and PCDATA.
spfix := obj.Appendp(ctxt, last)
spfix.As = obj.ANOP
spfix.Spadj = -framesize
pcdata := obj.Appendp(ctxt, spfix)
pcdata.Pos = ctxt.Cursym.Text.Pos
pcdata.As = obj.APCDATA
pcdata.From.Type = obj.TYPE_CONST
pcdata.From.Offset = obj.PCDATA_StackMapIndex
pcdata.To.Type = obj.TYPE_CONST
pcdata.To.Offset = -1 // pcdata starts at -1 at function entry
runtime, cmd/internal/obj/arm: improve arm function prologue When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-28 15:18:35 -07:00
// MOVW LR, R3
movw := obj.Appendp(ctxt, pcdata)
runtime, cmd/internal/obj/arm: improve arm function prologue When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-28 15:18:35 -07:00
movw.As = AMOVW
movw.From.Type = obj.TYPE_REG
movw.From.Reg = REGLINK
movw.To.Type = obj.TYPE_REG
movw.To.Reg = REG_R3
bls.Pcond = movw
// BL runtime.morestack
call := obj.Appendp(ctxt, movw)
call.As = obj.ACALL
call.To.Type = obj.TYPE_BRANCH
morestack := "runtime.morestack"
switch {
case ctxt.Cursym.CFunc():
runtime, cmd/internal/obj/arm: improve arm function prologue When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-28 15:18:35 -07:00
morestack = "runtime.morestackc"
case ctxt.Cursym.Text.From3.Offset&obj.NEEDCTXT == 0:
morestack = "runtime.morestack_noctxt"
}
call.To.Sym = obj.Linklookup(ctxt, morestack, 0)
runtime, cmd/internal/obj/arm: improve arm function prologue When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-28 15:18:35 -07:00
// B start
b := obj.Appendp(ctxt, call)
b.As = obj.AJMP
b.To.Type = obj.TYPE_BRANCH
b.Pcond = ctxt.Cursym.Text.Link
b.Spadj = +framesize
runtime, cmd/internal/obj/arm: improve arm function prologue When stack growth is not needed, as it usually is not, execute only a single conditional branch rather than three conditional instructions. This adds 4 bytes to every function, but might speed up execution in the common case. Sample disassembly for func f() { _ = [128]byte{} } Before: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb028 MOVW 0x28(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 91a0300e MOVW.LS R14, R3 x.go:3 0x2014 9b0118a9 BL.LS runtime.morestack_noctxt(SB) x.go:3 0x2018 9afffff8 B.LS main.f(SB) x.go:3 0x201c e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2020 e28d1004 ADD $4, R13, R1 x.go:4 0x2024 e3a00000 MOVW $0, R0 x.go:4 0x2028 eb012255 BL 0x4a984 x.go:5 0x202c e49df084 RET #132 x.go:5 0x2030 eafffffe B 0x2030 x.go:5 0x2034 ffffff7c ? After: TEXT main.f(SB) x.go x.go:3 0x2000 e59a1008 MOVW 0x8(R10), R1 x.go:3 0x2004 e59fb02c MOVW 0x2c(R15), R11 x.go:3 0x2008 e08d200b ADD R11, R13, R2 x.go:3 0x200c e1520001 CMP R1, R2 x.go:3 0x2010 9a000004 B.LS 0x2028 x.go:3 0x2014 e52de084 MOVW.W R14, -0x84(R13) x.go:4 0x2018 e28d1004 ADD $4, R13, R1 x.go:4 0x201c e3a00000 MOVW $0, R0 x.go:4 0x2020 eb0124dc BL 0x4b398 x.go:5 0x2024 e49df084 RET #132 x.go:5 0x2028 e1a0300e MOVW R14, R3 x.go:5 0x202c eb011b0d BL runtime.morestack_noctxt(SB) x.go:5 0x2030 eafffff2 B main.f(SB) x.go:5 0x2034 eafffffe B 0x2034 x.go:5 0x2038 ffffff7c ? Updates #10587. package sort benchmarks on an iPhone 6: name old time/op new time/op delta SortString1K 569µs ± 0% 565µs ± 1% -0.75% (p=0.000 n=23+24) StableString1K 872µs ± 1% 870µs ± 1% -0.16% (p=0.009 n=23+24) SortInt1K 317µs ± 2% 316µs ± 2% ~ (p=0.410 n=26+26) StableInt1K 343µs ± 1% 339µs ± 1% -1.07% (p=0.000 n=22+23) SortInt64K 30.0ms ± 1% 30.0ms ± 1% ~ (p=0.091 n=25+24) StableInt64K 30.2ms ± 0% 30.0ms ± 0% -0.69% (p=0.000 n=22+22) Sort1e2 147µs ± 1% 146µs ± 0% -0.48% (p=0.000 n=25+24) Stable1e2 290µs ± 1% 286µs ± 1% -1.30% (p=0.000 n=23+24) Sort1e4 29.5ms ± 2% 29.7ms ± 1% +0.71% (p=0.000 n=23+23) Stable1e4 88.7ms ± 4% 88.6ms ± 8% -0.07% (p=0.022 n=26+26) Sort1e6 4.81s ± 7% 4.83s ± 7% ~ (p=0.192 n=26+26) Stable1e6 18.3s ± 1% 18.1s ± 1% -0.76% (p=0.000 n=25+23) SearchWrappers 318ns ± 1% 344ns ± 1% +8.14% (p=0.000 n=23+26) package sort benchmarks on a first generation rpi: name old time/op new time/op delta SearchWrappers 4.13µs ± 0% 3.95µs ± 0% -4.42% (p=0.000 n=15+13) SortString1K 5.81ms ± 1% 5.82ms ± 2% ~ (p=0.400 n=14+15) StableString1K 9.69ms ± 1% 9.73ms ± 0% ~ (p=0.121 n=15+11) SortInt1K 3.30ms ± 2% 3.66ms ±19% +10.82% (p=0.000 n=15+14) StableInt1K 5.97ms ±15% 4.17ms ± 8% -30.05% (p=0.000 n=15+15) SortInt64K 319ms ± 1% 295ms ± 1% -7.65% (p=0.000 n=15+15) StableInt64K 343ms ± 0% 332ms ± 0% -3.26% (p=0.000 n=12+13) Sort1e2 3.36ms ± 2% 3.22ms ± 4% -4.10% (p=0.000 n=15+15) Stable1e2 6.74ms ± 1% 6.43ms ± 2% -4.67% (p=0.000 n=15+15) Sort1e4 247ms ± 1% 247ms ± 1% ~ (p=0.331 n=15+14) Stable1e4 864ms ± 0% 820ms ± 0% -5.15% (p=0.000 n=14+15) Sort1e6 41.2s ± 0% 41.2s ± 0% +0.15% (p=0.000 n=13+14) Stable1e6 192s ± 0% 182s ± 0% -5.07% (p=0.000 n=14+14) Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b Reviewed-on: https://go-review.googlesource.com/10497 Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-28 15:18:35 -07:00
return bls
}
func initdiv(ctxt *obj.Link) {
if ctxt.Sym_div != nil {
return
}
ctxt.Sym_div = obj.Linklookup(ctxt, "_div", 0)
ctxt.Sym_divu = obj.Linklookup(ctxt, "_divu", 0)
ctxt.Sym_mod = obj.Linklookup(ctxt, "_mod", 0)
ctxt.Sym_modu = obj.Linklookup(ctxt, "_modu", 0)
}
var unaryDst = map[obj.As]bool{
ASWI: true,
AWORD: true,
}
var Linkarm = obj.LinkArch{
Arch: sys.ArchARM,
Preprocess: preprocess,
Assemble: span5,
Progedit: progedit,
UnaryDst: unaryDst,
}