| 
									
										
										
										
											2018-03-02 15:16:27 +01:00
										 |  |  | // asmcheck | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Copyright 2018 The Go Authors. All rights reserved. | 
					
						
							|  |  |  | // Use of this source code is governed by a BSD-style | 
					
						
							|  |  |  | // license that can be found in the LICENSE file. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | package codegen | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import "math/bits" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-03-05 19:46:18 +01:00
										 |  |  | // ----------------------- // | 
					
						
							|  |  |  | //    bits.LeadingZeros    // | 
					
						
							|  |  |  | // ----------------------- // | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func LeadingZeros(n uint) int { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSRQ" | 
					
						
							|  |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-05 19:46:18 +01:00
										 |  |  | 	return bits.LeadingZeros(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func LeadingZeros64(n uint64) int { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSRQ" | 
					
						
							|  |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-05 19:46:18 +01:00
										 |  |  | 	return bits.LeadingZeros64(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func LeadingZeros32(n uint32) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize LeadingZeros(16|32) on amd64
Introduce Len8 and Len16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
Also use and optimize the Len32 lowering, along the same lines.
Leave Len8 unused for the moment; a subsequent CL will enable it.
For 16 and 32 bits, this leads to a speed-up.
name              old time/op  new time/op  delta
LeadingZeros16-8  1.42ns ± 5%  1.23ns ± 5%  -13.42%  (p=0.000 n=20+20)
LeadingZeros32-8  1.25ns ± 5%  1.03ns ± 5%  -17.63%  (p=0.000 n=20+16)
Code:
func f16(x uint16) { z = bits.LeadingZeros16(x) }
func f32(x uint32) { z = bits.LeadingZeros32(x) }
Before:
"".f16 STEXT nosplit size=38 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BSRQ	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	$-1, CX
	0x0013 00019 (x.go:8)	CMOVQEQ	CX, AX
	0x0017 00023 (x.go:8)	ADDQ	$-15, AX
	0x001b 00027 (x.go:8)	NEGQ	AX
	0x001e 00030 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0025 00037 (x.go:8)	RET
"".f32 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	BSRQ	AX, AX
	0x0008 00008 (x.go:9)	MOVQ	$-1, CX
	0x000f 00015 (x.go:9)	CMOVQEQ	CX, AX
	0x0013 00019 (x.go:9)	ADDQ	$-31, AX
	0x0017 00023 (x.go:9)	NEGQ	AX
	0x001a 00026 (x.go:9)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:9)	RET
After:
"".f16 STEXT nosplit size=30 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	LEAL	1(AX)(AX*1), AX
	0x000c 00012 (x.go:8)	BSRL	AX, AX
	0x000f 00015 (x.go:8)	ADDQ	$-16, AX
	0x0013 00019 (x.go:8)	NEGQ	AX
	0x0016 00022 (x.go:8)	MOVQ	AX, "".z(SB)
	0x001d 00029 (x.go:8)	RET
"".f32 STEXT nosplit size=28 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	LEAQ	1(AX)(AX*1), AX
	0x0009 00009 (x.go:9)	BSRQ	AX, AX
	0x000d 00013 (x.go:9)	ADDQ	$-32, AX
	0x0011 00017 (x.go:9)	NEGQ	AX
	0x0014 00020 (x.go:9)	MOVQ	AX, "".z(SB)
	0x001b 00027 (x.go:9)	RET
Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b
Reviewed-on: https://go-review.googlesource.com/108941
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 14:54:45 -07:00
										 |  |  | 	// amd64:"BSRQ","LEAQ",-"CMOVQEQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize math/bits Len32 intrinsic on arm64
Arm64 has a 32-bit CLZ instruction CLZW, which can be used for intrinsic Len32.
Function LeadingZeros32 calls Len32, with this change, the assembly code of
LeadingZeros32 becomes more concise.
Go code:
func f32(x uint32) { z = bits.LeadingZeros32(x) }
Before:
"".f32 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0004 00004 (test.go:7)        MOVWU   "".x(FP), R0
        0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZ     R0, R0
        0x000c 00012 ($GOROOT/src/math/bits/bits.go:30) SUB     $32, R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
After:
"".f32 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0004 00004 (test.go:7)        MOVWU   "".x(FP), R0
        0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZW    R0, R0
        0x000c 00012 (test.go:7)        MOVD    R0, "".z(SB)
        0x0018 00024 (test.go:7)        RET     (R30)
Benchmarks:
name              old time/op  new time/op  delta
LeadingZeros-8    2.53ns ± 0%  2.55ns ± 0%   +0.67%  (p=0.000 n=10+10)
LeadingZeros8-8   3.56ns ± 0%  3.56ns ± 0%     ~     (all equal)
LeadingZeros16-8  3.55ns ± 0%  3.56ns ± 0%     ~     (p=0.465 n=10+10)
LeadingZeros32-8  3.55ns ± 0%  2.96ns ± 0%  -16.71%  (p=0.000 n=10+7)
LeadingZeros64-8  2.53ns ± 0%  2.54ns ± 0%     ~     (p=0.059 n=8+10)
Change-Id: Ie5666bb82909e341060e02ffd4e86c0e5d67e90a
Reviewed-on: https://go-review.googlesource.com/c/157000
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
											
										 
											2019-01-02 09:14:26 +00:00
										 |  |  | 	// arm:"CLZ" arm64:"CLZW" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-05 19:46:18 +01:00
										 |  |  | 	return bits.LeadingZeros32(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func LeadingZeros16(n uint16) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize LeadingZeros(16|32) on amd64
Introduce Len8 and Len16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
Also use and optimize the Len32 lowering, along the same lines.
Leave Len8 unused for the moment; a subsequent CL will enable it.
For 16 and 32 bits, this leads to a speed-up.
name              old time/op  new time/op  delta
LeadingZeros16-8  1.42ns ± 5%  1.23ns ± 5%  -13.42%  (p=0.000 n=20+20)
LeadingZeros32-8  1.25ns ± 5%  1.03ns ± 5%  -17.63%  (p=0.000 n=20+16)
Code:
func f16(x uint16) { z = bits.LeadingZeros16(x) }
func f32(x uint32) { z = bits.LeadingZeros32(x) }
Before:
"".f16 STEXT nosplit size=38 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BSRQ	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	$-1, CX
	0x0013 00019 (x.go:8)	CMOVQEQ	CX, AX
	0x0017 00023 (x.go:8)	ADDQ	$-15, AX
	0x001b 00027 (x.go:8)	NEGQ	AX
	0x001e 00030 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0025 00037 (x.go:8)	RET
"".f32 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	BSRQ	AX, AX
	0x0008 00008 (x.go:9)	MOVQ	$-1, CX
	0x000f 00015 (x.go:9)	CMOVQEQ	CX, AX
	0x0013 00019 (x.go:9)	ADDQ	$-31, AX
	0x0017 00023 (x.go:9)	NEGQ	AX
	0x001a 00026 (x.go:9)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:9)	RET
After:
"".f16 STEXT nosplit size=30 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	LEAL	1(AX)(AX*1), AX
	0x000c 00012 (x.go:8)	BSRL	AX, AX
	0x000f 00015 (x.go:8)	ADDQ	$-16, AX
	0x0013 00019 (x.go:8)	NEGQ	AX
	0x0016 00022 (x.go:8)	MOVQ	AX, "".z(SB)
	0x001d 00029 (x.go:8)	RET
"".f32 STEXT nosplit size=28 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	LEAQ	1(AX)(AX*1), AX
	0x0009 00009 (x.go:9)	BSRQ	AX, AX
	0x000d 00013 (x.go:9)	ADDQ	$-32, AX
	0x0011 00017 (x.go:9)	NEGQ	AX
	0x0014 00020 (x.go:9)	MOVQ	AX, "".z(SB)
	0x001b 00027 (x.go:9)	RET
Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b
Reviewed-on: https://go-review.googlesource.com/108941
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 14:54:45 -07:00
										 |  |  | 	// amd64:"BSRL","LEAL",-"CMOVQEQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-05 19:46:18 +01:00
										 |  |  | 	return bits.LeadingZeros16(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func LeadingZeros8(n uint8) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: use intrinsic for LeadingZeros8 on amd64
The previous change sped up the pure computation form of LeadingZeros8.
This places it somewhat close to the table lookup form.
Depending on something that varies from toolchain to toolchain
(alignment, perhaps?), the slowdown from ditching the table lookup
is either 20% or 5%.
This benchmark is the best case scenario for the table lookup:
It is in the L1 cache already.
I think we're close enough that we can switch to the computational version,
and trust that the memory effects and binary size savings will be worth it.
Code:
func f8(x uint8)   { z = bits.LeadingZeros8(x) }
Before:
"".f8 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	MOVBLZX	AL, AX
	0x0008 00008 (x.go:7)	LEAQ	math/bits.len8tab(SB), CX
	0x000f 00015 (x.go:7)	MOVBLZX	(CX)(AX*1), AX
	0x0013 00019 (x.go:7)	ADDQ	$-8, AX
	0x0017 00023 (x.go:7)	NEGQ	AX
	0x001a 00026 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:7)	RET
After:
"".f8 STEXT nosplit size=30 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	MOVBLZX	AL, AX
	0x0008 00008 (x.go:7)	LEAL	1(AX)(AX*1), AX
	0x000c 00012 (x.go:7)	BSRL	AX, AX
	0x000f 00015 (x.go:7)	ADDQ	$-8, AX
	0x0013 00019 (x.go:7)	NEGQ	AX
	0x0016 00022 (x.go:7)	MOVQ	AX, "".z(SB)
	0x001d 00029 (x.go:7)	RET
Change-Id: Icc7db50a7820fb9a3da8a816d6b6940d7f8e193e
Reviewed-on: https://go-review.googlesource.com/108942
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 15:38:50 -07:00
										 |  |  | 	// amd64:"BSRL","LEAL",-"CMOVQEQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-05 19:46:18 +01:00
										 |  |  | 	return bits.LeadingZeros8(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-03-02 15:16:27 +01:00
										 |  |  | // --------------- // | 
					
						
							|  |  |  | //    bits.Len*    // | 
					
						
							|  |  |  | // --------------- // | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Len(n uint) int { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSRQ" | 
					
						
							|  |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-02 15:16:27 +01:00
										 |  |  | 	return bits.Len(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Len64(n uint64) int { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSRQ" | 
					
						
							|  |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-02 15:16:27 +01:00
										 |  |  | 	return bits.Len64(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Len32(n uint32) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize LeadingZeros(16|32) on amd64
Introduce Len8 and Len16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
Also use and optimize the Len32 lowering, along the same lines.
Leave Len8 unused for the moment; a subsequent CL will enable it.
For 16 and 32 bits, this leads to a speed-up.
name              old time/op  new time/op  delta
LeadingZeros16-8  1.42ns ± 5%  1.23ns ± 5%  -13.42%  (p=0.000 n=20+20)
LeadingZeros32-8  1.25ns ± 5%  1.03ns ± 5%  -17.63%  (p=0.000 n=20+16)
Code:
func f16(x uint16) { z = bits.LeadingZeros16(x) }
func f32(x uint32) { z = bits.LeadingZeros32(x) }
Before:
"".f16 STEXT nosplit size=38 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BSRQ	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	$-1, CX
	0x0013 00019 (x.go:8)	CMOVQEQ	CX, AX
	0x0017 00023 (x.go:8)	ADDQ	$-15, AX
	0x001b 00027 (x.go:8)	NEGQ	AX
	0x001e 00030 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0025 00037 (x.go:8)	RET
"".f32 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	BSRQ	AX, AX
	0x0008 00008 (x.go:9)	MOVQ	$-1, CX
	0x000f 00015 (x.go:9)	CMOVQEQ	CX, AX
	0x0013 00019 (x.go:9)	ADDQ	$-31, AX
	0x0017 00023 (x.go:9)	NEGQ	AX
	0x001a 00026 (x.go:9)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:9)	RET
After:
"".f16 STEXT nosplit size=30 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	LEAL	1(AX)(AX*1), AX
	0x000c 00012 (x.go:8)	BSRL	AX, AX
	0x000f 00015 (x.go:8)	ADDQ	$-16, AX
	0x0013 00019 (x.go:8)	NEGQ	AX
	0x0016 00022 (x.go:8)	MOVQ	AX, "".z(SB)
	0x001d 00029 (x.go:8)	RET
"".f32 STEXT nosplit size=28 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	LEAQ	1(AX)(AX*1), AX
	0x0009 00009 (x.go:9)	BSRQ	AX, AX
	0x000d 00013 (x.go:9)	ADDQ	$-32, AX
	0x0011 00017 (x.go:9)	NEGQ	AX
	0x0014 00020 (x.go:9)	MOVQ	AX, "".z(SB)
	0x001b 00027 (x.go:9)	RET
Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b
Reviewed-on: https://go-review.googlesource.com/108941
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 14:54:45 -07:00
										 |  |  | 	// amd64:"BSRQ","LEAQ",-"CMOVQEQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-02 15:16:27 +01:00
										 |  |  | 	return bits.Len32(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Len16(n uint16) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize LeadingZeros(16|32) on amd64
Introduce Len8 and Len16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
Also use and optimize the Len32 lowering, along the same lines.
Leave Len8 unused for the moment; a subsequent CL will enable it.
For 16 and 32 bits, this leads to a speed-up.
name              old time/op  new time/op  delta
LeadingZeros16-8  1.42ns ± 5%  1.23ns ± 5%  -13.42%  (p=0.000 n=20+20)
LeadingZeros32-8  1.25ns ± 5%  1.03ns ± 5%  -17.63%  (p=0.000 n=20+16)
Code:
func f16(x uint16) { z = bits.LeadingZeros16(x) }
func f32(x uint32) { z = bits.LeadingZeros32(x) }
Before:
"".f16 STEXT nosplit size=38 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BSRQ	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	$-1, CX
	0x0013 00019 (x.go:8)	CMOVQEQ	CX, AX
	0x0017 00023 (x.go:8)	ADDQ	$-15, AX
	0x001b 00027 (x.go:8)	NEGQ	AX
	0x001e 00030 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0025 00037 (x.go:8)	RET
"".f32 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	BSRQ	AX, AX
	0x0008 00008 (x.go:9)	MOVQ	$-1, CX
	0x000f 00015 (x.go:9)	CMOVQEQ	CX, AX
	0x0013 00019 (x.go:9)	ADDQ	$-31, AX
	0x0017 00023 (x.go:9)	NEGQ	AX
	0x001a 00026 (x.go:9)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:9)	RET
After:
"".f16 STEXT nosplit size=30 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	LEAL	1(AX)(AX*1), AX
	0x000c 00012 (x.go:8)	BSRL	AX, AX
	0x000f 00015 (x.go:8)	ADDQ	$-16, AX
	0x0013 00019 (x.go:8)	NEGQ	AX
	0x0016 00022 (x.go:8)	MOVQ	AX, "".z(SB)
	0x001d 00029 (x.go:8)	RET
"".f32 STEXT nosplit size=28 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	LEAQ	1(AX)(AX*1), AX
	0x0009 00009 (x.go:9)	BSRQ	AX, AX
	0x000d 00013 (x.go:9)	ADDQ	$-32, AX
	0x0011 00017 (x.go:9)	NEGQ	AX
	0x0014 00020 (x.go:9)	MOVQ	AX, "".z(SB)
	0x001b 00027 (x.go:9)	RET
Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b
Reviewed-on: https://go-review.googlesource.com/108941
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 14:54:45 -07:00
										 |  |  | 	// amd64:"BSRL","LEAL",-"CMOVQEQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-02 15:16:27 +01:00
										 |  |  | 	return bits.Len16(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Len8(n uint8) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: use intrinsic for LeadingZeros8 on amd64
The previous change sped up the pure computation form of LeadingZeros8.
This places it somewhat close to the table lookup form.
Depending on something that varies from toolchain to toolchain
(alignment, perhaps?), the slowdown from ditching the table lookup
is either 20% or 5%.
This benchmark is the best case scenario for the table lookup:
It is in the L1 cache already.
I think we're close enough that we can switch to the computational version,
and trust that the memory effects and binary size savings will be worth it.
Code:
func f8(x uint8)   { z = bits.LeadingZeros8(x) }
Before:
"".f8 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	MOVBLZX	AL, AX
	0x0008 00008 (x.go:7)	LEAQ	math/bits.len8tab(SB), CX
	0x000f 00015 (x.go:7)	MOVBLZX	(CX)(AX*1), AX
	0x0013 00019 (x.go:7)	ADDQ	$-8, AX
	0x0017 00023 (x.go:7)	NEGQ	AX
	0x001a 00026 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:7)	RET
After:
"".f8 STEXT nosplit size=30 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	MOVBLZX	AL, AX
	0x0008 00008 (x.go:7)	LEAL	1(AX)(AX*1), AX
	0x000c 00012 (x.go:7)	BSRL	AX, AX
	0x000f 00015 (x.go:7)	ADDQ	$-8, AX
	0x0013 00019 (x.go:7)	NEGQ	AX
	0x0016 00022 (x.go:7)	MOVQ	AX, "".z(SB)
	0x001d 00029 (x.go:7)	RET
Change-Id: Icc7db50a7820fb9a3da8a816d6b6940d7f8e193e
Reviewed-on: https://go-review.googlesource.com/108942
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 15:38:50 -07:00
										 |  |  | 	// amd64:"BSRL","LEAL",-"CMOVQEQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							|  |  |  | 	// arm:"CLZ" arm64:"CLZ" | 
					
						
							|  |  |  | 	// mips:"CLZ" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Clz" | 
					
						
							| 
									
										
										
										
											2018-03-02 15:16:27 +01:00
										 |  |  | 	return bits.Len8(n) | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2018-03-06 09:39:14 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | // -------------------- // | 
					
						
							|  |  |  | //    bits.OnesCount    // | 
					
						
							|  |  |  | // -------------------- // | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | // amd64:".*x86HasPOPCNT" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | func OnesCount(n uint) int { | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | 	// amd64:"POPCNTQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// arm64:"VCNT","VUADDLV" | 
					
						
							| 
									
										
											  
											
												cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
											
										 
											2018-05-25 17:54:58 +01:00
										 |  |  | 	// s390x:"POPCNT" | 
					
						
							| 
									
										
										
										
											2018-10-12 20:56:12 +02:00
										 |  |  | 	// ppc64:"POPCNTD" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"POPCNTD" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Popcnt" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | 	return bits.OnesCount(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | // amd64:".*x86HasPOPCNT" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | func OnesCount64(n uint64) int { | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | 	// amd64:"POPCNTQ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// arm64:"VCNT","VUADDLV" | 
					
						
							| 
									
										
											  
											
												cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
											
										 
											2018-05-25 17:54:58 +01:00
										 |  |  | 	// s390x:"POPCNT" | 
					
						
							| 
									
										
										
										
											2018-10-12 20:56:12 +02:00
										 |  |  | 	// ppc64:"POPCNTD" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"POPCNTD" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Popcnt" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | 	return bits.OnesCount64(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | // amd64:".*x86HasPOPCNT" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | func OnesCount32(n uint32) int { | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | 	// amd64:"POPCNTL" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// arm64:"VCNT","VUADDLV" | 
					
						
							| 
									
										
											  
											
												cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
											
										 
											2018-05-25 17:54:58 +01:00
										 |  |  | 	// s390x:"POPCNT" | 
					
						
							| 
									
										
										
										
											2018-10-12 20:56:12 +02:00
										 |  |  | 	// ppc64:"POPCNTW" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"POPCNTW" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Popcnt" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | 	return bits.OnesCount32(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | // amd64:".*x86HasPOPCNT" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | func OnesCount16(n uint16) int { | 
					
						
							| 
									
										
										
										
											2019-12-19 10:58:28 -08:00
										 |  |  | 	// amd64:"POPCNTL" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// arm64:"VCNT","VUADDLV" | 
					
						
							| 
									
										
											  
											
												cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
											
										 
											2018-05-25 17:54:58 +01:00
										 |  |  | 	// s390x:"POPCNT" | 
					
						
							| 
									
										
										
										
											2018-10-12 20:56:12 +02:00
										 |  |  | 	// ppc64:"POPCNTW" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"POPCNTW" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Popcnt" | 
					
						
							| 
									
										
										
										
											2018-03-06 12:55:41 +01:00
										 |  |  | 	return bits.OnesCount16(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
											
										 
											2018-05-25 17:54:58 +01:00
										 |  |  | func OnesCount8(n uint8) int { | 
					
						
							|  |  |  | 	// s390x:"POPCNT" | 
					
						
							| 
									
										
										
										
											2018-10-12 20:56:12 +02:00
										 |  |  | 	// ppc64:"POPCNTB" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"POPCNTB" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Popcnt" | 
					
						
							| 
									
										
											  
											
												cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.
name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
											
										 
											2018-05-25 17:54:58 +01:00
										 |  |  | 	return bits.OnesCount8(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-03-06 20:10:35 +01:00
										 |  |  | // ----------------------- // | 
					
						
							|  |  |  | //    bits.ReverseBytes    // | 
					
						
							|  |  |  | // ----------------------- // | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func ReverseBytes(n uint) uint { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSWAPQ" | 
					
						
							|  |  |  | 	// s390x:"MOVDBR" | 
					
						
							|  |  |  | 	// arm64:"REV" | 
					
						
							| 
									
										
										
										
											2018-03-06 20:10:35 +01:00
										 |  |  | 	return bits.ReverseBytes(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func ReverseBytes64(n uint64) uint64 { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSWAPQ" | 
					
						
							|  |  |  | 	// s390x:"MOVDBR" | 
					
						
							|  |  |  | 	// arm64:"REV" | 
					
						
							| 
									
										
										
										
											2018-03-06 20:10:35 +01:00
										 |  |  | 	return bits.ReverseBytes64(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func ReverseBytes32(n uint32) uint32 { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSWAPL" | 
					
						
							|  |  |  | 	// s390x:"MOVWBR" | 
					
						
							|  |  |  | 	// arm64:"REVW" | 
					
						
							| 
									
										
										
										
											2018-03-06 20:10:35 +01:00
										 |  |  | 	return bits.ReverseBytes32(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func ReverseBytes16(n uint16) uint16 { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"ROLW" | 
					
						
							| 
									
										
											  
											
												cmd/compile: add an optimization rule for math/bits.ReverseBytes16 on arm64
On amd64 ReverseBytes16 is lowered to a rotate instruction. However, arm64 doesn't
have a 16-bit rotate instruction, but has a REV16W instruction which can be used
for ReverseBytes16. This CL adds a rule to turn patterns like (x<<8) | (x>>8)
(the type of x is uint16, and "|" can also be "^" or "+") into a REV16W instruction.
Code:
func reverseBytes16(i uint16) uint16 { return bits.ReverseBytes16(i) }
Before:
        0x0004 00004 (test.go:6)        MOVHU   "".i(FP), R0
        0x0008 00008 ($GOROOT/src/math/bits/bits.go:262)        UBFX    $8, R0, $8, R1
        0x000c 00012 ($GOROOT/src/math/bits/bits.go:262)        ORR     R0<<8, R1, R0
        0x0010 00016 (test.go:6)        MOVH    R0, "".~r1+8(FP)
        0x0014 00020 (test.go:6)        RET     (R30)
After:
        0x0000 00000 (test.go:6)        MOVHU   "".i(FP), R0
        0x0004 00004 (test.go:6)        REV16W  R0, R0
        0x0008 00008 (test.go:6)        MOVH    R0, "".~r1+8(FP)
        0x000c 00012 (test.go:6)        RET     (R30)
Benchmarks:
name                old time/op       new time/op       delta
ReverseBytes-224    1.000000ns +- 0%  1.000000ns +- 0%     ~     (all equal)
ReverseBytes16-224  1.500000ns +- 0%  1.000000ns +- 0%  -33.33%  (p=0.000 n=9+10)
ReverseBytes32-224  1.000000ns +- 0%  1.000000ns +- 0%     ~     (all equal)
ReverseBytes64-224  1.000000ns +- 0%  1.000000ns +- 0%     ~     (all equal)
Change-Id: I87cd41b2d8e549bf39c601f185d5775bd42d739c
Reviewed-on: https://go-review.googlesource.com/c/157757
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-02-11 06:37:49 +00:00
										 |  |  | 	// arm64:"REV16W",-"UBFX",-"ORR" | 
					
						
							| 
									
										
										
										
											2019-02-11 09:40:02 +00:00
										 |  |  | 	// arm/5:"SLL","SRL","ORR" | 
					
						
							|  |  |  | 	// arm/6:"REV16" | 
					
						
							|  |  |  | 	// arm/7:"REV16" | 
					
						
							| 
									
										
										
										
											2018-03-06 20:10:35 +01:00
										 |  |  | 	return bits.ReverseBytes16(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | // --------------------- // | 
					
						
							|  |  |  | //    bits.RotateLeft    // | 
					
						
							|  |  |  | // --------------------- // | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func RotateLeft64(n uint64) uint64 { | 
					
						
							|  |  |  | 	// amd64:"ROLQ" | 
					
						
							|  |  |  | 	// arm64:"ROR" | 
					
						
							|  |  |  | 	// ppc64:"ROTL" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"ROTL" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// s390x:"RLLG" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Rotl" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	return bits.RotateLeft64(n, 37) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func RotateLeft32(n uint32) uint32 { | 
					
						
							|  |  |  | 	// amd64:"ROLL" 386:"ROLL" | 
					
						
							| 
									
										
										
										
											2019-08-28 14:32:10 -04:00
										 |  |  | 	// arm:`MOVW\tR[0-9]+@>23` | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// arm64:"RORW" | 
					
						
							|  |  |  | 	// ppc64:"ROTLW" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"ROTLW" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// s390x:"RLL" | 
					
						
							| 
									
										
										
										
											2019-05-17 15:16:38 -06:00
										 |  |  | 	// wasm:"I32Rotl" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	return bits.RotateLeft32(n, 9) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func RotateLeft16(n uint16) uint16 { | 
					
						
							|  |  |  | 	// amd64:"ROLW" 386:"ROLW" | 
					
						
							|  |  |  | 	return bits.RotateLeft16(n, 5) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func RotateLeft8(n uint8) uint8 { | 
					
						
							|  |  |  | 	// amd64:"ROLB" 386:"ROLB" | 
					
						
							|  |  |  | 	return bits.RotateLeft8(n, 5) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | func RotateLeftVariable(n uint, m int) uint { | 
					
						
							|  |  |  | 	// amd64:"ROLQ" | 
					
						
							| 
									
										
										
										
											2018-06-30 06:48:51 +00:00
										 |  |  | 	// arm64:"ROR" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// ppc64:"ROTL" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"ROTL" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// s390x:"RLLG" | 
					
						
							| 
									
										
										
										
											2019-05-17 15:16:38 -06:00
										 |  |  | 	// wasm:"I64Rotl" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	return bits.RotateLeft(n, m) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func RotateLeftVariable64(n uint64, m int) uint64 { | 
					
						
							|  |  |  | 	// amd64:"ROLQ" | 
					
						
							| 
									
										
										
										
											2018-06-30 06:48:51 +00:00
										 |  |  | 	// arm64:"ROR" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// ppc64:"ROTL" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"ROTL" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// s390x:"RLLG" | 
					
						
							| 
									
										
										
										
											2019-05-17 15:16:38 -06:00
										 |  |  | 	// wasm:"I64Rotl" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	return bits.RotateLeft64(n, m) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func RotateLeftVariable32(n uint32, m int) uint32 { | 
					
						
							| 
									
										
										
										
											2019-08-02 02:20:38 +00:00
										 |  |  | 	// arm:`MOVW\tR[0-9]+@>R[0-9]+` | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// amd64:"ROLL" | 
					
						
							| 
									
										
										
										
											2018-06-30 06:48:51 +00:00
										 |  |  | 	// arm64:"RORW" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// ppc64:"ROTLW" | 
					
						
							| 
									
										
										
										
											2018-10-15 12:53:07 -04:00
										 |  |  | 	// ppc64le:"ROTLW" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	// s390x:"RLL" | 
					
						
							| 
									
										
										
										
											2019-05-17 15:16:38 -06:00
										 |  |  | 	// wasm:"I32Rotl" | 
					
						
							| 
									
										
										
										
											2018-09-03 10:47:58 -04:00
										 |  |  | 	return bits.RotateLeft32(n, m) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-03-06 09:39:14 +01:00
										 |  |  | // ------------------------ // | 
					
						
							|  |  |  | //    bits.TrailingZeros    // | 
					
						
							|  |  |  | // ------------------------ // | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func TrailingZeros(n uint) int { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ" | 
					
						
							| 
									
										
										
										
											2019-03-15 08:49:38 +01:00
										 |  |  | 	// arm:"CLZ" | 
					
						
							| 
									
										
											  
											
												cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)
This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-03 09:25:06 +00:00
										 |  |  | 	// arm64:"RBIT","CLZ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							| 
									
										
										
										
											2019-02-08 16:18:12 -02:00
										 |  |  | 	// ppc64/power8:"ANDN","POPCNTD" | 
					
						
							|  |  |  | 	// ppc64le/power8:"ANDN","POPCNTD" | 
					
						
							|  |  |  | 	// ppc64/power9: "CNTTZD" | 
					
						
							|  |  |  | 	// ppc64le/power9: "CNTTZD" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Ctz" | 
					
						
							| 
									
										
										
										
											2018-03-06 09:39:14 +01:00
										 |  |  | 	return bits.TrailingZeros(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func TrailingZeros64(n uint64) int { | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ" | 
					
						
							| 
									
										
											  
											
												cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argrument is a nonzero value, then ORconst operation can also be eliminated.
Benchmarks:
name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)
This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-03 09:25:06 +00:00
										 |  |  | 	// arm64:"RBIT","CLZ" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR" | 
					
						
							| 
									
										
										
										
											2019-02-08 16:18:12 -02:00
										 |  |  | 	// ppc64/power8:"ANDN","POPCNTD" | 
					
						
							|  |  |  | 	// ppc64le/power8:"ANDN","POPCNTD" | 
					
						
							|  |  |  | 	// ppc64/power9: "CNTTZD" | 
					
						
							|  |  |  | 	// ppc64le/power9: "CNTTZD" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Ctz" | 
					
						
							| 
									
										
										
										
											2018-03-06 09:39:14 +01:00
										 |  |  | 	return bits.TrailingZeros64(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func TrailingZeros32(n uint32) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: add patterns for bit set/clear/complement on amd64
This patch completes implementation of BT(Q|L), and adds support
for BT(S|R|C)(Q|L).
Example of code changes from time.(*Time).addSec:
        if t.wall&hasMonotonic != 0 {
  0x1073465               488b08                  MOVQ 0(AX), CX
  0x1073468               4889ca                  MOVQ CX, DX
  0x107346b               48c1e93f                SHRQ $0x3f, CX
  0x107346f               48c1e13f                SHLQ $0x3f, CX
  0x1073473               48f7c1ffffffff          TESTQ $-0x1, CX
  0x107347a               746b                    JE 0x10734e7
        if t.wall&hasMonotonic != 0 {
  0x1073435               488b08                  MOVQ 0(AX), CX
  0x1073438               480fbae13f              BTQ $0x3f, CX
  0x107343d               7363                    JAE 0x10734a2
Another example:
                        t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic
  0x10734c8               4881e1ffffff3f          ANDQ $0x3fffffff, CX
  0x10734cf               48c1e61e                SHLQ $0x1e, SI
  0x10734d3               4809ce                  ORQ CX, SI
  0x10734d6               48b90000000000000080    MOVQ $0x8000000000000000, CX
  0x10734e0               4809f1                  ORQ SI, CX
  0x10734e3               488908                  MOVQ CX, 0(AX)
                        t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic
  0x107348b		4881e2ffffff3f		ANDQ $0x3fffffff, DX
  0x1073492		48c1e61e		SHLQ $0x1e, SI
  0x1073496		4809f2			ORQ SI, DX
  0x1073499		480fbaea3f		BTSQ $0x3f, DX
  0x107349e		488910			MOVQ DX, 0(AX)
Go1 benchmarks seem unaffected, and I would be surprised
otherwise:
name                     old time/op    new time/op     delta
BinaryTree17-4              2.64s ± 4%      2.56s ± 9%  -2.92%  (p=0.008 n=9+9)
Fannkuch11-4                2.90s ± 1%      2.95s ± 3%  +1.76%  (p=0.010 n=10+9)
FmtFprintfEmpty-4          35.3ns ± 1%     34.5ns ± 2%  -2.34%  (p=0.004 n=9+8)
FmtFprintfString-4         57.0ns ± 1%     58.4ns ± 5%  +2.52%  (p=0.029 n=9+10)
FmtFprintfInt-4            59.8ns ± 3%     59.8ns ± 6%    ~     (p=0.565 n=10+10)
FmtFprintfIntInt-4         93.9ns ± 3%     91.2ns ± 5%  -2.94%  (p=0.014 n=10+9)
FmtFprintfPrefixedInt-4     107ns ± 6%      104ns ± 6%    ~     (p=0.099 n=10+10)
FmtFprintfFloat-4           187ns ± 3%      188ns ± 3%    ~     (p=0.505 n=10+9)
FmtManyArgs-4               410ns ± 1%      415ns ± 6%    ~     (p=0.649 n=8+10)
GobDecode-4                5.30ms ± 3%     5.27ms ± 3%    ~     (p=0.436 n=10+10)
GobEncode-4                4.62ms ± 5%     4.47ms ± 2%  -3.24%  (p=0.001 n=9+10)
Gzip-4                      197ms ± 4%      193ms ± 3%    ~     (p=0.123 n=10+10)
Gunzip-4                   30.4ms ± 3%     30.1ms ± 3%    ~     (p=0.481 n=10+10)
HTTPClientServer-4         76.3µs ± 1%     76.0µs ± 1%    ~     (p=0.236 n=8+9)
JSONEncode-4               10.5ms ± 9%     10.3ms ± 3%    ~     (p=0.280 n=10+10)
JSONDecode-4               42.3ms ±10%     41.3ms ± 2%    ~     (p=0.053 n=9+10)
Mandelbrot200-4            3.80ms ± 2%     3.72ms ± 2%  -2.15%  (p=0.001 n=9+10)
GoParse-4                  2.88ms ±10%     2.81ms ± 2%    ~     (p=0.247 n=10+10)
RegexpMatchEasy0_32-4      69.5ns ± 4%     68.6ns ± 2%    ~     (p=0.171 n=10+10)
RegexpMatchEasy0_1K-4       165ns ± 3%      162ns ± 3%    ~     (p=0.137 n=10+10)
RegexpMatchEasy1_32-4      65.7ns ± 6%     64.4ns ± 2%  -2.02%  (p=0.037 n=10+10)
RegexpMatchEasy1_1K-4       278ns ± 2%      279ns ± 3%    ~     (p=0.991 n=8+9)
RegexpMatchMedium_32-4     99.3ns ± 3%     98.5ns ± 4%    ~     (p=0.457 n=10+9)
RegexpMatchMedium_1K-4     30.1µs ± 1%     30.4µs ± 2%    ~     (p=0.173 n=8+10)
RegexpMatchHard_32-4       1.40µs ± 2%     1.41µs ± 4%    ~     (p=0.565 n=10+10)
RegexpMatchHard_1K-4       42.5µs ± 1%     41.5µs ± 3%  -2.13%  (p=0.002 n=8+9)
Revcomp-4                   332ms ± 4%      328ms ± 5%    ~     (p=0.720 n=9+10)
Template-4                 48.3ms ± 2%     49.6ms ± 3%  +2.56%  (p=0.002 n=8+10)
TimeParse-4                 252ns ± 2%      249ns ± 3%    ~     (p=0.116 n=9+10)
TimeFormat-4                262ns ± 4%      252ns ± 3%  -4.01%  (p=0.000 n=9+10)
name                     old speed      new speed       delta
GobDecode-4               145MB/s ± 3%    146MB/s ± 3%    ~     (p=0.436 n=10+10)
GobEncode-4               166MB/s ± 5%    172MB/s ± 2%  +3.28%  (p=0.001 n=9+10)
Gzip-4                   98.6MB/s ± 4%  100.4MB/s ± 3%    ~     (p=0.123 n=10+10)
Gunzip-4                  639MB/s ± 3%    645MB/s ± 3%    ~     (p=0.481 n=10+10)
JSONEncode-4              185MB/s ± 8%    189MB/s ± 3%    ~     (p=0.280 n=10+10)
JSONDecode-4             46.0MB/s ± 9%   47.0MB/s ± 2%  +2.21%  (p=0.046 n=9+10)
GoParse-4                20.1MB/s ± 9%   20.6MB/s ± 2%    ~     (p=0.239 n=10+10)
RegexpMatchEasy0_32-4     460MB/s ± 4%    467MB/s ± 2%    ~     (p=0.165 n=10+10)
RegexpMatchEasy0_1K-4    6.19GB/s ± 3%   6.28GB/s ± 3%    ~     (p=0.165 n=10+10)
RegexpMatchEasy1_32-4     487MB/s ± 5%    497MB/s ± 2%  +2.00%  (p=0.043 n=10+10)
RegexpMatchEasy1_1K-4    3.67GB/s ± 2%   3.67GB/s ± 3%    ~     (p=0.963 n=8+9)
RegexpMatchMedium_32-4   10.1MB/s ± 3%   10.1MB/s ± 4%    ~     (p=0.435 n=10+9)
RegexpMatchMedium_1K-4   34.0MB/s ± 1%   33.7MB/s ± 2%    ~     (p=0.173 n=8+10)
RegexpMatchHard_32-4     22.9MB/s ± 2%   22.7MB/s ± 4%    ~     (p=0.565 n=10+10)
RegexpMatchHard_1K-4     24.0MB/s ± 3%   24.7MB/s ± 3%  +2.64%  (p=0.001 n=9+9)
Revcomp-4                 766MB/s ± 4%    775MB/s ± 5%    ~     (p=0.720 n=9+10)
Template-4               40.2MB/s ± 2%   39.2MB/s ± 3%  -2.47%  (p=0.002 n=8+10)
The rules match ~1800 times during all.bash.
Fixes #18943
Change-Id: I64be1ada34e89c486dfd935bf429b35652117ed4
Reviewed-on: https://go-review.googlesource.com/94766
Run-TryBot: Giovanni Bajo <rasky@develer.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-02-17 13:54:03 +01:00
										 |  |  | 	// amd64:"BTSQ\\t\\$32","BSFQ" | 
					
						
							| 
									
										
										
										
											2019-03-15 08:49:38 +01:00
										 |  |  | 	// arm:"CLZ" | 
					
						
							| 
									
										
											  
											
												cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argrument is a nonzero value, then ORconst operation can also be eliminated.
Benchmarks:
name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)
This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-03 09:25:06 +00:00
										 |  |  | 	// arm64:"RBITW","CLZW" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR","MOVWZ" | 
					
						
							| 
									
										
										
										
											2019-02-08 16:18:12 -02:00
										 |  |  | 	// ppc64/power8:"ANDN","POPCNTW" | 
					
						
							|  |  |  | 	// ppc64le/power8:"ANDN","POPCNTW" | 
					
						
							|  |  |  | 	// ppc64/power9: "CNTTZW" | 
					
						
							|  |  |  | 	// ppc64le/power9: "CNTTZW" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Ctz" | 
					
						
							| 
									
										
										
										
											2018-03-06 09:39:14 +01:00
										 |  |  | 	return bits.TrailingZeros32(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func TrailingZeros16(n uint16) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize TrailingZeros(8|16) on amd64
Introduce Ctz8 and Ctz16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
name               old time/op  new time/op  delta
TrailingZeros8-8   1.33ns ± 6%  0.84ns ± 3%  -36.90%  (p=0.000 n=20+20)
TrailingZeros16-8  1.26ns ± 5%  0.84ns ± 5%  -33.50%  (p=0.000 n=20+18)
Code:
func f8(x uint8)   { z = bits.TrailingZeros8(x) }
func f16(x uint16) { z = bits.TrailingZeros16(x) }
Before:
"".f8 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	MOVBLZX	AL, AX
	0x0008 00008 (x.go:7)	BTSQ	$8, AX
	0x000d 00013 (x.go:7)	BSFQ	AX, AX
	0x0011 00017 (x.go:7)	MOVL	$64, CX
	0x0016 00022 (x.go:7)	CMOVQEQ	CX, AX
	0x001a 00026 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:7)	RET
"".f16 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BTSQ	$16, AX
	0x000d 00013 (x.go:8)	BSFQ	AX, AX
	0x0011 00017 (x.go:8)	MOVL	$64, CX
	0x0016 00022 (x.go:8)	CMOVQEQ	CX, AX
	0x001a 00026 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:8)	RET
After:
"".f8 STEXT nosplit size=20 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	BTSL	$8, AX
	0x0009 00009 (x.go:7)	BSFL	AX, AX
	0x000c 00012 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0013 00019 (x.go:7)	RET
"".f16 STEXT nosplit size=20 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	BTSL	$16, AX
	0x0009 00009 (x.go:8)	BSFL	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0013 00019 (x.go:8)	RET
Change-Id: I0551e357348de2b724737d569afd6ac9f5c3aa11
Reviewed-on: https://go-review.googlesource.com/108940
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 14:46:41 -07:00
										 |  |  | 	// amd64:"BSFL","BTSL\\t\\$16" | 
					
						
							| 
									
										
										
										
											2019-08-30 06:24:58 +00:00
										 |  |  | 	// 386:"BSFL\t" | 
					
						
							| 
									
										
										
										
											2019-03-15 08:49:38 +01:00
										 |  |  | 	// arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" | 
					
						
							| 
									
										
											  
											
												cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argrument is a nonzero value, then ORconst operation can also be eliminated.
Benchmarks:
name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)
This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-03 09:25:06 +00:00
										 |  |  | 	// arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR","OR\t\\$65536" | 
					
						
							| 
									
										
										
										
											2019-02-08 16:18:12 -02:00
										 |  |  | 	// ppc64/power8:"POPCNTD","OR\\t\\$65536" | 
					
						
							|  |  |  | 	// ppc64le/power8:"POPCNTD","OR\\t\\$65536" | 
					
						
							|  |  |  | 	// ppc64/power9:"CNTTZD","OR\\t\\$65536" | 
					
						
							|  |  |  | 	// ppc64le/power9:"CNTTZD","OR\\t\\$65536" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Ctz" | 
					
						
							| 
									
										
										
										
											2018-03-06 09:39:14 +01:00
										 |  |  | 	return bits.TrailingZeros16(n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func TrailingZeros8(n uint8) int { | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize TrailingZeros(8|16) on amd64
Introduce Ctz8 and Ctz16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.
name               old time/op  new time/op  delta
TrailingZeros8-8   1.33ns ± 6%  0.84ns ± 3%  -36.90%  (p=0.000 n=20+20)
TrailingZeros16-8  1.26ns ± 5%  0.84ns ± 5%  -33.50%  (p=0.000 n=20+18)
Code:
func f8(x uint8)   { z = bits.TrailingZeros8(x) }
func f16(x uint16) { z = bits.TrailingZeros16(x) }
Before:
"".f8 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	MOVBLZX	AL, AX
	0x0008 00008 (x.go:7)	BTSQ	$8, AX
	0x000d 00013 (x.go:7)	BSFQ	AX, AX
	0x0011 00017 (x.go:7)	MOVL	$64, CX
	0x0016 00022 (x.go:7)	CMOVQEQ	CX, AX
	0x001a 00026 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:7)	RET
"".f16 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BTSQ	$16, AX
	0x000d 00013 (x.go:8)	BSFQ	AX, AX
	0x0011 00017 (x.go:8)	MOVL	$64, CX
	0x0016 00022 (x.go:8)	CMOVQEQ	CX, AX
	0x001a 00026 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:8)	RET
After:
"".f8 STEXT nosplit size=20 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	BTSL	$8, AX
	0x0009 00009 (x.go:7)	BSFL	AX, AX
	0x000c 00012 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0013 00019 (x.go:7)	RET
"".f16 STEXT nosplit size=20 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	BTSL	$16, AX
	0x0009 00009 (x.go:8)	BSFL	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0013 00019 (x.go:8)	RET
Change-Id: I0551e357348de2b724737d569afd6ac9f5c3aa11
Reviewed-on: https://go-review.googlesource.com/108940
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2018-04-23 14:46:41 -07:00
										 |  |  | 	// amd64:"BSFL","BTSL\\t\\$8" | 
					
						
							| 
									
										
										
										
											2019-03-15 08:49:38 +01:00
										 |  |  | 	// arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" | 
					
						
							| 
									
										
											  
											
												cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is a nonzero value, then the ORconst operation can also be eliminated.
Benchmarks:
name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)
This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-03 09:25:06 +00:00
										 |  |  | 	// arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" | 
					
						
							| 
									
										
										
										
											2018-03-08 17:43:55 +01:00
										 |  |  | 	// s390x:"FLOGR","OR\t\\$256" | 
					
						
							| 
									
										
										
										
											2019-03-05 01:56:17 +01:00
										 |  |  | 	// wasm:"I64Ctz" | 
					
						
							| 
									
										
										
										
											2018-03-06 09:39:14 +01:00
										 |  |  | 	return bits.TrailingZeros8(n) | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2018-04-25 11:52:06 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | // IterateBitsNN checks special handling of TrailingZerosNN when the input is known to be non-zero. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func IterateBits(n uint) int { | 
					
						
							|  |  |  | 	i := 0 | 
					
						
							|  |  |  | 	for n != 0 { | 
					
						
							|  |  |  | 		// amd64:"BSFQ",-"CMOVEQ" | 
					
						
							|  |  |  | 		i += bits.TrailingZeros(n) | 
					
						
							|  |  |  | 		n &= n - 1 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return i | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func IterateBits64(n uint64) int { | 
					
						
							|  |  |  | 	i := 0 | 
					
						
							|  |  |  | 	for n != 0 { | 
					
						
							|  |  |  | 		// amd64:"BSFQ",-"CMOVEQ" | 
					
						
							|  |  |  | 		i += bits.TrailingZeros64(n) | 
					
						
							|  |  |  | 		n &= n - 1 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return i | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func IterateBits32(n uint32) int { | 
					
						
							|  |  |  | 	i := 0 | 
					
						
							|  |  |  | 	for n != 0 { | 
					
						
							|  |  |  | 		// amd64:"BSFL",-"BTSQ" | 
					
						
							|  |  |  | 		i += bits.TrailingZeros32(n) | 
					
						
							|  |  |  | 		n &= n - 1 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return i | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func IterateBits16(n uint16) int { | 
					
						
							|  |  |  | 	i := 0 | 
					
						
							|  |  |  | 	for n != 0 { | 
					
						
							|  |  |  | 		// amd64:"BSFL",-"BTSL" | 
					
						
							| 
									
										
											  
											
												cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argrument is a nonzero value, then ORconst operation can also be eliminated.
Benchmarks:
name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)
This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-03 09:25:06 +00:00
										 |  |  | 		// arm64:"RBITW","CLZW",-"ORR" | 
					
						
							| 
									
										
										
										
											2018-04-25 11:52:06 -07:00
										 |  |  | 		i += bits.TrailingZeros16(n) | 
					
						
							|  |  |  | 		n &= n - 1 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return i | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func IterateBits8(n uint8) int { | 
					
						
							|  |  |  | 	i := 0 | 
					
						
							|  |  |  | 	for n != 0 { | 
					
						
							|  |  |  | 		// amd64:"BSFL",-"BTSL" | 
					
						
							| 
									
										
											  
											
												cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argrument is a nonzero value, then ORconst operation can also be eliminated.
Benchmarks:
name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)
code:
func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }
Before:
"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)
This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
After:
"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)
The situation of TrailingZeros8 is similar to TrailingZeros16.
Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-03 09:25:06 +00:00
										 |  |  | 		// arm64:"RBITW","CLZW",-"ORR" | 
					
						
							| 
									
										
										
										
											2018-04-25 11:52:06 -07:00
										 |  |  | 		i += bits.TrailingZeros8(n) | 
					
						
							|  |  |  | 		n &= n - 1 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return i | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2018-08-14 16:41:22 -06:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | // --------------- // | 
					
						
							|  |  |  | //    bits.Add*    // | 
					
						
							|  |  |  | // --------------- // | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add(x, y, ci uint) (r, co uint) { | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"NEGL","ADCQ","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE","ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	return bits.Add(x, y, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func AddC(x, ci uint) (r, co uint) { | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"NEGL","ADCQ","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE","ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	return bits.Add(x, 7, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func AddZ(x, y uint) (r, co uint) { | 
					
						
							| 
									
										
										
										
											2019-03-21 03:24:47 +00:00
										 |  |  | 	// arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDC",-"ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	return bits.Add(x, y, 0) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func AddR(x, y, ci uint) uint { | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADDS","ADCS",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE","ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	r, _ := bits.Add(x, y, ci) | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | func AddM(p, q, r *[3]uint) { | 
					
						
							|  |  |  | 	var c uint | 
					
						
							|  |  |  | 	r[0], c = bits.Add(p[0], q[0], c) | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADCS",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"ADCQ",-"NEGL",-"SBBQ",-"NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE",-"ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	r[1], c = bits.Add(p[1], q[1], c) | 
					
						
							|  |  |  | 	r[2], c = bits.Add(p[2], q[2], c) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64(x, y, ci uint64) (r, co uint64) { | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"NEGL","ADCQ","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-24 14:33:50 -03:00
										 |  |  | 	// ppc64: "ADDC", "ADDE", "ADDZE" | 
					
						
							|  |  |  | 	// ppc64le: "ADDC", "ADDE", "ADDZE" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE","ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	return bits.Add64(x, y, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64C(x, ci uint64) (r, co uint64) { | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADDS","ADCS","ADC",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"NEGL","ADCQ","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-24 14:33:50 -03:00
										 |  |  | 	// ppc64: "ADDC", "ADDE", "ADDZE" | 
					
						
							|  |  |  | 	// ppc64le: "ADDC", "ADDE", "ADDZE" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE","ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	return bits.Add64(x, 7, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64Z(x, y uint64) (r, co uint64) { | 
					
						
							| 
									
										
										
										
											2019-03-21 03:24:47 +00:00
										 |  |  | 	// arm64:"ADDS","ADC",-"ADCS",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"ADDQ","SBBQ","NEGQ",-"NEGL",-"ADCQ" | 
					
						
							| 
									
										
										
										
											2019-04-24 14:33:50 -03:00
										 |  |  | 	// ppc64: "ADDC", "ADDE", "ADDZE" | 
					
						
							|  |  |  | 	// ppc64le: "ADDC", "ADDE", "ADDZE" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDC",-"ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	return bits.Add64(x, y, 0) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64R(x, y, ci uint64) uint64 { | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADDS","ADCS",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"NEGL","ADCQ",-"SBBQ",-"NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-24 14:33:50 -03:00
										 |  |  | 	// ppc64: "ADDC", "ADDE", "ADDZE" | 
					
						
							|  |  |  | 	// ppc64le: "ADDC", "ADDE", "ADDZE" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE","ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	r, _ := bits.Add64(x, y, ci) | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | func Add64M(p, q, r *[3]uint64) { | 
					
						
							|  |  |  | 	var c uint64 | 
					
						
							|  |  |  | 	r[0], c = bits.Add64(p[0], q[0], c) | 
					
						
							| 
									
										
										
										
											2019-01-14 09:36:18 +00:00
										 |  |  | 	// arm64:"ADCS",-"ADD\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	// amd64:"ADCQ",-"NEGL",-"SBBQ",-"NEGQ" | 
					
						
							| 
									
										
										
										
											2019-04-24 14:33:50 -03:00
										 |  |  | 	// ppc64: "ADDC", "ADDE", "ADDZE" | 
					
						
							|  |  |  | 	// ppc64le: "ADDC", "ADDE", "ADDZE" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"ADDE",-"ADDC\t[$]-1," | 
					
						
							| 
									
										
										
										
											2018-10-23 14:05:38 -07:00
										 |  |  | 	r[1], c = bits.Add64(p[1], q[1], c) | 
					
						
							|  |  |  | 	r[2], c = bits.Add64(p[2], q[2], c) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-02-17 03:43:33 -08:00
										 |  |  | func Add64PanicOnOverflowEQ(a, b uint64) uint64 { | 
					
						
							|  |  |  | 	r, c := bits.Add64(a, b, 0) | 
					
						
							|  |  |  | 	// s390x:"BRC\t[$]3,",-"ADDE" | 
					
						
							|  |  |  | 	if c == 1 { | 
					
						
							|  |  |  | 		panic("overflow") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64PanicOnOverflowNE(a, b uint64) uint64 { | 
					
						
							|  |  |  | 	r, c := bits.Add64(a, b, 0) | 
					
						
							|  |  |  | 	// s390x:"BRC\t[$]3,",-"ADDE" | 
					
						
							|  |  |  | 	if c != 0 { | 
					
						
							|  |  |  | 		panic("overflow") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64PanicOnOverflowGT(a, b uint64) uint64 { | 
					
						
							|  |  |  | 	r, c := bits.Add64(a, b, 0) | 
					
						
							|  |  |  | 	// s390x:"BRC\t[$]3,",-"ADDE" | 
					
						
							|  |  |  | 	if c > 0 { | 
					
						
							|  |  |  | 		panic("overflow") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64MPanicOnOverflowEQ(a, b [2]uint64) [2]uint64 { | 
					
						
							|  |  |  | 	var r [2]uint64 | 
					
						
							|  |  |  | 	var c uint64 | 
					
						
							|  |  |  | 	r[0], c = bits.Add64(a[0], b[0], c) | 
					
						
							|  |  |  | 	r[1], c = bits.Add64(a[1], b[1], c) | 
					
						
							|  |  |  | 	// s390x:"BRC\t[$]3," | 
					
						
							|  |  |  | 	if c == 1 { | 
					
						
							|  |  |  | 		panic("overflow") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64MPanicOnOverflowNE(a, b [2]uint64) [2]uint64 { | 
					
						
							|  |  |  | 	var r [2]uint64 | 
					
						
							|  |  |  | 	var c uint64 | 
					
						
							|  |  |  | 	r[0], c = bits.Add64(a[0], b[0], c) | 
					
						
							|  |  |  | 	r[1], c = bits.Add64(a[1], b[1], c) | 
					
						
							|  |  |  | 	// s390x:"BRC\t[$]3," | 
					
						
							|  |  |  | 	if c != 0 { | 
					
						
							|  |  |  | 		panic("overflow") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Add64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 { | 
					
						
							|  |  |  | 	var r [2]uint64 | 
					
						
							|  |  |  | 	var c uint64 | 
					
						
							|  |  |  | 	r[0], c = bits.Add64(a[0], b[0], c) | 
					
						
							|  |  |  | 	r[1], c = bits.Add64(a[1], b[1], c) | 
					
						
							|  |  |  | 	// s390x:"BRC\t[$]3," | 
					
						
							|  |  |  | 	if c > 0 { | 
					
						
							|  |  |  | 		panic("overflow") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | // --------------- // | 
					
						
							|  |  |  | //    bits.Sub*    // | 
					
						
							|  |  |  | // --------------- // | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Sub(x, y, ci uint) (r, co uint) { | 
					
						
							|  |  |  | 	// amd64:"NEGL","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBE" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	return bits.Sub(x, y, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func SubC(x, ci uint) (r, co uint) { | 
					
						
							|  |  |  | 	// amd64:"NEGL","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBE" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	return bits.Sub(x, 7, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func SubZ(x, y uint) (r, co uint) { | 
					
						
							|  |  |  | 	// amd64:"SUBQ","SBBQ","NEGQ",-"NEGL" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBC" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	return bits.Sub(x, y, 0) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func SubR(x, y, ci uint) uint { | 
					
						
							|  |  |  | 	// amd64:"NEGL","SBBQ",-"NEGQ" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBE" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	r, _ := bits.Sub(x, y, ci) | 
					
						
							|  |  |  | 	return r | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | func SubM(p, q, r *[3]uint) { | 
					
						
							|  |  |  | 	var c uint | 
					
						
							|  |  |  | 	r[0], c = bits.Sub(p[0], q[0], c) | 
					
						
							|  |  |  | 	// amd64:"SBBQ",-"NEGL",-"NEGQ" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"SBCS",-"NEGS",-"NGC",-"NEG",-"ADD",-"SUB",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBE" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	r[1], c = bits.Sub(p[1], q[1], c) | 
					
						
							|  |  |  | 	r[2], c = bits.Sub(p[2], q[2], c) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Sub64(x, y, ci uint64) (r, co uint64) { | 
					
						
							|  |  |  | 	// amd64:"NEGL","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBE" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	return bits.Sub64(x, y, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Sub64C(x, ci uint64) (r, co uint64) { | 
					
						
							|  |  |  | 	// amd64:"NEGL","SBBQ","NEGQ" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"NEGS","SBCS","NGC","NEG",-"ADD",-"SUB",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBE" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	return bits.Sub64(x, 7, ci) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func Sub64Z(x, y uint64) (r, co uint64) { | 
					
						
							|  |  |  | 	// amd64:"SUBQ","SBBQ","NEGQ",-"NEGL" | 
					
						
							| 
									
										
										
										
											2019-03-20 12:46:20 +00:00
										 |  |  | 	// arm64:"SUBS","NGC","NEG",-"SBCS",-"ADD",-"SUB\t",-"CMP" | 
					
						
							| 
									
										
										
										
											2019-04-30 17:46:23 +01:00
										 |  |  | 	// s390x:"SUBC" | 
					
						
							| 
									
										
										
										
											2018-10-23 14:38:22 -07:00
										 |  |  | 	return bits.Sub64(x, y, 0) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
// Sub64R is a codegen test case for bits.Sub64 where only the difference is
// used. The borrow-out is discarded with the blank identifier and the
// difference is returned.
//
// The comments inside the body are asmcheck directives for the line that
// follows them. A quoted opcode must appear in the generated assembly for
// the named architecture, and a quoted opcode with a leading '-' must not.
func Sub64R(x, y, ci uint64) uint64 {
	// amd64:"NEGL","SBBQ",-"NEGQ"
	// arm64:"NEGS","SBCS",-"NGC",-"NEG\t",-"ADD",-"SUB",-"CMP"
	// s390x:"SUBE"
	r, _ := bits.Sub64(x, y, ci)
	return r
}
					
						
// Sub64M is a codegen test case for a multi-word subtraction. It stores
// p - q into r one uint64 word at a time, from index 0 to index 2, passing
// the borrow c produced by each bits.Sub64 call into the next call. The
// borrow from the last call is assigned to c but not read again.
//
// The asmcheck directives sit directly above the second call and are matched
// against the code generated for that line. A quoted opcode must be present
// for the named architecture, and one with a leading '-' must be absent.
func Sub64M(p, q, r *[3]uint64) {
	// c holds its zero value here, so the first word has no borrow-in.
	var c uint64
	r[0], c = bits.Sub64(p[0], q[0], c)
	// amd64:"SBBQ",-"NEGL",-"NEGQ"
	// arm64:"SBCS",-"NEGS",-"NGC",-"NEG",-"ADD",-"SUB",-"CMP"
	// s390x:"SUBE"
	r[1], c = bits.Sub64(p[1], q[1], c)
	r[2], c = bits.Sub64(p[2], q[2], c)
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-02-17 03:43:33 -08:00
// Sub64PanicOnOverflowEQ is a codegen test case that subtracts b from a with
// bits.Sub64 and a borrow-in of 0. It panics with the message "overflow"
// when the borrow-out equals 1, and otherwise returns the difference.
//
// The short variable declaration introduces r and reuses the parameter b to
// hold the borrow-out. The asmcheck directive above the comparison names
// s390x opcodes that must be present (quoted) or absent (quoted with a
// leading '-') in the code generated for that line.
func Sub64PanicOnOverflowEQ(a, b uint64) uint64 {
	r, b := bits.Sub64(a, b, 0)
	// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
	if b == 1 {
		panic("overflow")
	}
	return r
}
					
						
							|  |  |  | 
 | 
					
						
// Sub64PanicOnOverflowNE is a codegen test case that subtracts b from a with
// bits.Sub64 and a borrow-in of 0. It panics with the message "overflow"
// when the borrow-out is not 0, and otherwise returns the difference.
//
// The short variable declaration introduces r and reuses the parameter b to
// hold the borrow-out. The asmcheck directive above the comparison names
// s390x opcodes that must be present (quoted) or absent (quoted with a
// leading '-') in the code generated for that line.
func Sub64PanicOnOverflowNE(a, b uint64) uint64 {
	r, b := bits.Sub64(a, b, 0)
	// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
	if b != 0 {
		panic("overflow")
	}
	return r
}
					
						
							|  |  |  | 
 | 
					
						
// Sub64PanicOnOverflowGT is a codegen test case that subtracts b from a with
// bits.Sub64 and a borrow-in of 0. It panics with the message "overflow"
// when the borrow-out is greater than 0, and otherwise returns the
// difference.
//
// The short variable declaration introduces r and reuses the parameter b to
// hold the borrow-out. The asmcheck directive above the comparison names
// s390x opcodes that must be present (quoted) or absent (quoted with a
// leading '-') in the code generated for that line.
func Sub64PanicOnOverflowGT(a, b uint64) uint64 {
	r, b := bits.Sub64(a, b, 0)
	// s390x:"BRC\t[$]12,",-"ADDE",-"SUBE"
	if b > 0 {
		panic("overflow")
	}
	return r
}
					
						
							|  |  |  | 
 | 
					
						
// Sub64MPanicOnOverflowEQ is a codegen test case for a two-word subtraction
// with an overflow check. It computes a - b word by word, index 0 first,
// feeding the borrow c from the first bits.Sub64 call into the second. It
// panics with the message "overflow" when the final borrow equals 1, and
// otherwise returns the two-word difference.
//
// The asmcheck directive above the comparison names an s390x opcode pattern
// that must appear in the code generated for that line.
func Sub64MPanicOnOverflowEQ(a, b [2]uint64) [2]uint64 {
	var r [2]uint64
	// c holds its zero value here, so the first word has no borrow-in.
	var c uint64
	r[0], c = bits.Sub64(a[0], b[0], c)
	r[1], c = bits.Sub64(a[1], b[1], c)
	// s390x:"BRC\t[$]12,"
	if c == 1 {
		panic("overflow")
	}
	return r
}
					
						
							|  |  |  | 
 | 
					
						
// Sub64MPanicOnOverflowNE is a codegen test case for a two-word subtraction
// with an overflow check. It computes a - b word by word, index 0 first,
// feeding the borrow c from the first bits.Sub64 call into the second. It
// panics with the message "overflow" when the final borrow is not 0, and
// otherwise returns the two-word difference.
//
// The asmcheck directive above the comparison names an s390x opcode pattern
// that must appear in the code generated for that line.
func Sub64MPanicOnOverflowNE(a, b [2]uint64) [2]uint64 {
	var r [2]uint64
	// c holds its zero value here, so the first word has no borrow-in.
	var c uint64
	r[0], c = bits.Sub64(a[0], b[0], c)
	r[1], c = bits.Sub64(a[1], b[1], c)
	// s390x:"BRC\t[$]12,"
	if c != 0 {
		panic("overflow")
	}
	return r
}
					
						
							|  |  |  | 
 | 
					
						
// Sub64MPanicOnOverflowGT is a codegen test case for a two-word subtraction
// with an overflow check. It computes a - b word by word, index 0 first,
// feeding the borrow c from the first bits.Sub64 call into the second. It
// panics with the message "overflow" when the final borrow is greater than
// 0, and otherwise returns the two-word difference.
//
// The asmcheck directive above the comparison names an s390x opcode pattern
// that must appear in the code generated for that line.
func Sub64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 {
	var r [2]uint64
	// c holds its zero value here, so the first word has no borrow-in.
	var c uint64
	r[0], c = bits.Sub64(a[0], b[0], c)
	r[1], c = bits.Sub64(a[1], b[1], c)
	// s390x:"BRC\t[$]12,"
	if c > 0 {
		panic("overflow")
	}
	return r
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-08-14 16:41:22 -06:00
										 |  |  | // --------------- // | 
					
						
							|  |  |  | //    bits.Mul*    // | 
					
						
							|  |  |  | // --------------- // | 
					
						
							|  |  |  | 
 | 
					
						
// Mul is a codegen test case for bits.Mul on uint operands. It returns the
// result of bits.Mul(x, y) unchanged as the pair (hi, lo).
//
// The comments inside the body are asmcheck directives for the return line.
// Each one lists opcodes that must appear in the assembly generated for the
// named architecture.
func Mul(x, y uint) (hi, lo uint) {
	// amd64:"MULQ"
	// arm64:"UMULH","MUL"
	// ppc64:"MULHDU","MULLD"
	// ppc64le:"MULHDU","MULLD"
	// s390x:"MLGR"
	// mips64: "MULVU"
	return bits.Mul(x, y)
}
					
						
							|  |  |  | 
 | 
					
						
// Mul64 is a codegen test case for bits.Mul64. It returns the result of
// bits.Mul64(x, y) unchanged as the pair (hi, lo).
//
// The comments inside the body are asmcheck directives for the return line.
// Each one lists opcodes that must appear in the assembly generated for the
// named architecture.
func Mul64(x, y uint64) (hi, lo uint64) {
	// amd64:"MULQ"
	// arm64:"UMULH","MUL"
	// ppc64:"MULHDU","MULLD"
	// ppc64le:"MULHDU","MULLD"
	// s390x:"MLGR"
	// mips64: "MULVU"
	return bits.Mul64(x, y)
}
					
						
							| 
									
										
										
										
											2018-10-23 20:54:56 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | // --------------- // | 
					
						
							|  |  |  | //    bits.Div*    // | 
					
						
							|  |  |  | // --------------- // | 
					
						
							|  |  |  | 
 | 
					
						
// Div is a codegen test case for bits.Div on uint operands. It passes the
// two-word dividend (hi, lo) and the divisor x straight to bits.Div and
// returns its result as the pair (q, r).
//
// The asmcheck directive in the body requires the DIVQ opcode in the amd64
// assembly generated for the return line.
func Div(hi, lo, x uint) (q, r uint) {
	// amd64:"DIVQ"
	return bits.Div(hi, lo, x)
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												cmd/compile: optimize math/bits.Div32 for arm64
Benchmark:
name     old time/op  new time/op  delta
Div-8    22.0ns ± 0%  22.0ns ± 0%     ~     (all equal)
Div32-8  6.51ns ± 0%  3.00ns ± 0%  -53.90%  (p=0.000 n=10+8)
Div64-8  22.5ns ± 0%  22.5ns ± 0%     ~     (all equal)
Code:
func div32(hi, lo, y uint32) (q, r uint32) {return bits.Div32(hi, lo, y)}
Before:
        0x0020 00032 (test.go:24)       MOVWU   "".y+8(FP), R0
        0x0024 00036 ($GOROOT/src/math/bits/bits.go:472)        CBZW    R0, 132
        0x0028 00040 ($GOROOT/src/math/bits/bits.go:472)        MOVWU   "".hi(FP), R1
        0x002c 00044 ($GOROOT/src/math/bits/bits.go:472)        CMPW    R1, R0
        0x0030 00048 ($GOROOT/src/math/bits/bits.go:472)        BLS     96
        0x0034 00052 ($GOROOT/src/math/bits/bits.go:475)        MOVWU   "".lo+4(FP), R2
        0x0038 00056 ($GOROOT/src/math/bits/bits.go:475)        ORR     R1<<32, R2, R1
        0x003c 00060 ($GOROOT/src/math/bits/bits.go:476)        CBZ     R0, 140
        0x0040 00064 ($GOROOT/src/math/bits/bits.go:476)        UDIV    R0, R1, R2
        0x0044 00068 (test.go:24)       MOVW    R2, "".q+16(FP)
        0x0048 00072 ($GOROOT/src/math/bits/bits.go:476)        UREM    R0, R1, R0
        0x0050 00080 (test.go:24)       MOVW    R0, "".r+20(FP)
        0x0054 00084 (test.go:24)       MOVD    -8(RSP), R29
        0x0058 00088 (test.go:24)       MOVD.P  32(RSP), R30
        0x005c 00092 (test.go:24)       RET     (R30)
After:
        0x001c 00028 (test.go:24)       MOVWU   "".y+8(FP), R0
        0x0020 00032 (test.go:24)       CBZW    R0, 92
        0x0024 00036 (test.go:24)       MOVWU   "".hi(FP), R1
        0x0028 00040 (test.go:24)       CMPW    R0, R1
        0x002c 00044 (test.go:24)       BHS     84
        0x0030 00048 (test.go:24)       MOVWU   "".lo+4(FP), R2
        0x0034 00052 (test.go:24)       ORR     R1<<32, R2, R4
        0x0038 00056 (test.go:24)       UDIV    R0, R4, R3
        0x003c 00060 (test.go:24)       MSUB    R3, R4, R0, R4
        0x0040 00064 (test.go:24)       MOVW    R3, "".q+16(FP)
        0x0044 00068 (test.go:24)       MOVW    R4, "".r+20(FP)
        0x0048 00072 (test.go:24)       MOVD    -8(RSP), R29
        0x004c 00076 (test.go:24)       MOVD.P  16(RSP), R30
        0x0050 00080 (test.go:24)       RET     (R30)
UREM instruction in the previous assembly code will be converted to UDIV and MSUB instructions
on arm64. However the UDIV instruction in UREM is unnecessary, because it's a duplicate of the
previous UDIV. This CL adds a rule to have this extra UDIV instruction removed by CSE.
Change-Id: Ie2508784320020b2de022806d09f75a7871bb3d7
Reviewed-on: https://go-review.googlesource.com/c/159577
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Bryan C. Mills <bcmills@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-01-22 09:10:59 +00:00
// Div32 is a codegen test case for bits.Div32. It passes the two-word
// dividend (hi, lo) and the divisor x straight to bits.Div32 and returns its
// result as the pair (q, r).
//
// The asmcheck directive in the body applies to arm64. It requires the ORR,
// UDIV and MSUB opcodes and forbids UREM in the assembly generated for the
// return line.
func Div32(hi, lo, x uint32) (q, r uint32) {
	// arm64:"ORR","UDIV","MSUB",-"UREM"
	return bits.Div32(hi, lo, x)
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-23 20:54:56 -06:00
// Div64 is a codegen test case for bits.Div64. It passes the two-word
// dividend (hi, lo) and the divisor x straight to bits.Div64 and returns its
// result as the pair (q, r).
//
// The asmcheck directive in the body requires the DIVQ opcode in the amd64
// assembly generated for the return line.
func Div64(hi, lo, x uint64) (q, r uint64) {
	// amd64:"DIVQ"
	return bits.Div64(hi, lo, x)
}
					
						
							| 
									
										
										
										
											2019-04-20 11:09:34 -07:00
										 |  |  | 
 | 
					
						
// Div64degenerate is a codegen test case for bits.Div64 called with a high
// dividend word of constant 0 and a constant divisor of 5. It returns the
// result as the pair (q, r).
//
// The asmcheck directive in the body forbids the DIVQ opcode in the amd64
// assembly generated for the return line.
// NOTE(review): presumably the zero high word lets the compiler treat this
// as an ordinary 64-bit division by a constant; confirm against the
// compiler's rewrite rules.
func Div64degenerate(x uint64) (q, r uint64) {
	// amd64:-"DIVQ"
	return bits.Div64(0, x, 5)
}