internal/bytealg: move short string Index implementations into bytealg

Also move the arm64 CountByte implementation while we're here. Fixes #19792 Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e Reviewed-on: https://go-review.googlesource.com/98518 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
2025-12-08 06:10:04 +00:00 · 2018-03-04 09:47:47 -08:00 · 2018-03-04 09:47:47 -08:00 · ee58eccc56
commit ee58eccc56
parent f6332bb84a
27 changed files with 932 additions and 1123 deletions
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@ -829,6 +829,92 @@ func EqualFold(s, t []byte) bool {
 	return len(s) == len(t)
 }
 // Index reports the offset of the first occurrence of sep within s.
 // It returns -1 when sep does not occur in s.
 func Index(s, sep []byte) int {
 	n := len(sep)
 	// Degenerate separator lengths are answered without a scan.
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, sep[0])
 	}
 	if n > len(s) {
 		return -1
 	}
 	if n == len(s) {
 		if Equal(sep, s) {
 			return 0
 		}
 		return -1
 	}
 	if n <= bytealg.MaxLen {
 		// Both s and sep are small: brute force via bytealg.Index.
 		if len(s) <= bytealg.MaxBruteForce {
 			return bytealg.Index(s, sep)
 		}
 		first := sep[0]
 		// Only offsets inside window leave room for all of sep.
 		window := s[:len(s)-n+1]
 		misses := 0
 		for pos := 0; pos < len(window); {
 			if window[pos] != first {
 				// IndexByte outpaces bytealg.Index, so keep using it
 				// while it is not yielding lots of false positives.
 				skip := IndexByte(window[pos:], first)
 				if skip < 0 {
 					return -1
 				}
 				pos += skip
 			}
 			if Equal(s[pos:pos+n], sep) {
 				return pos
 			}
 			pos++
 			misses++
 			// Too many false positives from IndexByte: let
 			// bytealg.Index search the remainder instead.
 			if misses > bytealg.Cutover(pos) {
 				if rest := bytealg.Index(s[pos:], sep); rest >= 0 {
 					return pos + rest
 				}
 				return -1
 			}
 		}
 		return -1
 	}
 	first := sep[0]
 	window := s[:len(s)-n+1]
 	misses := 0
 	for pos := 0; pos < len(window); {
 		if window[pos] != first {
 			skip := IndexByte(window[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if Equal(s[pos:pos+n], sep) {
 			return pos
 		}
 		pos++
 		misses++
 		if pos < len(window) && misses >= 4+pos>>4 {
 			// IndexByte is no longer skipping far enough ahead to
 			// beat Rabin-Karp, so stop using it.
 			// Experiments (using IndexPeriodic) suggest
 			// the cutover is about 16 byte skips.
 			// TODO: when large prefixes of sep keep matching, the
 			// cutover should happen at even larger average skips,
 			// because Equal becomes that much more expensive.
 			// This code does not take that effect into account.
 			if rest := indexRabinKarp(s[pos:], sep); rest >= 0 {
 				return pos + rest
 			}
 			return -1
 		}
 	}
 	return -1
 }
 func indexRabinKarp(s, sep []byte) int {
 	// Rabin-Karp search
 	hashsep, pow := hashStr(sep)
--- a/src/bytes/bytes_amd64.go
+++ b/src/bytes/bytes_amd64.go
@ -1,79 +0,0 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package bytes
 import "internal/cpu"
 //go:noescape
 // indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
 // indexShortStr requires 2 <= len(c) <= shortStringLen
 func indexShortStr(s, c []byte) int  // ../runtime/asm_amd64.s
 func countByte(s []byte, c byte) int // ../runtime/asm_amd64.s
 var shortStringLen int
 // init chooses the longest separator handed to indexShortStr:
 // 63 bytes when the CPU reports AVX2, 31 bytes otherwise.
 func init() {
 	limit := 31
 	if cpu.X86.HasAVX2 {
 		limit = 63
 	}
 	shortStringLen = limit
 }
 // Index reports the offset of the first occurrence of sep within s,
 // or -1 when sep does not occur in s.
 func Index(s, sep []byte) int {
 	n := len(sep)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, sep[0])
 	}
 	if n > len(s) {
 		return -1
 	}
 	if n == len(s) {
 		if Equal(sep, s) {
 			return 0
 		}
 		return -1
 	}
 	if n > shortStringLen {
 		return indexRabinKarp(s, sep)
 	}
 	// Both s and sep are small: brute force with indexShortStr.
 	if len(s) <= 64 {
 		return indexShortStr(s, sep)
 	}
 	first := sep[0]
 	// Only offsets inside window leave room for all of sep.
 	window := s[:len(s)-n+1]
 	misses := 0
 	for pos := 0; pos < len(window); {
 		if window[pos] != first {
 			// IndexByte skips 16/32 bytes per iteration,
 			// which makes it faster than indexShortStr.
 			skip := IndexByte(window[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if Equal(s[pos:pos+n], sep) {
 			return pos
 		}
 		pos++
 		misses++
 		// Hand over to indexShortStr once IndexByte yields too many
 		// false positives, i.e. more than 1 error per 8 characters.
 		// The +16 allows some errors in the beginning.
 		if misses > (pos+16)/8 {
 			if rest := indexShortStr(s[pos:], sep); rest >= 0 {
 				return pos + rest
 			}
 			return -1
 		}
 	}
 	return -1
 }
--- a/src/bytes/bytes_arm64.go
+++ b/src/bytes/bytes_arm64.go
@ -1,72 +0,0 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package bytes
 func countByte(s []byte, c byte) int // bytes_arm64.s
 // 8 bytes can be completely loaded into 1 register.
 const shortStringLen = 8
 //go:noescape
 func indexShortStr(s, sep []byte) int
 // Index reports the offset of the first occurrence of sep within s,
 // or -1 when sep does not occur in s.
 func Index(s, sep []byte) int {
 	n := len(sep)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, sep[0])
 	}
 	if n > len(s) {
 		return -1
 	}
 	if n == len(s) {
 		if Equal(sep, s) {
 			return 0
 		}
 		return -1
 	}
 	// Brute force when both s and sep are small.
 	// Empirical data shows that this performs better
 	// when len(s) <= 16.
 	if n <= shortStringLen && len(s) <= 16 {
 		return indexShortStr(s, sep)
 	}
 	first := sep[0]
 	// Only offsets inside window leave room for all of sep.
 	window := s[:len(s)-n+1]
 	misses := 0
 	for pos := 0; pos < len(window); {
 		if window[pos] != first {
 			skip := IndexByte(window[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if Equal(s[pos:pos+n], sep) {
 			return pos
 		}
 		pos++
 		misses++
 		if pos < len(window) && misses >= 4+pos>>4 {
 			// IndexByte is no longer skipping far enough ahead to
 			// beat Rabin-Karp, so stop using it.
 			// Experiments (using IndexPeriodic) suggest
 			// the cutover is about 16 byte skips.
 			// TODO: when large prefixes of sep keep matching, the
 			// cutover should happen at even larger average skips,
 			// because Equal becomes that much more expensive.
 			// This code does not take that effect into account.
 			if rest := indexRabinKarp(s[pos:], sep); rest >= 0 {
 				return pos + rest
 			}
 			return -1
 		}
 	}
 	return -1
 }
--- a/src/bytes/bytes_generic.go
+++ b/src/bytes/bytes_generic.go
@ -1,59 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64,!s390x,!arm64
 package bytes
 // Index reports the offset of the first occurrence of sep within s,
 // or -1 when sep does not occur in s.
 func Index(s, sep []byte) int {
 	n := len(sep)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, sep[0])
 	}
 	if n > len(s) {
 		return -1
 	}
 	if n == len(s) {
 		if Equal(sep, s) {
 			return 0
 		}
 		return -1
 	}
 	first := sep[0]
 	// Only offsets inside window leave room for all of sep.
 	window := s[:len(s)-n+1]
 	misses := 0
 	for pos := 0; pos < len(window); {
 		if window[pos] != first {
 			skip := IndexByte(window[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if Equal(s[pos:pos+n], sep) {
 			return pos
 		}
 		pos++
 		misses++
 		if pos < len(window) && misses >= 4+pos>>4 {
 			// IndexByte is no longer skipping far enough ahead to
 			// beat Rabin-Karp, so stop using it.
 			// Experiments (using IndexPeriodic) suggest
 			// the cutover is about 16 byte skips.
 			// TODO: when large prefixes of sep keep matching, the
 			// cutover should happen at even larger average skips,
 			// because Equal becomes that much more expensive.
 			// This code does not take that effect into account.
 			if rest := indexRabinKarp(s[pos:], sep); rest >= 0 {
 				return pos + rest
 			}
 			return -1
 		}
 	}
 	return -1
 }
--- a/src/bytes/bytes_s390x.go
+++ b/src/bytes/bytes_s390x.go
@ -1,80 +0,0 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package bytes
 //go:noescape
 // indexShortStr returns the index of the first instance of sep in s,
 // or -1 if sep is not present in s.
 // indexShortStr requires 2 <= len(sep) <= shortStringLen
 func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s
 // supportsVX reports whether the vector facility is available.
 // indexShortStr must not be called if the vector facility is not
 // available.
 func supportsVX() bool // ../runtime/asm_s390x.s
 var shortStringLen = -1
 // init sets shortStringLen to 64 when supportsVX reports that the vector
 // facility is available. Otherwise shortStringLen is left unchanged, so
 // Index never takes its indexShortStr branch for separators of length >= 2.
 func init() {
 	if supportsVX() {
 		shortStringLen = 64
 	}
 }
 // Index reports the offset of the first occurrence of sep within s,
 // or -1 when sep does not occur in s.
 func Index(s, sep []byte) int {
 	n := len(sep)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, sep[0])
 	}
 	if n > len(s) {
 		return -1
 	}
 	if n == len(s) {
 		if Equal(sep, s) {
 			return 0
 		}
 		return -1
 	}
 	if n > shortStringLen {
 		return indexRabinKarp(s, sep)
 	}
 	// Both s and sep are small: brute force with indexShortStr.
 	if len(s) <= 64 {
 		return indexShortStr(s, sep)
 	}
 	first := sep[0]
 	// Only offsets inside window leave room for all of sep.
 	window := s[:len(s)-n+1]
 	misses := 0
 	for pos := 0; pos < len(window); {
 		if window[pos] != first {
 			// IndexByte skips 16/32 bytes per iteration,
 			// which makes it faster than indexShortStr.
 			skip := IndexByte(window[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if Equal(s[pos:pos+n], sep) {
 			return pos
 		}
 		pos++
 		misses++
 		// Hand over to indexShortStr once IndexByte yields too many
 		// false positives, i.e. more than 1 error per 8 characters.
 		// The +16 allows some errors in the beginning.
 		if misses > (pos+16)/8 {
 			if rest := indexShortStr(s[pos:], sep); rest >= 0 {
 				return pos + rest
 			}
 			return -1
 		}
 	}
 	return -1
 }
--- a/src/cmd/vet/all/whitelist/amd64.txt
+++ b/src/cmd/vet/all/whitelist/amd64.txt
@ -1,20 +1,16 @@
 // amd64-specific vet whitelist. See readme.txt for details.
 internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
 internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
 // False positives.
 // Nothing much to do about cross-package assembly. Unfortunate.
 internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
 internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
 // reflect trampolines intentionally omit arg size. Same for morestack.
 runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
 runtime/asm_amd64.s: [amd64] morestack: use of 16(SP) points beyond argument frame
 runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
 // Nothing much to do about cross-package assembly. Unfortunate.
 runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package strings
 runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
 // Intentionally missing declarations. These are special assembly routines.
 // Some are jumped into from other routines, with values in specific registers.
 // duff* have direct calls from the compiler.
@ -25,4 +21,3 @@ runtime/asm_amd64.s: [amd64] addmoduledata: function addmoduledata missing Go de
 runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration
 runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
 runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration
 runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration
--- a/src/cmd/vet/all/whitelist/s390x.txt
+++ b/src/cmd/vet/all/whitelist/s390x.txt
@ -1,11 +1,6 @@
 runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration
 internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes
 internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: cmpstring is in package runtime
 runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package strings
 runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package bytes
 runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package strings
 runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package bytes
 runtime/asm_s390x.s: [s390x] indexShortStr: function indexShortStr missing Go declaration
 runtime/asm_s390x.s: [s390x] addmoduledata: function addmoduledata missing Go declaration
 runtime/memclr_s390x.s: [s390x] memclr_s390x_exrl_xc: function memclr_s390x_exrl_xc missing Go declaration
 runtime/memmove_s390x.s: [s390x] memmove_s390x_exrl_mvc: function memmove_s390x_exrl_mvc missing Go declaration
--- a/src/internal/bytealg/bytealg.go
+++ b/src/internal/bytealg/bytealg.go
@ -0,0 +1,22 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package bytealg
 import (
 	"internal/cpu"
 	"unsafe"
 )
 // Offsets into internal/cpu records for use in assembly.
 const (
 	x86_HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
 	x86_HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
 	x86_HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
 	x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
 	s390x_HasVX   = unsafe.Offsetof(cpu.S390X.HasVX)
 )
 // MaxLen is the maximum length of the string to be searched for (argument b) in Index.
 // It is declared without an initializer, so it stays 0 unless an
 // architecture-specific init function assigns it.
 var MaxLen int
--- a/src/internal/bytealg/count_arm64.s
+++ b/src/internal/bytealg/count_arm64.s
@ -0,0 +1,90 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 #include "go_asm.h"
 #include "textflag.h"
 // Count stages its arguments in the registers countbytebody<> expects
 // (R0 = data pointer, R2 = data length, R1 = byte to count, R8 = address
 // of the result slot) and then branches to it.
 // NOTE(review): the b_base/b_len/c/ret names suggest a Go signature of
 // func Count(b []byte, c byte) int; the declaration is not visible here.
 TEXT ·Count(SB),NOSPLIT,$0-40
 	MOVD	b_base+0(FP), R0
 	MOVD	b_len+8(FP), R2
 	MOVBU	c+24(FP), R1
 	// Pass the address of ret so the shared body can store the result.
 	MOVD	$ret+32(FP), R8
 	B	countbytebody<>(SB)
 // CountString is the string-argument entry point. It stages its arguments
 // in the registers countbytebody<> expects (R0 = data pointer, R2 = data
 // length, R1 = byte to count, R8 = address of the result slot) and then
 // branches to it. The byte and result sit at lower frame offsets than in
 // Count.
 TEXT ·CountString(SB),NOSPLIT,$0-32
 	MOVD	s_base+0(FP), R0
 	MOVD	s_len+8(FP), R2
 	MOVBU	c+16(FP), R1
 	// Pass the address of ret so the shared body can store the result.
 	MOVD	$ret+24(FP), R8
 	B	countbytebody<>(SB)
 // countbytebody counts how many of the R2 bytes starting at R0 equal the
 // byte in R1 and stores the count through R8.
 // Inputs shorter than 32 bytes are handled entirely by the byte-wise tail
 // loop. Longer inputs are processed as a byte-wise head up to the next
 // 32-byte boundary, then 32-byte vector chunks, then a byte-wise tail.
 // input:
 //   R0: data
 //   R2: data len
 //   R1: byte to find
 //   R8: address to put result
 TEXT countbytebody<>(SB),NOSPLIT,$0
 	// R11 = count of byte to search
 	MOVD	$0, R11
 	// short path to handle 0-byte case
 	CBZ	R2, done
 	CMP	$0x20, R2
 	// jump directly to tail if length < 32
 	BLO	tail
 	// R9 = R0 & 0x1f; zero means R0 is already 32-byte aligned
 	ANDS	$0x1f, R0, R9
 	BEQ	chunk
 	// Work with not 32-byte aligned head
 	// R3 = (R0 &^ 0x1f) + 0x20, the next 32-byte boundary above R0
 	BIC	$0x1f, R0, R3
 	ADD	$0x20, R3
 head_loop:
 	// Load one byte, advance R0, and bump the count on a match
 	MOVBU.P	1(R0), R5
 	CMP	R5, R1
 	CINC	EQ, R11, R11
 	SUB	$1, R2, R2
 	CMP	R0, R3
 	BNE	head_loop
 	// Work with 32-byte aligned chunks
 chunk:
 	// R9 = remaining length rounded down to a multiple of 32
 	BIC	$0x1f, R2, R9
 	// The first chunk can also be the last
 	CBZ	R9, tail
 	// R3 = end of 32-byte chunks
 	ADD	R0, R9, R3
 	// V5 = 0x01 in every byte lane, used below to reduce compare masks to 0/1
 	MOVD	$1, R5
 	VMOV	R5, V5.B16
 	// R2 = length of tail
 	SUB	R9, R2, R2
 	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
 	VMOV	R1, V0.B16
 	// Clear the low 64-bit element of V7 and V8
 	VEOR	V7.B8, V7.B8, V7.B8
 	VEOR	V8.B8, V8.B8, V8.B8
 	// Count the target byte in 32-byte chunk
 chunk_loop:
 	VLD1.P	(R0), [V1.B16, V2.B16]
 	// The flags set here are consumed by the BNE at the bottom of the loop.
 	// NOTE(review): relies on the vector instructions in between leaving
 	// the condition flags untouched — confirm.
 	CMP	R0, R3
 	VCMEQ	V0.B16, V1.B16, V3.B16
 	VCMEQ	V0.B16, V2.B16, V4.B16
 	// Clear the higher 7 bits
 	VAND	V5.B16, V3.B16, V3.B16
 	VAND	V5.B16, V4.B16, V4.B16
 	// Count lanes match the requested byte
 	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
 	VUADDLV	V6.B16, V7
 	// Accumulate the count in low 64-bit element of V8 when inside the loop
 	VADD	V7, V8
 	BNE	chunk_loop
 	// Fold the vector accumulator into the scalar count
 	VMOV	V8.D[0], R6
 	ADD	R6, R11, R11
 	CBZ	R2, done
 tail:
 	// Work with tail shorter than 32 bytes
 	MOVBU.P	1(R0), R5
 	SUB	$1, R2, R2
 	CMP	R5, R1
 	CINC	EQ, R11, R11
 	CBNZ	R2, tail
 done:
 	// Store the final count through the result address supplied in R8
 	MOVD	R11, (R8)
 	RET
--- a/src/internal/bytealg/count_generic.go
+++ b/src/internal/bytealg/count_generic.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !amd64
+// +build !amd64,!arm64
 package bytealg
--- a/src/internal/bytealg/count_native.go
+++ b/src/internal/bytealg/count_native.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build amd64
+// +build amd64 arm64
 package bytealg
--- a/src/internal/bytealg/equal_native.go
+++ b/src/internal/bytealg/equal_native.go
@ -4,24 +4,8 @@
 package bytealg
 import (
 	"internal/cpu"
 	"unsafe"
 )
 // Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.
 // Because equal_native.go is unconditional, it's a good place to compute asm constants.
 // TODO: find a better way to do this?
 // Offsets into internal/cpu records for use in assembly.
 const (
 	x86_HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
 	x86_HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
 	x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
 	s390x_HasVX   = unsafe.Offsetof(cpu.S390X.HasVX)
 )
 //go:noescape
 func Equal(a, b []byte) bool
--- a/src/internal/bytealg/index_amd64.go
+++ b/src/internal/bytealg/index_amd64.go
@ -0,0 +1,26 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package bytealg
 import "internal/cpu"
 const MaxBruteForce = 64
 // init chooses MaxLen for this CPU: 63 when AVX2 is reported,
 // 31 otherwise.
 func init() {
 	limit := 31
 	if cpu.X86.HasAVX2 {
 		limit = 63
 	}
 	MaxLen = limit
 }
 // Cutover returns how many IndexByte failures are tolerated before
 // the caller should switch over to Index.
 // n is the number of bytes processed so far.
 // See the bytes.Index implementation for details.
 func Cutover(n int) int {
 	// One error per stride characters, with slop extra characters
 	// credited up front so the first few failures are free.
 	const (
 		slop   = 16
 		stride = 8
 	)
 	return (n + slop) / stride
 }
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@ -0,0 +1,274 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 #include "go_asm.h"
 #include "textflag.h"
 // Index is the []byte entry point. It loads the haystack a into DI/DX
 // (pointer, length) and the needle b into BP/AX (pointer, length), as
 // indexbody<> expects, then jumps to it.
 TEXT ·Index(SB),NOSPLIT,$0-56
 	MOVQ a_base+0(FP), DI
 	MOVQ a_len+8(FP), DX
 	MOVQ b_base+24(FP), BP
 	MOVQ b_len+32(FP), AX
 	// R10 keeps the haystack start so a match pointer can be turned into an index.
 	MOVQ DI, R10
 	// R11 = address of the result slot.
 	LEAQ ret+48(FP), R11
 	JMP  indexbody<>(SB)
 // IndexString is the string entry point. It loads the haystack a into
 // DI/DX (pointer, length) and the needle b into BP/AX (pointer, length),
 // as indexbody<> expects, then jumps to it. The needle and result sit at
 // lower frame offsets than in Index.
 TEXT ·IndexString(SB),NOSPLIT,$0-40
 	MOVQ a_base+0(FP), DI
 	MOVQ a_len+8(FP), DX
 	MOVQ b_base+16(FP), BP
 	MOVQ b_len+24(FP), AX
 	// R10 keeps the haystack start so a match pointer can be turned into an index.
 	MOVQ DI, R10
 	// R11 = address of the result slot.
 	LEAQ ret+32(FP), R11
 	JMP  indexbody<>(SB)
 // AX: length of string, that we are searching for
 // DX: length of string, in which we are searching
 // DI: pointer to string, in which we are searching
 // BP: pointer to string, that we are searching for
 // R11: address, where to put return value
 // Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
 TEXT indexbody<>(SB),NOSPLIT,$0
 	CMPQ AX, DX
 	JA fail
 	CMPQ DX, $16
 	JAE sse42
 no_sse42:
 	CMPQ AX, $2
 	JA   _3_or_more
 	MOVW (BP), BP
 	LEAQ -1(DI)(DX*1), DX
 loop2:
 	MOVW (DI), SI
 	CMPW SI,BP
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop2
 	JMP fail
 _3_or_more:
 	CMPQ AX, $3
 	JA   _4_or_more
 	MOVW 1(BP), BX
 	MOVW (BP), BP
 	LEAQ -2(DI)(DX*1), DX
 loop3:
 	MOVW (DI), SI
 	CMPW SI,BP
 	JZ   partial_success3
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop3
 	JMP fail
 partial_success3:
 	MOVW 1(DI), SI
 	CMPW SI,BX
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop3
 	JMP fail
 _4_or_more:
 	CMPQ AX, $4
 	JA   _5_or_more
 	MOVL (BP), BP
 	LEAQ -3(DI)(DX*1), DX
 loop4:
 	MOVL (DI), SI
 	CMPL SI,BP
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop4
 	JMP fail
 _5_or_more:
 	CMPQ AX, $7
 	JA   _8_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	MOVL -4(BP)(AX*1), BX
 	MOVL (BP), BP
 loop5to7:
 	MOVL (DI), SI
 	CMPL SI,BP
 	JZ   partial_success5to7
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop5to7
 	JMP fail
 partial_success5to7:
 	MOVL -4(AX)(DI*1), SI
 	CMPL SI,BX
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop5to7
 	JMP fail
 _8_or_more:
 	CMPQ AX, $8
 	JA   _9_or_more
 	MOVQ (BP), BP
 	LEAQ -7(DI)(DX*1), DX
 loop8:
 	MOVQ (DI), SI
 	CMPQ SI,BP
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop8
 	JMP fail
 _9_or_more:
 	CMPQ AX, $15
 	JA   _16_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	MOVQ -8(BP)(AX*1), BX
 	MOVQ (BP), BP
 loop9to15:
 	MOVQ (DI), SI
 	CMPQ SI,BP
 	JZ   partial_success9to15
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop9to15
 	JMP fail
 partial_success9to15:
 	MOVQ -8(AX)(DI*1), SI
 	CMPQ SI,BX
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop9to15
 	JMP fail
 _16_or_more:
 	CMPQ AX, $16
 	JA   _17_or_more
 	MOVOU (BP), X1
 	LEAQ -15(DI)(DX*1), DX
 loop16:
 	MOVOU (DI), X2
 	PCMPEQB X1, X2
 	PMOVMSKB X2, SI
 	CMPQ  SI, $0xffff
 	JE   success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop16
 	JMP fail
 _17_or_more:
 	CMPQ AX, $31
 	JA   _32_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	MOVOU -16(BP)(AX*1), X0
 	MOVOU (BP), X1
 loop17to31:
 	MOVOU (DI), X2
 	PCMPEQB X1,X2
 	PMOVMSKB X2, SI
 	CMPQ  SI, $0xffff
 	JE   partial_success17to31
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop17to31
 	JMP fail
 partial_success17to31:
 	MOVOU -16(AX)(DI*1), X3
 	PCMPEQB X0, X3
 	PMOVMSKB X3, SI
 	CMPQ  SI, $0xffff
 	JE success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop17to31
 	JMP fail
 // We can get here only when AVX2 is enabled and MaxLen is set to 63,
 // so there is no need to check cpuid.
 _32_or_more:
 	CMPQ AX, $32
 	JA   _33_to_63
 	VMOVDQU (BP), Y1
 	LEAQ -31(DI)(DX*1), DX
 loop32:
 	VMOVDQU (DI), Y2
 	VPCMPEQB Y1, Y2, Y3
 	VPMOVMSKB Y3, SI
 	CMPL  SI, $0xffffffff
 	JE   success_avx2
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop32
 	JMP fail_avx2
 _33_to_63:
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	VMOVDQU -32(BP)(AX*1), Y0
 	VMOVDQU (BP), Y1
 loop33to63:
 	VMOVDQU (DI), Y2
 	VPCMPEQB Y1, Y2, Y3
 	VPMOVMSKB Y3, SI
 	CMPL  SI, $0xffffffff
 	JE   partial_success33to63
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop33to63
 	JMP fail_avx2
 partial_success33to63:
 	VMOVDQU -32(AX)(DI*1), Y3
 	VPCMPEQB Y0, Y3, Y4
 	VPMOVMSKB Y4, SI
 	CMPL  SI, $0xffffffff
 	JE success_avx2
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop33to63
 fail_avx2:
 	VZEROUPPER
 fail:
 	MOVQ $-1, (R11)
 	RET
 success_avx2:
 	VZEROUPPER
 	JMP success
 sse42:
 	CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1
 	JNE no_sse42
 	CMPQ AX, $12
 	// PCMPESTRI is slower than normal compare,
 	// so using it makes sense only if we advance 4+ bytes per compare
 	// This value was determined experimentally and is the ~same
 	// on Nehalem (first with SSE42) and Haswell.
 	JAE _9_or_more
 	LEAQ 16(BP), SI
 	TESTW $0xff0, SI
 	JEQ no_sse42
 	MOVOU (BP), X1
 	LEAQ -15(DI)(DX*1), SI
 	MOVQ $16, R9
 	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
 loop_sse42:
 	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
 	// for equality (bits 2,3 are 11)
 	// result is not masked or inverted (bits 4,5 are 00)
 	// and corresponds to first matching byte (bit 6 is 0)
 	PCMPESTRI $0x0c, (DI), X1
 	// CX == 16 means no match,
 	// CX > R9 means partial match at the end of the string,
 	// otherwise sep is at offset CX from X1 start
 	CMPQ CX, R9
 	JBE sse42_success
 	ADDQ R9, DI
 	CMPQ DI, SI
 	JB loop_sse42
 	PCMPESTRI $0x0c, -1(SI), X1
 	CMPQ CX, R9
 	JA fail
 	LEAQ -1(SI), DI
 sse42_success:
 	ADDQ CX, DI
 success:
 	SUBQ R10, DI
 	MOVQ DI, (R11)
 	RET
--- a/src/internal/bytealg/index_arm64.go
+++ b/src/internal/bytealg/index_arm64.go
@ -0,0 +1,23 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package bytealg
 // Empirical data shows that using Index can get better
 // performance when len(s) <= 16.
 const MaxBruteForce = 16
 func init() {
 	// 8 bytes can be completely loaded into 1 register.
 	MaxLen = 8
 }
 // Cutover returns how many IndexByte failures are tolerated before
 // the caller should switch over to Index.
 // n is the number of bytes processed so far.
 // See the bytes.Index implementation for details.
 func Cutover(n int) int {
 	// One error per 16 characters, plus a base allowance of a few
 	// failures to start.
 	const base = 4
 	return n>>4 + base
 }
--- a/src/internal/bytealg/index_arm64.s
+++ b/src/internal/bytealg/index_arm64.s
@ -1,88 +1,40 @@
-// Copyright 2017 The Go Authors. All rights reserved.
+// Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 #include "go_asm.h"
 #include "textflag.h"
-// countByte(s []byte, c byte) int
+TEXT ·Index(SB),NOSPLIT,$0-56
-TEXT bytes·countByte(SB),NOSPLIT,$0-40
+	MOVD	a_base+0(FP), R0
-	MOVD	s_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
-	MOVD	s_len+8(FP), R2
+	MOVD	b_base+24(FP), R2
-	MOVBU	c+24(FP), R1
+	MOVD	b_len+32(FP), R3
-	// R11 = count of byte to search
+	MOVD	$ret+48(FP), R9
-	MOVD	$0, R11
+	B	indexbody<>(SB)
 	// short path to handle 0-byte case
 	CBZ	R2, done
 	CMP	$0x20, R2
 	// jump directly to tail if length < 32
 	BLO	tail
 	ANDS	$0x1f, R0, R9
 	BEQ	chunk
 	// Work with not 32-byte aligned head
 	BIC	$0x1f, R0, R3
 	ADD	$0x20, R3
 head_loop:
 	MOVBU.P	1(R0), R5
 	CMP	R5, R1
 	CINC	EQ, R11, R11
 	SUB	$1, R2, R2
 	CMP	R0, R3
 	BNE	head_loop
 	// Work with 32-byte aligned chunks
 chunk:
 	BIC	$0x1f, R2, R9
 	// The first chunk can also be the last
 	CBZ	R9, tail
 	// R3 = end of 32-byte chunks
 	ADD	R0, R9, R3
 	MOVD	$1, R5
 	VMOV	R5, V5.B16
 	// R2 = length of tail
 	SUB	R9, R2, R2
 	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
 	VMOV	R1, V0.B16
 	// Clear the low 64-bit element of V7 and V8
 	VEOR	V7.B8, V7.B8, V7.B8
 	VEOR	V8.B8, V8.B8, V8.B8
 	// Count the target byte in 32-byte chunk
 chunk_loop:
 	VLD1.P	(R0), [V1.B16, V2.B16]
 	CMP	R0, R3
 	VCMEQ	V0.B16, V1.B16, V3.B16
 	VCMEQ	V0.B16, V2.B16, V4.B16
 	// Clear the higher 7 bits
 	VAND	V5.B16, V3.B16, V3.B16
 	VAND	V5.B16, V4.B16, V4.B16
 	// Count lanes match the requested byte
 	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
 	VUADDLV	V6.B16, V7
 	// Accumulate the count in low 64-bit element of V8 when inside the loop
 	VADD	V7, V8
 	BNE	chunk_loop
 	VMOV	V8.D[0], R6
 	ADD	R6, R11, R11
 	CBZ	R2, done
 tail:
 	// Work with tail shorter than 32 bytes
 	MOVBU.P	1(R0), R5
 	SUB	$1, R2, R2
 	CMP	R5, R1
 	CINC	EQ, R11, R11
 	CBNZ	R2, tail
 done:
 	MOVD	R11, ret+32(FP)
 	RET
-// indexShortStr(s, sep []byte) int
+TEXT ·IndexString(SB),NOSPLIT,$0-40
-// precondition: 2 <= len(sep) <= 8
+	MOVD	a_base+0(FP), R0
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
+	MOVD	a_len+8(FP), R1
 	MOVD	b_base+16(FP), R2
 	MOVD	b_len+24(FP), R3
 	MOVD	$ret+32(FP), R9
 	B	indexbody<>(SB)
 // input:
 //   R0: haystack
 //   R1: length of haystack
 //   R2: needle
 //   R3: length of needle (2 <= len <= 8)
 //   R9: address to put result
 TEXT indexbody<>(SB),NOSPLIT,$0-56
 	// The main idea is to load 'sep' into separate register(s)
 	// so that it does not have to be reloaded
 	// for subsequent substring comparisons
-	MOVD	s+0(FP), R0
+	MOVD	a_base+0(FP), R0
-	MOVD	s_len+8(FP), R1
+	MOVD	a_len+8(FP), R1
-	MOVD	sep+24(FP), R2
+	MOVD	b_base+24(FP), R2
-	MOVD	sep_len+32(FP), R3
+	MOVD	b_len+32(FP), R3
 	SUB	R3, R1, R4
 	// R4 contains the start of the last substring for comparison
 	ADD	R0, R4, R4
@ -189,9 +141,9 @@ loop_2:
 	BLS	loop_2
 not_found:
 	MOVD	$-1, R0
-	MOVD	R0, ret+48(FP)
+	MOVD	R0, (R9)
 	RET
 found:
 	SUB	R8, R0, R0
-	MOVD	R0, ret+48(FP)
+	MOVD	R0, (R9)
 	RET
--- a/src/internal/bytealg/index_generic.go
+++ b/src/internal/bytealg/index_generic.go
@ -0,0 +1,29 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64,!arm64,!s390x
 package bytealg
 const MaxBruteForce = 0
 // Index returns the index of the first instance of b in a, or -1 if b is not present in a.
 // Requires 2 <= len(b) <= MaxLen.
 //
 // This fallback is built only where the build constraint on this file
 // holds, and it always panics.
 // NOTE(review): presumably MaxLen stays 0 on these platforms, so the
 // precondition can never hold and callers never reach this — confirm.
 func Index(a, b []byte) int {
 	panic("unimplemented")
 }
 // IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
 // Requires 2 <= len(b) <= MaxLen.
 //
 // This fallback is built only where the build constraint on this file
 // holds, and it always panics.
 // NOTE(review): presumably MaxLen stays 0 on these platforms, so the
 // precondition can never hold and callers never reach this — confirm.
 func IndexString(a, b string) int {
 	panic("unimplemented")
 }
 // Cutover reports the number of failures of IndexByte we should tolerate
 // before switching over to Index.
 // n is the number of bytes processed so far.
 // See the bytes.Index implementation for details.
 //
 // This fallback always panics.
 // NOTE(review): bytes.Index only calls Cutover on its n <= MaxLen path,
 // which presumably cannot be taken on these platforms — confirm.
 func Cutover(n int) int {
 	panic("unimplemented")
 }
--- a/src/internal/bytealg/index_native.go
+++ b/src/internal/bytealg/index_native.go
@ -0,0 +1,19 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build amd64 arm64 s390x
 package bytealg
 //go:noescape
 // Index returns the index of the first instance of b in a, or -1 if b is not present in a.
 // Requires 2 <= len(b) <= MaxLen.
 func Index(a, b []byte) int
 //go:noescape
 // IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
 // Requires 2 <= len(b) <= MaxLen.
 func IndexString(a, b string) int
--- a/src/internal/bytealg/index_s390x.go
+++ b/src/internal/bytealg/index_s390x.go
@ -0,0 +1,31 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package bytealg
 import "internal/cpu"
 const MaxBruteForce = 64
 // init raises MaxLen to 64 when the CPU reports the vector facility.
 // Otherwise MaxLen is left untouched here.
 func init() {
 	// Note: we're kind of lucky that this flag is available at this point.
 	// The runtime sets HasVX when processing auxv records, and that happens
 	// to happen *before* running the init functions of packages that
 	// the runtime depends on.
 	// TODO: it would really be nicer for internal/cpu to figure out this
 	// flag by itself. Then we wouldn't need to depend on quirks of
 	// early startup initialization order.
 	if cpu.S390X.HasVX {
 		MaxLen = 64
 	}
 }
 // Cutover returns how many IndexByte failures are tolerated before
 // the caller should switch over to Index.
 // n is the number of bytes processed so far.
 // See the bytes.Index implementation for details.
 func Cutover(n int) int {
 	// One error per stride characters, with slop extra characters
 	// credited up front so the first few failures are free.
 	const (
 		slop   = 16
 		stride = 8
 	)
 	return (n + slop) / stride
 }
--- a/src/internal/bytealg/index_s390x.s
+++ b/src/internal/bytealg/index_s390x.s
@ -0,0 +1,216 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 #include "go_asm.h"
 #include "textflag.h"
 // Index is the []byte entry point of the vector substring search.
 // It loads the (pointer, length) pair of each argument and the address of
 // the result slot, then branches to indexbody, which stores the result.
 // Caller must confirm availability of vx facility before calling.
 TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
 	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
 	LMG	b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
 	MOVD	$ret+48(FP), R5       // R5=&ret
 	BR	indexbody<>(SB)
 // IndexString is the string entry point of the vector substring search.
 // It loads the (pointer, length) pair of each argument and the address of
 // the result slot, then branches to indexbody, which stores the result.
 // Caller must confirm availability of vx facility before calling.
 TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
 	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
 	LMG	b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
 	MOVD	$ret+32(FP), R5       // R5=&ret
 	BR	indexbody<>(SB)
 // indexbody is the shared search routine behind Index and IndexString.
 // It writes the offset of the first occurrence of sep in s through R5,
 // or -1 when there is none.
 // s: string we are searching
 // sep: string to search for
 // R1=&s[0], R2=len(s)
 // R3=&sep[0], R4=len(sep)
 // R5=&ret (int)
 // Inside the routine R7 is the current candidate position (a pointer into s)
 // and, after the prologue, R2 is the address of the last valid start position.
 // Caller must confirm availability of vx facility before calling.
 TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
 	CMPBGT	R4, R2, notfound // len(sep) > len(s): cannot match
 	ADD	R1, R2
 	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
 	CMPBEQ	R4, $0, notfound
 	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
 	VLL	R4, (R3), V0 // contains first 16 bytes of sep
 	MOVD	R1, R7 // R7=&s[0], the first candidate position
 	// len(sep)==2 (R4==1): test 16 start positions per iteration.
 index2plus:
 	CMPBNE	R4, $1, index3plus
 	MOVD	$15(R7), R9
 	CMPBGE	R9, R2, index2to16 // fewer than 16 positions remain: use the generic loop
 	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
 	VONE	V16
 	VREPH	$0, V0, V1
 	// NOTE(review): same comparison as three instructions above; R9 and R2
 	// are not modified in between.
 	CMPBGE	R9, R2, index2to16
 index2loop:
 	VL	0(R7), V2          // 16 bytes, even indices
 	VL	1(R7), V4          // 16 bytes, odd indices
 	VCEQH	V1, V2, V5         // compare even indices
 	VCEQH	V1, V4, V6         // compare odd indices
 	VSEL	V5, V6, V31, V7    // merge even and odd indices
 	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
 	BLT	foundV17
 	MOVD	$16(R7), R7        // R7+=16
 	ADD	$15, R7, R9
 	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
 	CMPBLE	R7, R2, index2to16 // leftover positions are handled by the generic loop
 	BR	notfound
 	// len(sep)==3 (R4==2).
 index3plus:
 	CMPBNE	R4, $2, index4plus
 	ADD	$15, R7, R9
 	CMPBGE	R9, R2, index2to16
 	MOVD	$1, R0
 	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
 	VONE	V16
 	VREPH	$0, V0, V1
 	VREPB	$2, V0, V8
 index3loop:
 	VL	(R7), V2           // load 16-bytes into V2
 	VLL	R0, 16(R7), V3     // load 2-bytes into V3
 	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
 	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
 	VCEQH	V1, V2, V5         // compare 2-byte even indices
 	VCEQH	V1, V4, V6         // compare 2-byte odd indices
 	VCEQB	V8, V9, V10        // compare last bytes
 	VSEL	V5, V6, V31, V7    // merge even and odd indices
 	VN	V7, V10, V7        // AND indices with last byte
 	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
 	BLT	foundV17
 	MOVD	$16(R7), R7        // R7+=16
 	ADD	$15, R7, R9
 	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
 	CMPBLE	R7, R2, index2to16
 	BR	notfound
 	// len(sep)==4 (R4==3).
 index4plus:
 	CMPBNE	R4, $3, index5plus
 	ADD	$15, R7, R9
 	CMPBGE	R9, R2, index2to16
 	MOVD	$2, R0
 	VGBM	$0x8888, V29       // 0xff000000ff000000...
 	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
 	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
 	VONE	V16
 	VREPF	$0, V0, V1
 index4loop:
 	VL	(R7), V2           // load 16-bytes into V2
 	VLL	R0, 16(R7), V3     // load 3-bytes into V3
 	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
 	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
 	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<3
 	VCEQF	V1, V2, V5         // compare index 0, 4, ...
 	VCEQF	V1, V4, V6         // compare index 1, 5, ...
 	VCEQF	V1, V9, V11        // compare index 2, 6, ...
 	VCEQF	V1, V10, V12       // compare index 3, 7, ...
 	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
 	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
 	VSEL	V13, V14, V31, V7  // final merge
 	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
 	BLT	foundV17
 	MOVD	$16(R7), R7        // R7+=16
 	ADD	$15, R7, R9
 	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
 	CMPBLE	R7, R2, index2to16
 	BR	notfound
 	// len(sep) in 5..16 falls through to the generic loop; longer needles branch away.
 index5plus:
 	CMPBGT	R4, $15, index17plus
 	// Generic loop for len(sep) <= 16: one candidate position per compare.
 	// Also entered from the specialised loops above to finish their tails.
 index2to16:
 	CMPBGT	R7, R2, notfound
 	MOVD	$1(R7), R8
 	CMPBGT	R8, R2, index2to16tail // only one candidate position left
 index2to16loop:
 	// unrolled 2x
 	VLL	R4, (R7), V1
 	VLL	R4, 1(R7), V2
 	VCEQGS	V0, V1, V3
 	BEQ	found
 	MOVD	$1(R7), R7
 	VCEQGS	V0, V2, V4
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLT	R7, R2, index2to16loop
 	CMPBGT	R7, R2, notfound
 	// R7 == R2 here: check the final candidate position.
 index2to16tail:
 	VLL	R4, (R7), V1
 	VCEQGS	V0, V1, V2
 	BEQ	found
 	BR	notfound
 	// len(sep) in 17..32: V0 holds the first 16 bytes of sep, V1 the rest.
 index17plus:
 	CMPBGT	R4, $31, index33plus
 	SUB	$16, R4, R0
 	VLL	R0, 16(R3), V1
 	VONE	V7
 index17to32loop:
 	VL	(R7), V2
 	VLL	R0, 16(R7), V3
 	VCEQG	V0, V2, V4
 	VCEQG	V1, V3, V5
 	VN	V4, V5, V6
 	VCEQGS	V6, V7, V8
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLE  R7, R2, index17to32loop
 	BR	notfound
 	// len(sep) in 33..48: sep is held in V0, V1 and V2.
 index33plus:
 	CMPBGT	R4, $47, index49plus
 	SUB	$32, R4, R0
 	VL	16(R3), V1
 	VLL	R0, 32(R3), V2
 	VONE	V11
 index33to48loop:
 	VL	(R7), V3
 	VL	16(R7), V4
 	VLL	R0, 32(R7), V5
 	VCEQG	V0, V3, V6
 	VCEQG	V1, V4, V7
 	VCEQG	V2, V5, V8
 	VN	V6, V7, V9
 	VN	V8, V9, V10
 	VCEQGS	V10, V11, V12
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLE  R7, R2, index33to48loop
 	BR	notfound
 	// len(sep) in 49..64: sep is held in V0..V3.
 index49plus:
 	CMPBGT	R4, $63, index65plus
 	SUB	$48, R4, R0
 	VL	16(R3), V1
 	VL	32(R3), V2
 	VLL	R0, 48(R3), V3
 	VONE	V15
 index49to64loop:
 	VL	(R7), V4
 	VL	16(R7), V5
 	VL	32(R7), V6
 	VLL	R0, 48(R7), V7
 	VCEQG	V0, V4, V8
 	VCEQG	V1, V5, V9
 	VCEQG	V2, V6, V10
 	VCEQG	V3, V7, V11
 	VN	V8, V9, V12
 	VN	V10, V11, V13
 	VN	V12, V13, V14
 	VCEQGS	V14, V15, V16
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLE  R7, R2, index49to64loop
 	// Falling out of the loop above means no match: continue into notfound.
 notfound:
 	MOVD	$-1, (R5)
 	RET
 index65plus:
 	// not implemented
 	// len(sep) > 64 is outside the range handled above.
 	// NOTE(review): the store through R0 looks like a deliberate fault
 	// rather than a result write — confirm.
 	MOVD	$0, (R0)
 	RET
 foundV17: // index is in doubleword V17[0]
 	VLGVG	$0, V17, R8
 	ADD	R8, R7 // advance R7 to the matching position inside the 16-byte window
 found:
 	SUB	R1, R7 // pointer -> offset from &s[0]
 	MOVD	R7, (R5)
 	RET
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@ -1358,274 +1358,6 @@ DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
 GLOBL shifts<>(SB),RODATA,$256
 // strings·indexShortStr is the string entry point: it loads both
 // (pointer, length) pairs into the registers runtime·indexShortStr expects
 // and jumps there.
 TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
 	MOVQ s+0(FP), DI
 	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
 	MOVQ s_len+8(FP), DX
 	MOVQ c+16(FP), BP
 	MOVQ c_len+24(FP), AX
 	MOVQ DI, R10 // keep the start of s; used to turn the match pointer into an offset
 	LEAQ ret+32(FP), R11 // R11 = address of the result slot
 	JMP  runtime·indexShortStr(SB)
 // bytes·indexShortStr is the []byte entry point: same register setup as the
 // string variant, with the second argument at +24 and the result at +48.
 TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
 	MOVQ s+0(FP), DI
 	MOVQ s_len+8(FP), DX
 	MOVQ c+24(FP), BP
 	MOVQ c_len+32(FP), AX
 	MOVQ DI, R10 // keep the start of s; used to turn the match pointer into an offset
 	LEAQ ret+48(FP), R11 // R11 = address of the result slot
 	JMP  runtime·indexShortStr(SB)
 // runtime·indexShortStr is the shared body of strings·indexShortStr and
 // bytes·indexShortStr. It stores through R11 the offset of the first match,
 // or -1 when there is none. The code dispatches on the needle length (AX);
 // in each loop DI is the candidate position and DX is rewritten to be one
 // past the last position at which a match may start.
 // AX: length of string, that we are searching for
 // DX: length of string, in which we are searching
 // DI: pointer to string, in which we are searching
 // BP: pointer to string, that we are searching for
 // R10: copy of the initial DI (set by the entry points)
 // R11: address, where to put return value
 TEXT runtime·indexShortStr(SB),NOSPLIT,$0
 	CMPQ AX, DX
 	JA fail
 	CMPQ DX, $16
 	JAE sse42
 no_sse42:
 	// Needle of at most 2 bytes: compare one 16-bit word per position.
 	CMPQ AX, $2
 	JA   _3_or_more
 	MOVW (BP), BP
 	LEAQ -1(DI)(DX*1), DX
 loop2:
 	MOVW (DI), SI
 	CMPW SI,BP
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop2
 	JMP fail
 _3_or_more:
 	// Needle of 3 bytes: two overlapping 16-bit words (bytes 0-1 and 1-2).
 	CMPQ AX, $3
 	JA   _4_or_more
 	MOVW 1(BP), BX
 	MOVW (BP), BP
 	LEAQ -2(DI)(DX*1), DX
 loop3:
 	MOVW (DI), SI
 	CMPW SI,BP
 	JZ   partial_success3
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop3
 	JMP fail
 partial_success3:
 	MOVW 1(DI), SI
 	CMPW SI,BX
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop3
 	JMP fail
 _4_or_more:
 	// Needle of 4 bytes: one 32-bit compare per position.
 	CMPQ AX, $4
 	JA   _5_or_more
 	MOVL (BP), BP
 	LEAQ -3(DI)(DX*1), DX
 loop4:
 	MOVL (DI), SI
 	CMPL SI,BP
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop4
 	JMP fail
 _5_or_more:
 	// Needle of 5..7 bytes: first 4 bytes, then the (overlapping) last 4 bytes.
 	CMPQ AX, $7
 	JA   _8_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	MOVL -4(BP)(AX*1), BX
 	MOVL (BP), BP
 loop5to7:
 	MOVL (DI), SI
 	CMPL SI,BP
 	JZ   partial_success5to7
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop5to7
 	JMP fail
 partial_success5to7:
 	MOVL -4(AX)(DI*1), SI
 	CMPL SI,BX
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop5to7
 	JMP fail
 _8_or_more:
 	// Needle of 8 bytes: one 64-bit compare per position.
 	CMPQ AX, $8
 	JA   _9_or_more
 	MOVQ (BP), BP
 	LEAQ -7(DI)(DX*1), DX
 loop8:
 	MOVQ (DI), SI
 	CMPQ SI,BP
 	JZ   success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop8
 	JMP fail
 _9_or_more:
 	// Needle of 9..15 bytes: first 8 bytes, then the (overlapping) last 8 bytes.
 	CMPQ AX, $15
 	JA   _16_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	MOVQ -8(BP)(AX*1), BX
 	MOVQ (BP), BP
 loop9to15:
 	MOVQ (DI), SI
 	CMPQ SI,BP
 	JZ   partial_success9to15
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop9to15
 	JMP fail
 partial_success9to15:
 	MOVQ -8(AX)(DI*1), SI
 	CMPQ SI,BX
 	JZ success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop9to15
 	JMP fail
 _16_or_more:
 	// Needle of 16 bytes: one 16-byte vector compare per position;
 	// a mask of 0xffff means all 16 bytes matched.
 	CMPQ AX, $16
 	JA   _17_or_more
 	MOVOU (BP), X1
 	LEAQ -15(DI)(DX*1), DX
 loop16:
 	MOVOU (DI), X2
 	PCMPEQB X1, X2
 	PMOVMSKB X2, SI
 	CMPQ  SI, $0xffff
 	JE   success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop16
 	JMP fail
 _17_or_more:
 	// Needle of 17..31 bytes: first 16 bytes, then the (overlapping) last 16 bytes.
 	CMPQ AX, $31
 	JA   _32_or_more
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	MOVOU -16(BP)(AX*1), X0
 	MOVOU (BP), X1
 loop17to31:
 	MOVOU (DI), X2
 	PCMPEQB X1,X2
 	PMOVMSKB X2, SI
 	CMPQ  SI, $0xffff
 	JE   partial_success17to31
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop17to31
 	JMP fail
 partial_success17to31:
 	MOVOU -16(AX)(DI*1), X3
 	PCMPEQB X0, X3
 	PMOVMSKB X3, SI
 	CMPQ  SI, $0xffff
 	JE success
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop17to31
 	JMP fail
 // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
 // So no need to check cpuid
 _32_or_more:
 	// Needle of 32 bytes: one 32-byte vector compare per position.
 	CMPQ AX, $32
 	JA   _33_to_63
 	VMOVDQU (BP), Y1
 	LEAQ -31(DI)(DX*1), DX
 loop32:
 	VMOVDQU (DI), Y2
 	VPCMPEQB Y1, Y2, Y3
 	VPMOVMSKB Y3, SI
 	CMPL  SI, $0xffffffff
 	JE   success_avx2
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop32
 	JMP fail_avx2
 _33_to_63:
 	// Needle of 33..63 bytes: first 32 bytes, then the (overlapping) last 32 bytes.
 	LEAQ 1(DI)(DX*1), DX
 	SUBQ AX, DX
 	VMOVDQU -32(BP)(AX*1), Y0
 	VMOVDQU (BP), Y1
 loop33to63:
 	VMOVDQU (DI), Y2
 	VPCMPEQB Y1, Y2, Y3
 	VPMOVMSKB Y3, SI
 	CMPL  SI, $0xffffffff
 	JE   partial_success33to63
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop33to63
 	JMP fail_avx2
 partial_success33to63:
 	VMOVDQU -32(AX)(DI*1), Y3
 	VPCMPEQB Y0, Y3, Y4
 	VPMOVMSKB Y4, SI
 	CMPL  SI, $0xffffffff
 	JE success_avx2
 	ADDQ $1,DI
 	CMPQ DI,DX
 	JB loop33to63
 	// Loop exhausted without a match: fall through into the failure exits.
 fail_avx2:
 	VZEROUPPER
 fail:
 	MOVQ $-1, (R11)
 	RET
 success_avx2:
 	VZEROUPPER
 	JMP success
 	// Reached when the haystack is at least 16 bytes long.
 sse42:
 	CMPB runtime·support_sse42(SB), $1
 	JNE no_sse42
 	CMPQ AX, $12
 	// PCMPESTRI is slower than normal compare,
 	// so using it makes sense only if we advance 4+ bytes per compare
 	// This value was determined experimentally and is the ~same
 	// on Nehalem (first with SSE42) and Haswell.
 	JAE _9_or_more
 	// NOTE(review): presumably guards the 16-byte load of the needle below
 	// against running past a page boundary — confirm.
 	LEAQ 16(BP), SI
 	TESTW $0xff0, SI
 	JEQ no_sse42
 	MOVOU (BP), X1
 	LEAQ -15(DI)(DX*1), SI
 	MOVQ $16, R9
 	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
 loop_sse42:
 	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
 	// for equality (bits 2,3 are 11)
 	// result is not masked or inverted (bits 4,5 are 00)
 	// and corresponds to first matching byte (bit 6 is 0)
 	PCMPESTRI $0x0c, (DI), X1
 	// CX == 16 means no match,
 	// CX > R9 means partial match at the end of the string,
 	// otherwise sep is at offset CX from X1 start
 	CMPQ CX, R9
 	JBE sse42_success
 	ADDQ R9, DI
 	CMPQ DI, SI
 	JB loop_sse42
 	// Final window: re-run the compare on the 16 bytes starting at SI-1.
 	PCMPESTRI $0x0c, -1(SI), X1
 	CMPQ CX, R9
 	JA fail
 	LEAQ -1(SI), DI
 sse42_success:
 	ADDQ CX, DI
 success:
 	SUBQ R10, DI // pointer -> offset from the start of the haystack
 	MOVQ DI, (R11)
 	RET
 // return0 sets AX to zero and returns.
 TEXT runtime·return0(SB), NOSPLIT, $0
 	MOVL	$0, AX
 	RET
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@ -796,230 +796,6 @@ TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
        // compile barrier.
 	RET
 // func supportsVX() bool
 // Returns the hasVX byte of the runtime's cpu facilities record.
 TEXT strings·supportsVX(SB),NOSPLIT,$0-1
 	MOVBZ	runtime·cpu+facilities_hasVX(SB), R0
 	MOVB	R0, ret+0(FP)
 	RET
 // func supportsVX() bool
 // Returns the hasVX byte of the runtime's cpu facilities record.
 TEXT bytes·supportsVX(SB),NOSPLIT,$0-1
 	MOVBZ	runtime·cpu+facilities_hasVX(SB), R0
 	MOVB	R0, ret+0(FP)
 	RET
 // func indexShortStr(s, sep string) int
 // Loads both (pointer, length) pairs and the address of the result slot,
 // then branches to the shared runtime·indexShortStr body.
 // Caller must confirm availability of vx facility before calling.
 TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40
 	LMG	s+0(FP), R1, R2   // R1=&s[0],   R2=len(s)
 	LMG	sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
 	MOVD	$ret+32(FP), R5    // R5=&ret
 	BR	runtime·indexShortStr(SB)
 // func indexShortStr(s, sep []byte) int
 // Loads both (pointer, length) pairs and the address of the result slot,
 // then branches to the shared runtime·indexShortStr body.
 // Caller must confirm availability of vx facility before calling.
 TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56
 	LMG	s+0(FP), R1, R2    // R1=&s[0],   R2=len(s)
 	LMG	sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
 	MOVD	$ret+48(FP), R5    // R5=&ret
 	BR	runtime·indexShortStr(SB)
 // runtime·indexShortStr is the shared search routine behind the strings and
 // bytes indexShortStr entry points. It writes the offset of the first
 // occurrence of sep in s through R5, or -1 when there is none.
 // s: string we are searching
 // sep: string to search for
 // R1=&s[0], R2=len(s)
 // R3=&sep[0], R4=len(sep)
 // R5=&ret (int)
 // Inside the routine R7 is the current candidate position (a pointer into s)
 // and, after the prologue, R2 is the address of the last valid start position.
 // Caller must confirm availability of vx facility before calling.
 TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0
 	CMPBGT	R4, R2, notfound // len(sep) > len(s): cannot match
 	ADD	R1, R2
 	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
 	CMPBEQ	R4, $0, notfound
 	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
 	VLL	R4, (R3), V0 // contains first 16 bytes of sep
 	MOVD	R1, R7 // R7=&s[0], the first candidate position
 	// len(sep)==2 (R4==1): test 16 start positions per iteration.
 index2plus:
 	CMPBNE	R4, $1, index3plus
 	MOVD	$15(R7), R9
 	CMPBGE	R9, R2, index2to16 // fewer than 16 positions remain: use the generic loop
 	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
 	VONE	V16
 	VREPH	$0, V0, V1
 	// NOTE(review): same comparison as three instructions above; R9 and R2
 	// are not modified in between.
 	CMPBGE	R9, R2, index2to16
 index2loop:
 	VL	0(R7), V2          // 16 bytes, even indices
 	VL	1(R7), V4          // 16 bytes, odd indices
 	VCEQH	V1, V2, V5         // compare even indices
 	VCEQH	V1, V4, V6         // compare odd indices
 	VSEL	V5, V6, V31, V7    // merge even and odd indices
 	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
 	BLT	foundV17
 	MOVD	$16(R7), R7        // R7+=16
 	ADD	$15, R7, R9
 	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
 	CMPBLE	R7, R2, index2to16 // leftover positions are handled by the generic loop
 	BR	notfound
 	// len(sep)==3 (R4==2).
 index3plus:
 	CMPBNE	R4, $2, index4plus
 	ADD	$15, R7, R9
 	CMPBGE	R9, R2, index2to16
 	MOVD	$1, R0
 	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
 	VONE	V16
 	VREPH	$0, V0, V1
 	VREPB	$2, V0, V8
 index3loop:
 	VL	(R7), V2           // load 16-bytes into V2
 	VLL	R0, 16(R7), V3     // load 2-bytes into V3
 	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
 	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
 	VCEQH	V1, V2, V5         // compare 2-byte even indices
 	VCEQH	V1, V4, V6         // compare 2-byte odd indices
 	VCEQB	V8, V9, V10        // compare last bytes
 	VSEL	V5, V6, V31, V7    // merge even and odd indices
 	VN	V7, V10, V7        // AND indices with last byte
 	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
 	BLT	foundV17
 	MOVD	$16(R7), R7        // R7+=16
 	ADD	$15, R7, R9
 	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
 	CMPBLE	R7, R2, index2to16
 	BR	notfound
 	// len(sep)==4 (R4==3).
 index4plus:
 	CMPBNE	R4, $3, index5plus
 	ADD	$15, R7, R9
 	CMPBGE	R9, R2, index2to16
 	MOVD	$2, R0
 	VGBM	$0x8888, V29       // 0xff000000ff000000...
 	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
 	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
 	VONE	V16
 	VREPF	$0, V0, V1
 index4loop:
 	VL	(R7), V2           // load 16-bytes into V2
 	VLL	R0, 16(R7), V3     // load 3-bytes into V3
 	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
 	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
 	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<3
 	VCEQF	V1, V2, V5         // compare index 0, 4, ...
 	VCEQF	V1, V4, V6         // compare index 1, 5, ...
 	VCEQF	V1, V9, V11        // compare index 2, 6, ...
 	VCEQF	V1, V10, V12       // compare index 3, 7, ...
 	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
 	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
 	VSEL	V13, V14, V31, V7  // final merge
 	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
 	BLT	foundV17
 	MOVD	$16(R7), R7        // R7+=16
 	ADD	$15, R7, R9
 	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
 	CMPBLE	R7, R2, index2to16
 	BR	notfound
 	// len(sep) in 5..16 falls through to the generic loop; longer needles branch away.
 index5plus:
 	CMPBGT	R4, $15, index17plus
 	// Generic loop for len(sep) <= 16: one candidate position per compare.
 	// Also entered from the specialised loops above to finish their tails.
 index2to16:
 	CMPBGT	R7, R2, notfound
 	MOVD	$1(R7), R8
 	CMPBGT	R8, R2, index2to16tail // only one candidate position left
 index2to16loop:
 	// unrolled 2x
 	VLL	R4, (R7), V1
 	VLL	R4, 1(R7), V2
 	VCEQGS	V0, V1, V3
 	BEQ	found
 	MOVD	$1(R7), R7
 	VCEQGS	V0, V2, V4
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLT	R7, R2, index2to16loop
 	CMPBGT	R7, R2, notfound
 	// R7 == R2 here: check the final candidate position.
 index2to16tail:
 	VLL	R4, (R7), V1
 	VCEQGS	V0, V1, V2
 	BEQ	found
 	BR	notfound
 	// len(sep) in 17..32: V0 holds the first 16 bytes of sep, V1 the rest.
 index17plus:
 	CMPBGT	R4, $31, index33plus
 	SUB	$16, R4, R0
 	VLL	R0, 16(R3), V1
 	VONE	V7
 index17to32loop:
 	VL	(R7), V2
 	VLL	R0, 16(R7), V3
 	VCEQG	V0, V2, V4
 	VCEQG	V1, V3, V5
 	VN	V4, V5, V6
 	VCEQGS	V6, V7, V8
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLE  R7, R2, index17to32loop
 	BR	notfound
 	// len(sep) in 33..48: sep is held in V0, V1 and V2.
 index33plus:
 	CMPBGT	R4, $47, index49plus
 	SUB	$32, R4, R0
 	VL	16(R3), V1
 	VLL	R0, 32(R3), V2
 	VONE	V11
 index33to48loop:
 	VL	(R7), V3
 	VL	16(R7), V4
 	VLL	R0, 32(R7), V5
 	VCEQG	V0, V3, V6
 	VCEQG	V1, V4, V7
 	VCEQG	V2, V5, V8
 	VN	V6, V7, V9
 	VN	V8, V9, V10
 	VCEQGS	V10, V11, V12
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLE  R7, R2, index33to48loop
 	BR	notfound
 	// len(sep) in 49..64: sep is held in V0..V3.
 index49plus:
 	CMPBGT	R4, $63, index65plus
 	SUB	$48, R4, R0
 	VL	16(R3), V1
 	VL	32(R3), V2
 	VLL	R0, 48(R3), V3
 	VONE	V15
 index49to64loop:
 	VL	(R7), V4
 	VL	16(R7), V5
 	VL	32(R7), V6
 	VLL	R0, 48(R7), V7
 	VCEQG	V0, V4, V8
 	VCEQG	V1, V5, V9
 	VCEQG	V2, V6, V10
 	VCEQG	V3, V7, V11
 	VN	V8, V9, V12
 	VN	V10, V11, V13
 	VN	V12, V13, V14
 	VCEQGS	V14, V15, V16
 	BEQ	found
 	MOVD	$1(R7), R7
 	CMPBLE  R7, R2, index49to64loop
 	// Falling out of the loop above means no match: continue into notfound.
 notfound:
 	MOVD	$-1, (R5)
 	RET
 index65plus:
 	// not implemented
 	// len(sep) > 64 is outside the range handled above.
 	// NOTE(review): the store through R0 looks like a deliberate fault
 	// rather than a result write — confirm.
 	MOVD	$0, (R0)
 	RET
 foundV17: // index is in doubleword V17[0]
 	VLGVG	$0, V17, R8
 	ADD	R8, R7 // advance R7 to the matching position inside the 16-byte window
 found:
 	SUB	R1, R7 // pointer -> offset from &s[0]
 	MOVD	R7, (R5)
 	RET
 // This is called from .init_array and follows the platform, not Go, ABI.
 // We are overly conservative. We could only save the registers we use.
 // However, since this function is only called once per loaded module
--- a/src/runtime/os_linux_s390x.go
+++ b/src/runtime/os_linux_s390x.go
@ -4,32 +4,16 @@
 package runtime
-import (
+import "internal/cpu"
 	internalcpu "internal/cpu"
 	"runtime/internal/sys"
 )
 const (
 	// bit masks taken from bits/hwcap.h
 	_HWCAP_S390_VX = 2048 // vector facility
 )
 // facilities is padded to avoid false sharing.
 // Each blank field is sys.CacheLineSize bytes wide, one on either side of
 // the flag.
 type facilities struct {
 	_     [sys.CacheLineSize]byte // padding before the flag
 	hasVX bool // vector facility
 	_     [sys.CacheLineSize]byte // padding after the flag
 }
 // cpu indicates the availability of s390x facilities that can be used in
 // Go assembly but are optional on models supported by Go.
 // TODO: remove this once we're only using internal/cpu.
 var cpu facilities
 func archauxv(tag, val uintptr) {
 	switch tag {
 	case _AT_HWCAP: // CPU capability bit flags
-		internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
+		cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
 		cpu.hasVX = val&_HWCAP_S390_VX != 0
 	}
 }
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@ -932,6 +932,85 @@ func EqualFold(s, t string) bool {
 	return s == t
 }
 // Index reports the byte offset of the first occurrence of substr in s,
 // or -1 when substr does not occur in s.
 func Index(s, substr string) int {
 	n := len(substr)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, substr[0])
 	}
 	if n == len(s) {
 		if substr == s {
 			return 0
 		}
 		return -1
 	}
 	if n > len(s) {
 		return -1
 	}

 	// From here on 2 <= n < len(s).
 	first := substr[0]
 	starts := s[:len(s)-n+1] // every position at which a match may begin

 	if n <= bytealg.MaxLen {
 		// Haystack and needle are both small: brute force does the whole job.
 		if len(s) <= bytealg.MaxBruteForce {
 			return bytealg.IndexString(s, substr)
 		}
 		misses := 0
 		for pos := 0; pos < len(starts); {
 			if starts[pos] != first {
 				// IndexByte beats bytealg.IndexString as long as it does not
 				// keep landing on false positives.
 				skip := IndexByte(starts[pos:], first)
 				if skip < 0 {
 					return -1
 				}
 				pos += skip
 			}
 			if s[pos:pos+n] == substr {
 				return pos
 			}
 			misses++
 			pos++
 			// Too many false positives from IndexByte: let
 			// bytealg.IndexString finish the remainder.
 			if misses > bytealg.Cutover(pos) {
 				if rel := bytealg.IndexString(s[pos:], substr); rel >= 0 {
 					return rel + pos
 				}
 				return -1
 			}
 		}
 		return -1
 	}

 	// Needle longer than bytealg.MaxLen: IndexByte scan with a Rabin-Karp fallback.
 	misses := 0
 	for pos := 0; pos < len(starts); {
 		if starts[pos] != first {
 			skip := IndexByte(starts[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if s[pos:pos+n] == substr {
 			return pos
 		}
 		pos++
 		misses++
 		if misses >= 4+pos>>4 && pos < len(starts) {
 			// See comment in ../bytes/bytes_generic.go.
 			rel := indexRabinKarp(s[pos:], substr)
 			if rel < 0 {
 				return -1
 			}
 			return pos + rel
 		}
 	}
 	return -1
 }
 func indexRabinKarp(s, substr string) int {
 	// Rabin-Karp search
 	hashss, pow := hashStr(substr)
--- a/src/strings/strings_amd64.go
+++ b/src/strings/strings_amd64.go
@ -1,79 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package strings
 import "internal/cpu"
 //go:noescape
 // indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
 // indexShortStr requires 2 <= len(c) <= shortStringLen
 // (shortStringLen is 63 when AVX2 is available and 31 otherwise; see init).
 // The declaration has no Go body; the implementation is assembly.
 func indexShortStr(s, c string) int  // ../runtime/asm_amd64.s
 // countByte has no Go body here; per the trailing note it is provided by
 // ../runtime/asm_amd64.s.
 // NOTE(review): presumably returns the number of occurrences of c in s —
 // confirm against the assembly.
 func countByte(s string, c byte) int // ../runtime/asm_amd64.s
 var shortStringLen int
 func init() {
 	// Longest needle handed to indexShortStr: 63 with AVX2, 31 without.
 	limit := 31
 	if cpu.X86.HasAVX2 {
 		limit = 63
 	}
 	shortStringLen = limit
 }
 // Index reports the byte offset of the first occurrence of substr in s,
 // or -1 when substr does not occur in s.
 func Index(s, substr string) int {
 	n := len(substr)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, substr[0])
 	}
 	if n == len(s) {
 		if substr == s {
 			return 0
 		}
 		return -1
 	}
 	if n > len(s) {
 		return -1
 	}
 	// Needles too long for indexShortStr go straight to Rabin-Karp.
 	if n > shortStringLen {
 		return indexRabinKarp(s, substr)
 	}
 	// Haystack and needle are both small: brute force does the whole job.
 	if len(s) <= 64 {
 		return indexShortStr(s, substr)
 	}

 	first := substr[0]
 	starts := s[:len(s)-n+1] // every position at which a match may begin
 	misses := 0
 	for pos := 0; pos < len(starts); {
 		if starts[pos] != first {
 			// IndexByte skips 16/32 bytes per iteration,
 			// which makes it faster than indexShortStr.
 			skip := IndexByte(starts[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if s[pos:pos+n] == substr {
 			return pos
 		}
 		misses++
 		pos++
 		// Hand over to indexShortStr once IndexByte yields too many false
 		// positives: more than 1 miss per 8 characters, with some slack
 		// granted at the beginning.
 		if misses > (pos+16)/8 {
 			if rel := indexShortStr(s[pos:], substr); rel >= 0 {
 				return rel + pos
 			}
 			return -1
 		}
 	}
 	return -1
 }
--- a/src/strings/strings_generic.go
+++ b/src/strings/strings_generic.go
@ -1,55 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64,!s390x
 package strings
 // TODO: implements short string optimization on non amd64 platforms
 // and get rid of strings_amd64.go
 // Index reports the byte offset of the first occurrence of substr in s,
 // or -1 when substr does not occur in s.
 func Index(s, substr string) int {
 	n := len(substr)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, substr[0])
 	}
 	if n == len(s) {
 		if substr == s {
 			return 0
 		}
 		return -1
 	}
 	if n > len(s) {
 		return -1
 	}

 	first := substr[0]
 	starts := s[:len(s)-n+1] // every position at which a match may begin
 	misses := 0
 	for pos := 0; pos < len(starts); {
 		if starts[pos] != first {
 			// Jump to the next occurrence of the needle's first byte.
 			skip := IndexByte(starts[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if s[pos:pos+n] == substr {
 			return pos
 		}
 		pos++
 		misses++
 		if misses >= 4+pos>>4 && pos < len(starts) {
 			// See comment in ../bytes/bytes_generic.go.
 			rel := indexRabinKarp(s[pos:], substr)
 			if rel < 0 {
 				return -1
 			}
 			return pos + rel
 		}
 	}
 	return -1
 }
--- a/src/strings/strings_s390x.go
+++ b/src/strings/strings_s390x.go
@ -1,80 +0,0 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package strings
 //go:noescape
 // indexShortStr returns the index of the first instance of sep in s,
 // or -1 if sep is not present in s.
 // indexShortStr requires 2 <= len(sep) <= shortStringLen
 // (shortStringLen is 64 when supportsVX reports true and -1 otherwise).
 // The declaration has no Go body; the implementation is assembly.
 func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s
 // supportsVX reports whether the vector facility is available.
 // indexShortStr must not be called if the vector facility is not
 // available.
 // The declaration has no Go body; the implementation is assembly.
 func supportsVX() bool // ../runtime/asm_s390x.s
 var shortStringLen = -1
 func init() {
 	// shortStringLen keeps its declared value unless the vector facility
 	// is available, in which case needles of up to 64 bytes are allowed.
 	if !supportsVX() {
 		return
 	}
 	shortStringLen = 64
 }
 // Index reports the byte offset of the first occurrence of substr in s,
 // or -1 when substr does not occur in s.
 func Index(s, substr string) int {
 	n := len(substr)
 	if n == 0 {
 		return 0
 	}
 	if n == 1 {
 		return IndexByte(s, substr[0])
 	}
 	if n == len(s) {
 		if substr == s {
 			return 0
 		}
 		return -1
 	}
 	if n > len(s) {
 		return -1
 	}
 	// Needles too long for indexShortStr go straight to Rabin-Karp.
 	if n > shortStringLen {
 		return indexRabinKarp(s, substr)
 	}
 	// Haystack and needle are both small: brute force does the whole job.
 	if len(s) <= 64 {
 		return indexShortStr(s, substr)
 	}

 	first := substr[0]
 	starts := s[:len(s)-n+1] // every position at which a match may begin
 	misses := 0
 	for pos := 0; pos < len(starts); {
 		if starts[pos] != first {
 			// IndexByte skips 16/32 bytes per iteration,
 			// which makes it faster than indexShortStr.
 			skip := IndexByte(starts[pos:], first)
 			if skip < 0 {
 				return -1
 			}
 			pos += skip
 		}
 		if s[pos:pos+n] == substr {
 			return pos
 		}
 		misses++
 		pos++
 		// Hand over to indexShortStr once IndexByte yields too many false
 		// positives: more than 1 miss per 8 characters, with some slack
 		// granted at the beginning.
 		if misses > (pos+16)/8 {
 			if rel := indexShortStr(s[pos:], substr); rel >= 0 {
 				return rel + pos
 			}
 			return -1
 		}
 	}
 	return -1
 }