mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
internal/bytealg: optimize index function for ppc64le/power9
Optimized index2to16 loop by unrolling the loop by 4. Multiple benchmark tests show performance improvement on POWER9. Similar improvements are seen on POWER10. Added tests to ensure changes work fine. name old time/op new time/op delta Index/10 18.3ns ± 0% 19.7ns ±25% ~ Index/32 75.3ns ± 0% 69.2ns ± 0% -8.22% Index/4K 5.53µs ± 0% 3.69µs ± 0% -33.20% Index/4M 5.64ms ± 0% 3.75ms ± 0% -33.55% Index/64M 92.9ms ± 0% 61.6ms ± 0% -33.69% IndexHard2 1.41ms ± 0% 0.93ms ± 0% -33.75% CountHard2 1.41ms ± 0% 0.93ms ± 0% -33.75% Change-Id: If9331df6a141a4716724b8cb648d2b91bdf17e5f Reviewed-on: https://go-review.googlesource.com/c/go/+/377016 Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
This commit is contained in:
parent
c386269ed8
commit
7ca1e2aa56
2 changed files with 149 additions and 79 deletions
|
|
@ -140,6 +140,36 @@ var indexTests = []BinOpTest{
|
||||||
{"abc", "c", 2},
|
{"abc", "c", 2},
|
||||||
{"abc", "x", -1},
|
{"abc", "x", -1},
|
||||||
{"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33},
|
{"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33},
|
||||||
|
{"fofofofooofoboo", "oo", 7},
|
||||||
|
{"fofofofofofoboo", "ob", 11},
|
||||||
|
{"fofofofofofoboo", "boo", 12},
|
||||||
|
{"fofofofofofoboo", "oboo", 11},
|
||||||
|
{"fofofofofoooboo", "fooo", 8},
|
||||||
|
{"fofofofofofoboo", "foboo", 10},
|
||||||
|
{"fofofofofofoboo", "fofob", 8},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffof", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffof", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofo", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofo", 13},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofoo", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofoo", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofoob", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofoob", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofooba", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofooba", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofoobar", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofoobar", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofoobarf", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofoobarf", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofoobarfo", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofoobarfo", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foffofoobarfoo", 13},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "foffofoobarfoo", 12},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "ofoffofoobarfoo", 12},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "ofoffofoobarfoo", 11},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "fofoffofoobarfoo", 11},
|
||||||
|
{"fofofofofofofoffofoobarfoo", "fofoffofoobarfoo", 10},
|
||||||
|
{"fofofofofoofofoffofoobarfoo", "foobars", -1},
|
||||||
{"foofyfoobarfoobar", "y", 4},
|
{"foofyfoobarfoobar", "y", 4},
|
||||||
{"oooooooooooooooooooooo", "r", -1},
|
{"oooooooooooooooooooooo", "r", -1},
|
||||||
{"oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22},
|
{"oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22},
|
||||||
|
|
|
||||||
|
|
@ -17,8 +17,9 @@
|
||||||
|
|
||||||
// NOTE: There is a power9 implementation that
|
// NOTE: There is a power9 implementation that
|
||||||
// improves performance by 10-15% on little
|
// improves performance by 10-15% on little
|
||||||
// endian for some of the benchmarks, but
|
// endian for some of the benchmarks.
|
||||||
// work is still needed for a big endian
|
// Unrolled index2to16 loop by 4 on ppc64le/power9
|
||||||
|
// Work is still needed for a big endian
|
||||||
// implementation on power9.
|
// implementation on power9.
|
||||||
|
|
||||||
//go:build ppc64 || ppc64le
|
//go:build ppc64 || ppc64le
|
||||||
|
|
@ -46,8 +47,8 @@ DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
|
||||||
GLOBL byteswap<>+0(SB), RODATA, $16
|
GLOBL byteswap<>+0(SB), RODATA, $16
|
||||||
|
|
||||||
TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
|
TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
|
||||||
// R3 = byte array pointer
|
// R3 = byte array pointer
|
||||||
// R4 = length
|
// R4 = length
|
||||||
MOVD R6, R5 // R5 = separator pointer
|
MOVD R6, R5 // R5 = separator pointer
|
||||||
MOVD R7, R6 // R6 = separator length
|
MOVD R7, R6 // R6 = separator length
|
||||||
|
|
||||||
|
|
@ -97,16 +98,6 @@ power8:
|
||||||
#define LASTSTR R27
|
#define LASTSTR R27
|
||||||
#define ONES V20
|
#define ONES V20
|
||||||
#define SWAP V21
|
#define SWAP V21
|
||||||
#define V0_ VS32
|
|
||||||
#define V1_ VS33
|
|
||||||
#define V2_ VS34
|
|
||||||
#define V3_ VS35
|
|
||||||
#define V4_ VS36
|
|
||||||
#define V5_ VS37
|
|
||||||
#define V6_ VS38
|
|
||||||
#define V7_ VS39
|
|
||||||
#define V8_ VS40
|
|
||||||
#define V9_ VS41
|
|
||||||
#define SWAP_ VS53
|
#define SWAP_ VS53
|
||||||
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
|
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
|
||||||
CMP R6, R4 // Compare lengths
|
CMP R6, R4 // Compare lengths
|
||||||
|
|
@ -129,7 +120,7 @@ TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
|
||||||
BGE CR4, loadge16 // Load for len(sep) >= 16
|
BGE CR4, loadge16 // Load for len(sep) >= 16
|
||||||
SUB R6, R16, R9 // 16-len of sep
|
SUB R6, R16, R9 // 16-len of sep
|
||||||
SLD $3, R9 // Set up for VSLO
|
SLD $3, R9 // Set up for VSLO
|
||||||
MTVSRD R9, V9_ // Set up for VSLO
|
MTVSRD R9, V9 // Set up for VSLO
|
||||||
VSLDOI $8, V9, V9, V9 // Set up for VSLO
|
VSLDOI $8, V9, V9, V9 // Set up for VSLO
|
||||||
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
|
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
|
||||||
|
|
||||||
|
|
@ -140,9 +131,9 @@ loadge16:
|
||||||
BGT sepcross16 // Sep crosses 16 byte boundary
|
BGT sepcross16 // Sep crosses 16 byte boundary
|
||||||
|
|
||||||
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
|
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
|
||||||
VLOADSWAP(R8, R0, V0, V0_)// Load 16 bytes @R8 into V0
|
VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
|
||||||
SLD $3, R9 // Set up shift count for VSLO
|
SLD $3, R9 // Set up shift count for VSLO
|
||||||
MTVSRD R9, V8_ // Set up shift count for VSLO
|
MTVSRD R9, V8 // Set up shift count for VSLO
|
||||||
VSLDOI $8, V8, V8, V8
|
VSLDOI $8, V8, V8, V8
|
||||||
VSLO V0, V8, V0 // Shift by start byte
|
VSLO V0, V8, V0 // Shift by start byte
|
||||||
|
|
||||||
|
|
@ -150,7 +141,7 @@ loadge16:
|
||||||
BR index2plus
|
BR index2plus
|
||||||
|
|
||||||
sepcross16:
|
sepcross16:
|
||||||
VLOADSWAP(R5, R0, V0, V0_) // Load 16 bytes @R5 into V0
|
VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0
|
||||||
|
|
||||||
VAND V0, SEPMASK, V0 // mask out separator
|
VAND V0, SEPMASK, V0 // mask out separator
|
||||||
BLE CR4, index2to16
|
BLE CR4, index2to16
|
||||||
|
|
@ -179,10 +170,10 @@ index2plus:
|
||||||
// that value when found. Loop as
|
// that value when found. Loop as
|
||||||
// long as len(string) > 16
|
// long as len(string) > 16
|
||||||
index2loop2:
|
index2loop2:
|
||||||
VLOADSWAP(R7, R19, V3, V3_) // Load 16 bytes @R7+1 into V3
|
VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
|
||||||
|
|
||||||
index2loop:
|
index2loop:
|
||||||
VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
|
VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
|
||||||
VCMPEQUH V1, V2, V5 // Search for sep
|
VCMPEQUH V1, V2, V5 // Search for sep
|
||||||
VCMPEQUH V1, V3, V6 // Search for sep offset by 1
|
VCMPEQUH V1, V3, V6 // Search for sep offset by 1
|
||||||
VSEL V6, V5, V31, V7 // merge even and odd indices
|
VSEL V6, V5, V31, V7 // merge even and odd indices
|
||||||
|
|
@ -201,7 +192,7 @@ index2loop:
|
||||||
BLT index2loop2 // If < last, continue loop
|
BLT index2loop2 // If < last, continue loop
|
||||||
CMP R7, LASTBYTE // Compare addr+16 against last byte
|
CMP R7, LASTBYTE // Compare addr+16 against last byte
|
||||||
BLT index2to16 // If < 16 handle specially
|
BLT index2to16 // If < 16 handle specially
|
||||||
VLOADSWAP(R7, R0, V3, V3_) // Load 16 bytes @R7 into V3
|
VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
|
||||||
VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
|
VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
|
||||||
BR index2loop
|
BR index2loop
|
||||||
|
|
||||||
|
|
@ -234,11 +225,11 @@ index3plus:
|
||||||
index3loop2:
|
index3loop2:
|
||||||
MOVD $2, R21 // Set up index for 2
|
MOVD $2, R21 // Set up index for 2
|
||||||
VSPLTISB $0, V10 // Clear V10
|
VSPLTISB $0, V10 // Clear V10
|
||||||
VLOADSWAP(R7, R21, V3, V3_)// Load 16 bytes @R7+2 into V3
|
VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
|
||||||
VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
|
VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
|
||||||
|
|
||||||
index3loop:
|
index3loop:
|
||||||
VLOADSWAP(R7, R0, V2, V2_) // Load with correct order
|
VLOADSWAP(R7, R0, V2, V2) // Load with correct order
|
||||||
VSLDOI $1, V2, V3, V4 // string[1:17]
|
VSLDOI $1, V2, V3, V4 // string[1:17]
|
||||||
VSLDOI $2, V2, V3, V9 // string[2:18]
|
VSLDOI $2, V2, V3, V9 // string[2:18]
|
||||||
VCMPEQUH V1, V2, V5 // compare hw even indices
|
VCMPEQUH V1, V2, V5 // compare hw even indices
|
||||||
|
|
@ -294,12 +285,12 @@ index4plus:
|
||||||
VSPLTW $0, V0, V1 // Splat 1st word of separator
|
VSPLTW $0, V0, V1 // Splat 1st word of separator
|
||||||
|
|
||||||
index4loop:
|
index4loop:
|
||||||
VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
|
VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
|
||||||
|
|
||||||
next4:
|
next4:
|
||||||
VSPLTISB $0, V10 // Clear
|
VSPLTISB $0, V10 // Clear
|
||||||
MOVD $3, R9 // Number of bytes beyond 16
|
MOVD $3, R9 // Number of bytes beyond 16
|
||||||
VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+3 into V3
|
VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+3 into V3
|
||||||
VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
|
VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
|
||||||
VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
|
VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
|
||||||
VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
|
VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
|
||||||
|
|
@ -344,7 +335,7 @@ index2to16:
|
||||||
// At least 16 bytes of string left
|
// At least 16 bytes of string left
|
||||||
// Mask the number of bytes in sep
|
// Mask the number of bytes in sep
|
||||||
index2to16loop:
|
index2to16loop:
|
||||||
VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1
|
VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
|
||||||
|
|
||||||
compare:
|
compare:
|
||||||
VAND V1, SEPMASK, V2 // Mask out sep size
|
VAND V1, SEPMASK, V2 // Mask out sep size
|
||||||
|
|
@ -366,14 +357,14 @@ index2to16tail:
|
||||||
ADD R10, R9, R11 // offset + len
|
ADD R10, R9, R11 // offset + len
|
||||||
CMP R11, $16 // >= 16?
|
CMP R11, $16 // >= 16?
|
||||||
BLE short // Does not cross 16 bytes
|
BLE short // Does not cross 16 bytes
|
||||||
VLOADSWAP(R7, R0, V1, V1_)// Load 16 bytes @R7 into V1
|
VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
|
||||||
BR index2to16next // Continue on
|
BR index2to16next // Continue on
|
||||||
|
|
||||||
short:
|
short:
|
||||||
RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
|
RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
|
||||||
VLOADSWAP(R9, R0, V1, V1_)// Load 16 bytes @R9 into V1
|
VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
|
||||||
SLD $3, R10 // Set up shift
|
SLD $3, R10 // Set up shift
|
||||||
MTVSRD R10, V8_ // Set up shift
|
MTVSRD R10, V8 // Set up shift
|
||||||
VSLDOI $8, V8, V8, V8
|
VSLDOI $8, V8, V8, V8
|
||||||
VSLO V1, V8, V1 // Shift by start byte
|
VSLO V1, V8, V1 // Shift by start byte
|
||||||
VSPLTISB $0, V25 // Clear for later use
|
VSPLTISB $0, V25 // Clear for later use
|
||||||
|
|
@ -393,17 +384,17 @@ index17plus:
|
||||||
BGT index33plus
|
BGT index33plus
|
||||||
SUB $16, R6, R9 // Extra > 16
|
SUB $16, R6, R9 // Extra > 16
|
||||||
SLD $56, R9, R10 // Shift to use in VSLO
|
SLD $56, R9, R10 // Shift to use in VSLO
|
||||||
MTVSRD R10, V9_ // Set up for VSLO
|
MTVSRD R10, V9 // Set up for VSLO
|
||||||
VLOADSWAP(R5, R9, V1, V1_)// Load 16 bytes @R5+R9 into V1
|
VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
|
||||||
VSLO V1, V9, V1 // Shift left
|
VSLO V1, V9, V1 // Shift left
|
||||||
VSPLTISB $0xff, V7 // Splat 1s
|
VSPLTISB $0xff, V7 // Splat 1s
|
||||||
VSPLTISB $0, V27 // Splat 0
|
VSPLTISB $0, V27 // Splat 0
|
||||||
|
|
||||||
index17to32loop:
|
index17to32loop:
|
||||||
VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
|
VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
|
||||||
|
|
||||||
next17:
|
next17:
|
||||||
VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+R9 into V3
|
VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3
|
||||||
VSLO V3, V9, V3 // Shift left
|
VSLO V3, V9, V3 // Shift left
|
||||||
VCMPEQUB V0, V2, V4 // Compare first 16 bytes
|
VCMPEQUB V0, V2, V4 // Compare first 16 bytes
|
||||||
VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
|
VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
|
||||||
|
|
@ -454,7 +445,7 @@ TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
|
||||||
BGE CR4, loadge16 // Load for len(sep) >= 16
|
BGE CR4, loadge16 // Load for len(sep) >= 16
|
||||||
SUB R6, R16, R9 // 16-len of sep
|
SUB R6, R16, R9 // 16-len of sep
|
||||||
SLD $3, R9 // Set up for VSLO
|
SLD $3, R9 // Set up for VSLO
|
||||||
MTVSRD R9, V9_ // Set up for VSLO
|
MTVSRD R9, V9 // Set up for VSLO
|
||||||
VSLDOI $8, V9, V9, V9 // Set up for VSLO
|
VSLDOI $8, V9, V9, V9 // Set up for VSLO
|
||||||
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
|
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
|
||||||
|
|
||||||
|
|
@ -465,9 +456,9 @@ loadge16:
|
||||||
BGT sepcross16 // Sep crosses 16 byte boundary
|
BGT sepcross16 // Sep crosses 16 byte boundary
|
||||||
|
|
||||||
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
|
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
|
||||||
LXVB16X (R8)(R0), V0_ // Load 16 bytes @R8 into V0
|
LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
|
||||||
SLD $3, R9 // Set up shift count for VSLO
|
SLD $3, R9 // Set up shift count for VSLO
|
||||||
MTVSRD R9, V8_ // Set up shift count for VSLO
|
MTVSRD R9, V8 // Set up shift count for VSLO
|
||||||
VSLDOI $8, V8, V8, V8
|
VSLDOI $8, V8, V8, V8
|
||||||
VSLO V0, V8, V0 // Shift by start byte
|
VSLO V0, V8, V0 // Shift by start byte
|
||||||
|
|
||||||
|
|
@ -475,7 +466,7 @@ loadge16:
|
||||||
BR index2plus
|
BR index2plus
|
||||||
|
|
||||||
sepcross16:
|
sepcross16:
|
||||||
LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0
|
LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
|
||||||
|
|
||||||
VAND V0, SEPMASK, V0 // mask out separator
|
VAND V0, SEPMASK, V0 // mask out separator
|
||||||
BLE CR4, index2to16
|
BLE CR4, index2to16
|
||||||
|
|
@ -504,10 +495,10 @@ index2plus:
|
||||||
// that value when found. Loop as
|
// that value when found. Loop as
|
||||||
// long as len(string) > 16
|
// long as len(string) > 16
|
||||||
index2loop2:
|
index2loop2:
|
||||||
LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3
|
LXVB16X (R7)(R19), V3 // Load 16 bytes @R7+1 into V3
|
||||||
|
|
||||||
index2loop:
|
index2loop:
|
||||||
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
|
LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
|
||||||
VCMPEQUH V1, V2, V5 // Search for sep
|
VCMPEQUH V1, V2, V5 // Search for sep
|
||||||
VCMPEQUH V1, V3, V6 // Search for sep offset by 1
|
VCMPEQUH V1, V3, V6 // Search for sep offset by 1
|
||||||
VSEL V6, V5, V31, V7 // merge even and odd indices
|
VSEL V6, V5, V31, V7 // merge even and odd indices
|
||||||
|
|
@ -526,7 +517,7 @@ index2loop:
|
||||||
BLT index2loop2 // If < last, continue loop
|
BLT index2loop2 // If < last, continue loop
|
||||||
CMP R7, LASTBYTE // Compare addr+16 against last byte
|
CMP R7, LASTBYTE // Compare addr+16 against last byte
|
||||||
BLT index2to16 // If < 16 handle specially
|
BLT index2to16 // If < 16 handle specially
|
||||||
LXVB16X (R7)(R0), V3_ // Load 16 bytes @R7 into V3
|
LXVB16X (R7)(R0), V3 // Load 16 bytes @R7 into V3
|
||||||
VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
|
VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
|
||||||
BR index2loop
|
BR index2loop
|
||||||
|
|
||||||
|
|
@ -559,11 +550,11 @@ index3plus:
|
||||||
index3loop2:
|
index3loop2:
|
||||||
MOVD $2, R21 // Set up index for 2
|
MOVD $2, R21 // Set up index for 2
|
||||||
VSPLTISB $0, V10 // Clear V10
|
VSPLTISB $0, V10 // Clear V10
|
||||||
LXVB16X (R7)(R21), V3_ // Load 16 bytes @R7+2 into V3
|
LXVB16X (R7)(R21), V3 // Load 16 bytes @R7+2 into V3
|
||||||
VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
|
VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
|
||||||
|
|
||||||
index3loop:
|
index3loop:
|
||||||
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7
|
LXVB16X (R7)(R0), V2 // Load 16 bytes @R7
|
||||||
VSLDOI $1, V2, V3, V4 // string[1:17]
|
VSLDOI $1, V2, V3, V4 // string[1:17]
|
||||||
VSLDOI $2, V2, V3, V9 // string[2:18]
|
VSLDOI $2, V2, V3, V9 // string[2:18]
|
||||||
VCMPEQUH V1, V2, V5 // compare hw even indices
|
VCMPEQUH V1, V2, V5 // compare hw even indices
|
||||||
|
|
@ -604,7 +595,6 @@ index4plus:
|
||||||
ADD $20, R7, R9 // Check string size to load
|
ADD $20, R7, R9 // Check string size to load
|
||||||
CMP R9, LASTBYTE // Verify string length
|
CMP R9, LASTBYTE // Verify string length
|
||||||
BGE index2to16 // If not large enough, process remaining
|
BGE index2to16 // If not large enough, process remaining
|
||||||
MOVD $2, R15 // Set up index
|
|
||||||
|
|
||||||
// Set up masks for use with VSEL
|
// Set up masks for use with VSEL
|
||||||
MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
|
MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
|
||||||
|
|
@ -619,12 +609,12 @@ index4plus:
|
||||||
VSPLTW $0, V0, V1 // Splat 1st word of separator
|
VSPLTW $0, V0, V1 // Splat 1st word of separator
|
||||||
|
|
||||||
index4loop:
|
index4loop:
|
||||||
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
|
LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
|
||||||
|
|
||||||
next4:
|
next4:
|
||||||
VSPLTISB $0, V10 // Clear
|
VSPLTISB $0, V10 // Clear
|
||||||
MOVD $3, R9 // Number of bytes beyond 16
|
MOVD $3, R9 // Number of bytes beyond 16
|
||||||
LXVB16X (R7)(R9), V3_ // Load 16 bytes @R7 into V2
|
LXVB16X (R7)(R9), V3 // Load 16 bytes @R7 into V2
|
||||||
VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
|
VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
|
||||||
VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
|
VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
|
||||||
VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
|
VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
|
||||||
|
|
@ -662,46 +652,91 @@ index2to16:
|
||||||
CMP R7, LASTSTR // Compare last start byte
|
CMP R7, LASTSTR // Compare last start byte
|
||||||
BGT notfound // last takes len(sep) into account
|
BGT notfound // last takes len(sep) into account
|
||||||
|
|
||||||
ADD $16, R7, R9 // Check for last byte of string
|
ADD $19, R7, R9 // To check 4 indices per iteration, need at least 16+3 bytes
|
||||||
CMP R9, LASTBYTE
|
CMP R9, LASTBYTE
|
||||||
BGT index2to16tail
|
BGT index2to16tail
|
||||||
|
|
||||||
// At least 16 bytes of string left
|
// At least 16 bytes of string left
|
||||||
// Mask the number of bytes in sep
|
// Mask the number of bytes in sep
|
||||||
index2to16loop:
|
VSPLTISB $0, V10 // Clear
|
||||||
LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
|
MOVD $3, R17 // Number of bytes beyond 16
|
||||||
|
|
||||||
compare:
|
index2to16loop:
|
||||||
VAND V1, SEPMASK, V2 // Mask out sep size
|
LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
|
||||||
VCMPEQUBCC V0, V2, V3 // Compare masked string
|
LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
|
||||||
BLT CR6, found // All equal
|
|
||||||
ADD $1, R7 // Update ptr to next byte
|
VSLDOI $13, V5, V10, V2 // Shift left last 3 bytes
|
||||||
|
VSLDOI $1, V1, V2, V3 // V3=(V1:V2)<<1
|
||||||
|
VSLDOI $2, V1, V2, V4 // V4=(V1:V2)<<2
|
||||||
|
VAND V1, SEPMASK, V8 // Mask out sep size 0th index
|
||||||
|
VAND V3, SEPMASK, V9 // Mask out sep size 1st index
|
||||||
|
VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
|
||||||
|
VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
|
||||||
|
VCMPEQUBCC V0, V8, V8 // compare masked string
|
||||||
|
BLT CR6, found // All equal while comparing 0th index
|
||||||
|
VCMPEQUBCC V0, V9, V9 // compare masked string
|
||||||
|
BLT CR6, found2 // All equal while comparing 1st index
|
||||||
|
VCMPEQUBCC V0, V11, V11 // compare masked string
|
||||||
|
BLT CR6, found3 // All equal while comparing 2nd index
|
||||||
|
VCMPEQUBCC V0, V12, V12 // compare masked string
|
||||||
|
BLT CR6, found4 // All equal while comparing 3rd index
|
||||||
|
|
||||||
|
ADD $4, R7 // Update ptr to next 4 bytes
|
||||||
CMP R7, LASTSTR // Still less than last start byte
|
CMP R7, LASTSTR // Still less than last start byte
|
||||||
BGT notfound // Not found
|
BGT notfound // Not found
|
||||||
ADD $16, R7, R9 // Verify remaining bytes
|
ADD $19, R7, R9 // Verify remaining bytes
|
||||||
CMP R9, LASTBYTE // At least 16
|
CMP R9, LASTBYTE // length of string at least 19
|
||||||
BLT index2to16loop // Try again
|
BLE index2to16loop // Try again, else do post processing and jump to index2to16next
|
||||||
|
|
||||||
// Less than 16 bytes remaining in string
|
// <19 bytes left, post process the remaining string
|
||||||
// Separator >= 2
|
|
||||||
index2to16tail:
|
index2to16tail:
|
||||||
ADD R3, R4, R9 // End of string
|
ADD R3, R4, R9 // End of string
|
||||||
SUB R7, R9, R9 // Number of bytes left
|
SUB R7, R9, R9 // Number of bytes left
|
||||||
ANDCC $15, R7, R10 // 16 byte offset
|
ANDCC $15, R7, R10 // 16 byte offset
|
||||||
ADD R10, R9, R11 // offset + len
|
ADD R10, R9, R11 // offset + len
|
||||||
CMP R11, $16 // >= 16?
|
CMP R11, $16 // >= 16?
|
||||||
BLE short // Does not cross 16 bytes
|
BLE short // Does not cross 16 bytes
|
||||||
LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
|
LXVB16X (R7)(R0), V1 // Load 16 bytes @R7 into V1
|
||||||
BR index2to16next // Continue on
|
CMP R9, $16 // Post-processing of unrolled loop
|
||||||
|
BLE index2to16next // continue to index2to16next if <= 16 bytes
|
||||||
|
SUB R16, R9, R10 // R9 should be 18 or 17 hence R10 is 1 or 2
|
||||||
|
LXVB16X (R7)(R10), V9
|
||||||
|
CMP R10, $1 // string length is 17, compare 1 more byte
|
||||||
|
BNE extra2 // string length is 18, compare 2 more bytes
|
||||||
|
VSLDOI $15, V9, V10, V25
|
||||||
|
VAND V1, SEPMASK, V2 // Just compare size of sep
|
||||||
|
VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
|
||||||
|
BLT CR6, found // Found
|
||||||
|
ADD $1, R7 // Not found, try next partial string
|
||||||
|
CMP R7, LASTSTR // Check for end of string
|
||||||
|
BGT notfound // If at end, then not found
|
||||||
|
VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
|
||||||
|
BR index2to16next // go to remainder loop
|
||||||
|
extra2:
|
||||||
|
VSLDOI $14, V9, V10, V25
|
||||||
|
VAND V1, SEPMASK, V2 // Just compare size of sep
|
||||||
|
VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
|
||||||
|
BLT CR6, found // Found
|
||||||
|
ADD $1, R7 // Not found, try next partial string
|
||||||
|
CMP R7, LASTSTR // Check for end of string
|
||||||
|
BGT notfound // If at end, then not found
|
||||||
|
VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
|
||||||
|
VAND V1, SEPMASK, V2 // Just compare size of sep
|
||||||
|
VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
|
||||||
|
BLT CR6, found // Found
|
||||||
|
ADD $1, R7 // Not found, try next partial string
|
||||||
|
CMP R7, LASTSTR // Check for end of string
|
||||||
|
BGT notfound // If at end, then not found
|
||||||
|
VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
|
||||||
|
BR index2to16next // Check the remaining partial string in index2to16next
|
||||||
|
|
||||||
short:
|
short:
|
||||||
RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
|
RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
|
||||||
LXVB16X (R9)(R0), V1_ // Load 16 bytes @R9 into V1
|
LXVB16X (R9)(R0), V1 // Load 16 bytes @R9 into V1
|
||||||
SLD $3, R10 // Set up shift
|
SLD $3, R10 // Set up shift
|
||||||
MTVSRD R10, V8_ // Set up shift
|
MTVSRD R10, V8 // Set up shift
|
||||||
VSLDOI $8, V8, V8, V8
|
VSLDOI $8, V8, V8, V8
|
||||||
VSLO V1, V8, V1 // Shift by start byte
|
VSLO V1, V8, V1 // Shift by start byte
|
||||||
VSPLTISB $0, V25 // Clear for later use
|
|
||||||
|
|
||||||
index2to16next:
|
index2to16next:
|
||||||
VAND V1, SEPMASK, V2 // Just compare size of sep
|
VAND V1, SEPMASK, V2 // Just compare size of sep
|
||||||
|
|
@ -710,7 +745,7 @@ index2to16next:
|
||||||
ADD $1, R7 // Not found, try next partial string
|
ADD $1, R7 // Not found, try next partial string
|
||||||
CMP R7, LASTSTR // Check for end of string
|
CMP R7, LASTSTR // Check for end of string
|
||||||
BGT notfound // If at end, then not found
|
BGT notfound // If at end, then not found
|
||||||
VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
|
VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
|
||||||
BR index2to16next // Check the next partial string
|
BR index2to16next // Check the next partial string
|
||||||
|
|
||||||
index17plus:
|
index17plus:
|
||||||
|
|
@ -718,17 +753,17 @@ index17plus:
|
||||||
BGT index33plus
|
BGT index33plus
|
||||||
SUB $16, R6, R9 // Extra > 16
|
SUB $16, R6, R9 // Extra > 16
|
||||||
SLD $56, R9, R10 // Shift to use in VSLO
|
SLD $56, R9, R10 // Shift to use in VSLO
|
||||||
MTVSRD R10, V9_ // Set up for VSLO
|
MTVSRD R10, V9 // Set up for VSLO
|
||||||
LXVB16X (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
|
LXVB16X (R5)(R9), V1 // Load 16 bytes @R5+R9 into V1
|
||||||
VSLO V1, V9, V1 // Shift left
|
VSLO V1, V9, V1 // Shift left
|
||||||
VSPLTISB $0xff, V7 // Splat 1s
|
VSPLTISB $0xff, V7 // Splat 1s
|
||||||
VSPLTISB $0, V27 // Splat 0
|
VSPLTISB $0, V27 // Splat 0
|
||||||
|
|
||||||
index17to32loop:
|
index17to32loop:
|
||||||
LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
|
LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
|
||||||
|
|
||||||
next17:
|
next17:
|
||||||
LXVB16X (R7)(R9), V3_ // Load 16 bytes @R7+R9 into V3
|
LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+R9 into V3
|
||||||
VSLO V3, V9, V3 // Shift left
|
VSLO V3, V9, V3 // Shift left
|
||||||
VCMPEQUB V0, V2, V4 // Compare first 16 bytes
|
VCMPEQUB V0, V2, V4 // Compare first 16 bytes
|
||||||
VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
|
VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
|
||||||
|
|
@ -754,8 +789,13 @@ foundR25:
|
||||||
SUB R3, R7 // Subtract from start of string
|
SUB R3, R7 // Subtract from start of string
|
||||||
MOVD R7, R3 // Return byte where found
|
MOVD R7, R3 // Return byte where found
|
||||||
RET
|
RET
|
||||||
|
found4:
|
||||||
found:
|
ADD $1, R7 // found from unrolled loop at index 3
|
||||||
|
found3:
|
||||||
|
ADD $1, R7 // found from unrolled loop at index 2
|
||||||
|
found2:
|
||||||
|
ADD $1, R7 // found from unrolled loop at index 1
|
||||||
|
found: // found at index 0
|
||||||
SUB R3, R7 // Return byte where found
|
SUB R3, R7 // Return byte where found
|
||||||
MOVD R7, R3
|
MOVD R7, R3
|
||||||
RET
|
RET
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue