strings, bytes: add ToValidUTF8

The newly added functions create a copy of their input with all bytes in
invalid UTF-8 byte sequences mapped to the UTF-8 byte sequence
given as replacement parameter.

Fixes #25805

Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/142003
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
Martin Möhrmann 2018-10-13 22:40:23 +02:00 committed by Brad Fitzpatrick
parent 07f689420a
commit 3259bc4419
4 changed files with 179 additions and 0 deletions

View file

@ -631,6 +631,56 @@ func ToTitleSpecial(c unicode.SpecialCase, s string) string {
return Map(c.ToTitle, s)
}
// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences
// replaced by the replacement string, which may be empty.
func ToValidUTF8(s, replacement string) string {
var b Builder
for i, c := range s {
if c != utf8.RuneError {
continue
}
_, wid := utf8.DecodeRuneInString(s[i:])
if wid == 1 {
b.Grow(len(s) + len(replacement))
b.WriteString(s[:i])
s = s[i:]
break
}
}
// Fast path for unchanged input
if b.Cap() == 0 { // didn't call b.Grow above
return s
}
invalid := false // previous byte was from an invalid UTF-8 sequence
for i := 0; i < len(s); {
c := s[i]
if c < utf8.RuneSelf {
i++
invalid = false
b.WriteByte(c)
continue
}
_, wid := utf8.DecodeRuneInString(s[i:])
if wid == 1 {
i++
if !invalid {
invalid = true
b.WriteString(replacement)
}
continue
}
invalid = false
b.WriteString(s[i : i+wid])
i += wid
}
return b.String()
}
// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {