mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
strings, bytes: add ToValidUTF8
The newly added functions create a copy of their input with all bytes in invalid UTF-8 byte sequences mapped to the UTF-8 byte sequence given as replacement parameter.

Fixes #25805

Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/142003
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
parent
07f689420a
commit
3259bc4419
4 changed files with 179 additions and 0 deletions
|
|
@ -631,6 +631,56 @@ func ToTitleSpecial(c unicode.SpecialCase, s string) string {
|
|||
return Map(c.ToTitle, s)
|
||||
}
|
||||
|
||||
// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences
|
||||
// replaced by the replacement string, which may be empty.
|
||||
func ToValidUTF8(s, replacement string) string {
|
||||
var b Builder
|
||||
|
||||
for i, c := range s {
|
||||
if c != utf8.RuneError {
|
||||
continue
|
||||
}
|
||||
|
||||
_, wid := utf8.DecodeRuneInString(s[i:])
|
||||
if wid == 1 {
|
||||
b.Grow(len(s) + len(replacement))
|
||||
b.WriteString(s[:i])
|
||||
s = s[i:]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Fast path for unchanged input
|
||||
if b.Cap() == 0 { // didn't call b.Grow above
|
||||
return s
|
||||
}
|
||||
|
||||
invalid := false // previous byte was from an invalid UTF-8 sequence
|
||||
for i := 0; i < len(s); {
|
||||
c := s[i]
|
||||
if c < utf8.RuneSelf {
|
||||
i++
|
||||
invalid = false
|
||||
b.WriteByte(c)
|
||||
continue
|
||||
}
|
||||
_, wid := utf8.DecodeRuneInString(s[i:])
|
||||
if wid == 1 {
|
||||
i++
|
||||
if !invalid {
|
||||
invalid = true
|
||||
b.WriteString(replacement)
|
||||
}
|
||||
continue
|
||||
}
|
||||
invalid = false
|
||||
b.WriteString(s[i : i+wid])
|
||||
i += wid
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// isSeparator reports whether the rune could mark a word boundary.
|
||||
// TODO: update when package unicode captures more of the properties.
|
||||
func isSeparator(r rune) bool {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue