strings, bytes: add ToValidUTF8

The newly added functions create a copy of their input with all bytes in
invalid UTF-8 byte sequences mapped to the UTF-8 byte sequence
given as replacement parameter.

Fixes #25805

Change-Id: Iaf65f65b40c0581c6bb000f1590408d6628321d0
Reviewed-on: https://go-review.googlesource.com/c/go/+/142003
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
Martin Möhrmann 2018-10-13 22:40:23 +02:00 committed by Brad Fitzpatrick
parent 07f689420a
commit 3259bc4419
4 changed files with 179 additions and 0 deletions

View file

@ -1061,6 +1061,36 @@ func BenchmarkToLower(b *testing.B) {
}
}
var toValidUTF8Tests = []struct {
in string
repl string
out string
}{
{"", "\uFFFD", ""},
{"abc", "\uFFFD", "abc"},
{"\uFDDD", "\uFFFD", "\uFDDD"},
{"a\xffb", "\uFFFD", "a\uFFFDb"},
{"a\xffb\uFFFD", "X", "aXb\uFFFD"},
{"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"},
{"a☺\xffb☺\xC0\xAFc☺\xff", "日本語", "a☺日本語b☺日本語c☺日本語"},
{"\xC0\xAF", "\uFFFD", "\uFFFD"},
{"\xE0\x80\xAF", "\uFFFD", "\uFFFD"},
{"\xed\xa0\x80", "abc", "abc"},
{"\xed\xbf\xbf", "\uFFFD", "\uFFFD"},
{"\xF0\x80\x80\xaf", "☺", "☺"},
{"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
{"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
}
func TestToValidUTF8(t *testing.T) {
for _, tc := range toValidUTF8Tests {
got := ToValidUTF8([]byte(tc.in), []byte(tc.repl))
if !Equal(got, []byte(tc.out)) {
t.Errorf("ToValidUTF8(%q, %q) = %q; want %q", tc.in, tc.repl, got, tc.out)
}
}
}
func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) }
type RepeatTest struct {
@ -1703,6 +1733,26 @@ func BenchmarkTrimSpace(b *testing.B) {
}
}
func BenchmarkToValidUTF8(b *testing.B) {
tests := []struct {
name string
input []byte
}{
{"Valid", []byte("typical")},
{"InvalidASCII", []byte("foo\xffbar")},
{"InvalidNonASCII", []byte("日本語\xff日本語")},
}
replacement := []byte("\uFFFD")
b.ResetTimer()
for _, test := range tests {
b.Run(test.name, func(b *testing.B) {
for i := 0; i < b.N; i++ {
ToValidUTF8(test.input, replacement)
}
})
}
}
func makeBenchInputHard() []byte {
tokens := [...]string{
"<a>", "<p>", "<b>", "<strong>",