| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  | /*
 | 
					
						
							| 
									
										
										
										
											2023-03-04 21:42:38 +00:00
										 |  |  |  |  * Copyright (c) 2022-2023, Linus Groh <linusg@serenityos.org> | 
					
						
							| 
									
										
										
										
											2022-10-23 04:02:56 +02:00
										 |  |  |  |  * Copyright (c) 2022, networkException <networkexception@serenityos.org> | 
					
						
							| 
									
										
										
										
											2023-02-03 21:49:54 +01:00
										 |  |  |  |  * Copyright (c) 2023, Kenneth Myhra <kennethmyhra@serenityos.org> | 
					
						
							| 
									
										
										
										
											2023-02-15 13:56:37 +00:00
										 |  |  |  |  * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org> | 
					
						
							| 
									
										
										
										
											2024-10-14 10:51:15 +02:00
										 |  |  |  |  * Copyright (c) 2024, Andreas Kling <andreas@ladybird.org> | 
					
						
							| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  |  * | 
					
						
							|  |  |  |  |  * SPDX-License-Identifier: BSD-2-Clause | 
					
						
							|  |  |  |  |  */ | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-15 13:56:37 +00:00
										 |  |  |  | #include <AK/CharacterTypes.h>
 | 
					
						
							| 
									
										
										
										
											2024-10-14 10:51:15 +02:00
										 |  |  |  | #include <AK/FlyString.h>
 | 
					
						
							| 
									
										
										
										
											2024-03-14 07:11:20 -04:00
										 |  |  |  | #include <AK/GenericLexer.h>
 | 
					
						
							| 
									
										
										
										
											2023-03-04 21:42:38 +00:00
										 |  |  |  | #include <AK/String.h>
 | 
					
						
							| 
									
										
										
										
											2022-10-23 04:02:56 +02:00
										 |  |  |  | #include <AK/Utf16View.h>
 | 
					
						
							| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  | #include <AK/Utf8View.h>
 | 
					
						
							|  |  |  |  | #include <LibWeb/Infra/CharacterTypes.h>
 | 
					
						
							|  |  |  |  | #include <LibWeb/Infra/Strings.h>
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | namespace Web::Infra { | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-15 13:56:37 +00:00
										 |  |  |  | // https://infra.spec.whatwg.org/#ascii-case-insensitive
 | 
					
						
							|  |  |  |  | bool is_ascii_case_insensitive_match(StringView a, StringView b) | 
					
						
							|  |  |  |  | { | 
					
						
							|  |  |  |  |     // A string A is an ASCII case-insensitive match for a string B,
 | 
					
						
							|  |  |  |  |     // if the ASCII lowercase of A is the ASCII lowercase of B.
 | 
					
						
							| 
									
										
										
										
											2023-03-10 09:47:34 +01:00
										 |  |  |  |     return AK::StringUtils::equals_ignoring_ascii_case(a, b); | 
					
						
							| 
									
										
										
										
											2023-02-15 13:56:37 +00:00
										 |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-14 07:11:20 -04:00
										 |  |  |  | // https://infra.spec.whatwg.org/#normalize-newlines
 | 
					
						
							|  |  |  |  | String normalize_newlines(String const& string) | 
					
						
							|  |  |  |  | { | 
					
						
							|  |  |  |  |     // To normalize newlines in a string, replace every U+000D CR U+000A LF code point pair with a single U+000A LF
 | 
					
						
							|  |  |  |  |     // code point, and then replace every remaining U+000D CR code point with a U+000A LF code point.
 | 
					
						
							|  |  |  |  |     if (!string.contains('\r')) | 
					
						
							|  |  |  |  |         return string; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     StringBuilder builder; | 
					
						
							|  |  |  |  |     GenericLexer lexer { string }; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     while (!lexer.is_eof()) { | 
					
						
							|  |  |  |  |         builder.append(lexer.consume_until('\r')); | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if (lexer.peek() == '\r') { | 
					
						
							|  |  |  |  |             lexer.ignore(1 + static_cast<size_t>(lexer.peek(1) == '\n')); | 
					
						
							|  |  |  |  |             builder.append('\n'); | 
					
						
							|  |  |  |  |         } | 
					
						
							|  |  |  |  |     } | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     return MUST(builder.to_string()); | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  | // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace
 | 
					
						
							| 
									
										
										
										
											2023-03-04 21:42:38 +00:00
										 |  |  |  | ErrorOr<String> strip_and_collapse_whitespace(StringView string) | 
					
						
							| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  | { | 
					
						
							|  |  |  |  |     // Replace any sequence of one or more consecutive code points that are ASCII whitespace in the string with a single U+0020 SPACE code point.
 | 
					
						
							|  |  |  |  |     StringBuilder builder; | 
					
						
							|  |  |  |  |     for (auto code_point : Utf8View { string }) { | 
					
						
							|  |  |  |  |         if (Infra::is_ascii_whitespace(code_point)) { | 
					
						
							|  |  |  |  |             if (!builder.string_view().ends_with(' ')) | 
					
						
							|  |  |  |  |                 builder.append(' '); | 
					
						
							|  |  |  |  |             continue; | 
					
						
							|  |  |  |  |         } | 
					
						
							| 
									
										
										
										
											2023-03-04 21:42:38 +00:00
										 |  |  |  |         TRY(builder.try_append_code_point(code_point)); | 
					
						
							| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  |     } | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     // ...and then remove any leading and trailing ASCII whitespace from that string.
 | 
					
						
							| 
									
										
										
										
											2023-03-04 21:42:38 +00:00
										 |  |  |  |     return String::from_utf8(builder.string_view().trim(Infra::ASCII_WHITESPACE)); | 
					
						
							| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-23 04:02:56 +02:00
										 |  |  |  | // https://infra.spec.whatwg.org/#code-unit-prefix
 | 
					
						
							|  |  |  |  | bool is_code_unit_prefix(StringView potential_prefix, StringView input) | 
					
						
							|  |  |  |  | { | 
					
						
							| 
									
										
										
										
											2023-01-06 13:19:34 -05:00
										 |  |  |  |     auto potential_prefix_utf16 = utf8_to_utf16(potential_prefix).release_value_but_fixme_should_propagate_errors(); | 
					
						
							|  |  |  |  |     auto input_utf16 = utf8_to_utf16(input).release_value_but_fixme_should_propagate_errors(); | 
					
						
							| 
									
										
										
										
											2022-10-23 04:02:56 +02:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     // 1. Let i be 0.
 | 
					
						
							|  |  |  |  |     size_t i = 0; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     // 2. While true:
 | 
					
						
							|  |  |  |  |     while (true) { | 
					
						
							|  |  |  |  |         // 1. If i is greater than or equal to potentialPrefix’s length, then return true.
 | 
					
						
							|  |  |  |  |         if (i >= potential_prefix.length()) | 
					
						
							|  |  |  |  |             return true; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         // 2. If i is greater than or equal to input’s length, then return false.
 | 
					
						
							|  |  |  |  |         if (i >= input.length()) | 
					
						
							|  |  |  |  |             return false; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         // 3. Let potentialPrefixCodeUnit be the ith code unit of potentialPrefix.
 | 
					
						
							|  |  |  |  |         auto potential_prefix_code_unit = Utf16View(potential_prefix_utf16).code_unit_at(i); | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         // 4. Let inputCodeUnit be the ith code unit of input.
 | 
					
						
							|  |  |  |  |         auto input_code_unit = Utf16View(input_utf16).code_unit_at(i); | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         // 5. Return false if potentialPrefixCodeUnit is not inputCodeUnit.
 | 
					
						
							|  |  |  |  |         if (potential_prefix_code_unit != input_code_unit) | 
					
						
							|  |  |  |  |             return false; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         // 6. Set i to i + 1.
 | 
					
						
							|  |  |  |  |         ++i; | 
					
						
							|  |  |  |  |     } | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-03 21:49:54 +01:00
										 |  |  |  | // https://infra.spec.whatwg.org/#scalar-value-string
 | 
					
						
							|  |  |  |  | ErrorOr<String> convert_to_scalar_value_string(StringView string) | 
					
						
							|  |  |  |  | { | 
					
						
							|  |  |  |  |     // To convert a string into a scalar value string, replace any surrogates with U+FFFD.
 | 
					
						
							|  |  |  |  |     StringBuilder scalar_value_builder; | 
					
						
							|  |  |  |  |     auto utf8_view = Utf8View { string }; | 
					
						
							|  |  |  |  |     for (u32 code_point : utf8_view) { | 
					
						
							|  |  |  |  |         if (is_unicode_surrogate(code_point)) | 
					
						
							|  |  |  |  |             code_point = 0xFFFD; | 
					
						
							| 
									
										
										
										
											2023-12-03 23:24:48 +01:00
										 |  |  |  |         scalar_value_builder.append_code_point(code_point); | 
					
						
							| 
									
										
										
										
											2023-02-03 21:49:54 +01:00
										 |  |  |  |     } | 
					
						
							|  |  |  |  |     return scalar_value_builder.to_string(); | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-25 23:23:26 +01:00
										 |  |  |  | // https://infra.spec.whatwg.org/#ascii-lowercase
 | 
					
						
							| 
									
										
										
										
											2023-03-04 22:41:57 +00:00
										 |  |  |  | ErrorOr<String> to_ascii_lowercase(StringView string) | 
					
						
							| 
									
										
										
										
											2023-02-25 23:23:26 +01:00
										 |  |  |  | { | 
					
						
							|  |  |  |  |     // To ASCII lowercase a string, replace all ASCII upper alphas in the string with their
 | 
					
						
							|  |  |  |  |     // corresponding code point in ASCII lower alpha.
 | 
					
						
							|  |  |  |  |     StringBuilder string_builder; | 
					
						
							|  |  |  |  |     auto utf8_view = Utf8View { string }; | 
					
						
							|  |  |  |  |     for (u32 code_point : utf8_view) { | 
					
						
							| 
									
										
										
										
											2023-03-04 22:41:57 +00:00
										 |  |  |  |         code_point = AK::to_ascii_lowercase(code_point); | 
					
						
							| 
									
										
										
										
											2023-12-03 23:20:33 +01:00
										 |  |  |  |         string_builder.append_code_point(code_point); | 
					
						
							| 
									
										
										
										
											2023-02-25 23:23:26 +01:00
										 |  |  |  |     } | 
					
						
							|  |  |  |  |     return string_builder.to_string(); | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-25 23:47:54 +01:00
										 |  |  |  | // https://infra.spec.whatwg.org/#ascii-uppercase
 | 
					
						
							| 
									
										
										
										
											2023-03-04 22:41:57 +00:00
										 |  |  |  | ErrorOr<String> to_ascii_uppercase(StringView string) | 
					
						
							| 
									
										
										
										
											2023-02-25 23:47:54 +01:00
										 |  |  |  | { | 
					
						
							|  |  |  |  |     // To ASCII uppercase a string, replace all ASCII lower alphas in the string with their
 | 
					
						
							|  |  |  |  |     // corresponding code point in ASCII upper alpha.
 | 
					
						
							|  |  |  |  |     StringBuilder string_builder; | 
					
						
							|  |  |  |  |     auto utf8_view = Utf8View { string }; | 
					
						
							|  |  |  |  |     for (u32 code_point : utf8_view) { | 
					
						
							| 
									
										
										
										
											2023-03-04 22:41:57 +00:00
										 |  |  |  |         code_point = AK::to_ascii_uppercase(code_point); | 
					
						
							| 
									
										
										
										
											2023-12-03 23:21:32 +01:00
										 |  |  |  |         string_builder.append_code_point(code_point); | 
					
						
							| 
									
										
										
										
											2023-02-25 23:47:54 +01:00
										 |  |  |  |     } | 
					
						
							|  |  |  |  |     return string_builder.to_string(); | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-21 14:45:35 +11:00
										 |  |  |  | // https://infra.spec.whatwg.org/#isomorphic-encode
 | 
					
						
							|  |  |  |  | ByteBuffer isomorphic_encode(StringView input) | 
					
						
							|  |  |  |  | { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:26:41 -08:00
										 |  |  |  |     // To isomorphic encode an isomorphic string input: return a byte sequence whose length is equal to input’s code
 | 
					
						
							|  |  |  |  |     // point length and whose bytes have the same values as the values of input’s code points, in the same order.
 | 
					
						
							|  |  |  |  |     // NOTE: This is essentially spec-speak for "Encode as ISO-8859-1 / Latin-1".
 | 
					
						
							| 
									
										
										
										
											2024-10-21 14:45:35 +11:00
										 |  |  |  |     ByteBuffer buf = {}; | 
					
						
							|  |  |  |  |     for (auto code_point : Utf8View { input }) { | 
					
						
							| 
									
										
										
										
											2025-01-03 09:22:34 +11:00
										 |  |  |  |         // VERIFY(code_point <= 0xFF);
 | 
					
						
							|  |  |  |  |         if (code_point > 0xFF) | 
					
						
							|  |  |  |  |             dbgln("FIXME: Trying to isomorphic encode a string with code points > U+00FF."); | 
					
						
							| 
									
										
										
										
											2024-10-21 14:45:35 +11:00
										 |  |  |  |         buf.append((u8)code_point); | 
					
						
							|  |  |  |  |     } | 
					
						
							|  |  |  |  |     return buf; | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | // https://infra.spec.whatwg.org/#isomorphic-decode
 | 
					
						
							|  |  |  |  | String isomorphic_decode(ReadonlyBytes input) | 
					
						
							|  |  |  |  | { | 
					
						
							| 
									
										
										
										
											2024-12-12 10:26:41 -08:00
										 |  |  |  |     // To isomorphic decode a byte sequence input, return a string whose code point length is equal
 | 
					
						
							|  |  |  |  |     // to input’s length and whose code points have the same values as the values of input’s bytes, in the same order.
 | 
					
						
							|  |  |  |  |     // NOTE: This is essentially spec-speak for "Decode as ISO-8859-1 / Latin-1".
 | 
					
						
							| 
									
										
										
										
											2024-10-21 14:45:35 +11:00
										 |  |  |  |     StringBuilder builder(input.size()); | 
					
						
							|  |  |  |  |     for (u8 code_point : input) { | 
					
						
							|  |  |  |  |         builder.append_code_point(code_point); | 
					
						
							|  |  |  |  |     } | 
					
						
							|  |  |  |  |     return builder.to_string_without_validation(); | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-11 10:47:16 +01:00
										 |  |  |  | // https://infra.spec.whatwg.org/#code-unit-less-than
 | 
					
						
							|  |  |  |  | bool code_unit_less_than(StringView a, StringView b) | 
					
						
							|  |  |  |  | { | 
					
						
							|  |  |  |  |     // 1. If b is a code unit prefix of a, then return false.
 | 
					
						
							|  |  |  |  |     if (is_code_unit_prefix(b, a)) | 
					
						
							|  |  |  |  |         return false; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     // 2. If a is a code unit prefix of b, then return true.
 | 
					
						
							|  |  |  |  |     if (is_code_unit_prefix(a, b)) | 
					
						
							|  |  |  |  |         return true; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     auto code_units_a = MUST(utf8_to_utf16(a)); | 
					
						
							|  |  |  |  |     auto code_units_b = MUST(utf8_to_utf16(b)); | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     auto view_a = Utf16View(code_units_a); | 
					
						
							|  |  |  |  |     auto view_b = Utf16View(code_units_b); | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     // 3. Let n be the smallest index such that the nth code unit of a is different from the nth code unit of b.
 | 
					
						
							|  |  |  |  |     //    (There has to be such an index, since neither string is a prefix of the other.)
 | 
					
						
							|  |  |  |  |     size_t n = 0; | 
					
						
							|  |  |  |  |     size_t min_length = min(view_a.length_in_code_units(), view_b.length_in_code_units()); | 
					
						
							|  |  |  |  |     while (n < min_length && view_a.code_unit_at(n) == view_b.code_unit_at(n)) | 
					
						
							|  |  |  |  |         ++n; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     // 4. If the nth code unit of a is less than the nth code unit of b, then return true.
 | 
					
						
							|  |  |  |  |     if (view_a.code_unit_at(n) < view_b.code_unit_at(n)) | 
					
						
							|  |  |  |  |         return true; | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     // 5. Return false.
 | 
					
						
							|  |  |  |  |     return false; | 
					
						
							|  |  |  |  | } | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-01 18:39:40 +01:00
										 |  |  |  | } |