mirror of
				https://github.com/LadybirdBrowser/ladybird.git
				synced 2025-10-20 16:13:21 +00:00 
			
		
		
		
	
		
			
	
	
		
			441 lines
		
	
	
	
		
			18 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
		
		
			
		
	
	
			441 lines
		
	
	
	
		
			18 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
|   | /*
 | |||
|  |  * Copyright (c) 2025, Shannon Booth <shannon@serenityos.org> | |||
|  |  * | |||
|  |  * SPDX-License-Identifier: BSD-2-Clause | |||
|  |  */ | |||
|  | 
 | |||
|  | #include <AK/Debug.h>
 | |||
|  | #include <LibURL/Pattern/Tokenizer.h>
 | |||
|  | #include <LibUnicode/CharacterTypes.h>
 | |||
|  | 
 | |||
|  | namespace URL::Pattern { | |||
|  | 
 | |||
|  | StringView Token::type_to_string(Token::Type type) | |||
|  | { | |||
|  |     switch (type) { | |||
|  |     case Token::Type::Open: | |||
|  |         return "Open"sv; | |||
|  |     case Token::Type::Close: | |||
|  |         return "Close"sv; | |||
|  |     case Token::Type::Regexp: | |||
|  |         return "Regexp"sv; | |||
|  |     case Token::Type::Name: | |||
|  |         return "Name"sv; | |||
|  |     case Token::Type::Char: | |||
|  |         return "Char"sv; | |||
|  |     case Token::Type::EscapedChar: | |||
|  |         return "EscapedChar"sv; | |||
|  |     case Token::Type::OtherModifier: | |||
|  |         return "OtherModifier"sv; | |||
|  |     case Token::Type::Asterisk: | |||
|  |         return "Asterisk"sv; | |||
|  |     case Token::Type::End: | |||
|  |         return "End"sv; | |||
|  |     case Token::Type::InvalidChar: | |||
|  |         return "InvalidChar"sv; | |||
|  |     } | |||
|  |     VERIFY_NOT_REACHED(); | |||
|  | } | |||
|  | 
 | |||
|  | String Token::to_string() const | |||
|  | { | |||
|  |     return MUST(String::formatted("{}, index: {}, value: '{}'", type_to_string(type), index, value)); | |||
|  | } | |||
|  | 
 | |||
|  | Tokenizer::Tokenizer(Utf8View const& input, Policy policy) | |||
|  |     : m_input(input) | |||
|  |     , m_policy(policy) | |||
|  | { | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#tokenize
 | |||
|  | PatternErrorOr<Vector<Token>> Tokenizer::tokenize(Utf8View const& input, Tokenizer::Policy policy) | |||
|  | { | |||
|  |     dbgln_if(URL_PATTERN_DEBUG, "URLPattern tokenizing input: '{}'", input.as_string()); | |||
|  |     VERIFY(input.validate()); | |||
|  | 
 | |||
|  |     // 1. Let tokenizer be a new tokenizer.
 | |||
|  |     // 2. Set tokenizer’s input to input.
 | |||
|  |     // 3. Set tokenizer’s policy to policy.
 | |||
|  |     Tokenizer tokenizer { input, policy }; | |||
|  | 
 | |||
|  |     // 4. While tokenizer’s index is less than tokenizer’s input's code point length:
 | |||
|  |     while (tokenizer.m_index < tokenizer.m_input.length()) { | |||
|  |         // 1. Run seek and get the next code point given tokenizer and tokenizer’s index.
 | |||
|  |         tokenizer.seek_and_get_the_next_code_point(tokenizer.m_index); | |||
|  | 
 | |||
|  |         // 2. If tokenizer’s code point is U+002A (*):
 | |||
|  |         if (tokenizer.m_code_point == '*') { | |||
|  |             // 1. Run add a token with default position and length given tokenizer and "asterisk".
 | |||
|  |             tokenizer.add_a_token_with_default_position_and_length(Token::Type::Asterisk); | |||
|  | 
 | |||
|  |             // 2. Continue.
 | |||
|  |             continue; | |||
|  |         } | |||
|  | 
 | |||
|  |         // 3. If tokenizer’s code point is U+002B (+) or U+003F (?):
 | |||
|  |         if (tokenizer.m_code_point == '+' || tokenizer.m_code_point == '?') { | |||
|  |             // 1. Run add a token with default position and length given tokenizer and "other-modifier".
 | |||
|  |             tokenizer.add_a_token_with_default_position_and_length(Token::Type::OtherModifier); | |||
|  | 
 | |||
|  |             // 2. Continue.
 | |||
|  |             continue; | |||
|  |         } | |||
|  | 
 | |||
|  |         // 4. If tokenizer’s code point is U+005C (\):
 | |||
|  |         if (tokenizer.m_code_point == '\\') { | |||
|  |             // 1. If tokenizer’s index is equal to tokenizer’s input's code point length − 1:
 | |||
|  |             if (tokenizer.m_index == tokenizer.m_input.length() - 1) { | |||
|  |                 // 1. Run process a tokenizing error given tokenizer, tokenizer’s next index, and tokenizer’s index.
 | |||
|  |                 TRY(tokenizer.process_a_tokenizing_error(tokenizer.m_next_index, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                 // 2. Continue.
 | |||
|  |                 continue; | |||
|  |             } | |||
|  | 
 | |||
|  |             // 2. Let escaped index be tokenizer’s next index.
 | |||
|  |             auto escaped_index = tokenizer.m_next_index; | |||
|  | 
 | |||
|  |             // 3. Run get the next code point given tokenizer.
 | |||
|  |             tokenizer.get_the_next_code_point(); | |||
|  | 
 | |||
|  |             // 4. Run add a token with default length given tokenizer, "escaped-char", tokenizer’s next index, and escaped index.
 | |||
|  |             tokenizer.add_a_token_with_default_length(Token::Type::EscapedChar, tokenizer.m_next_index, escaped_index); | |||
|  | 
 | |||
|  |             // 5. Continue.
 | |||
|  |             continue; | |||
|  |         } | |||
|  | 
 | |||
|  |         // 5. If tokenizer’s code point is U+007B ({):
 | |||
|  |         if (tokenizer.m_code_point == '{') { | |||
|  |             // 1. Run add a token with default position and length given tokenizer and "open".
 | |||
|  |             tokenizer.add_a_token_with_default_position_and_length(Token::Type::Open); | |||
|  | 
 | |||
|  |             // 2. Continue.
 | |||
|  |             continue; | |||
|  |         } | |||
|  | 
 | |||
|  |         // 6. If tokenizer’s code point is U+007D (}):
 | |||
|  |         if (tokenizer.m_code_point == '}') { | |||
|  |             // 1. Run add a token with default position and length given tokenizer and "close".
 | |||
|  |             tokenizer.add_a_token_with_default_position_and_length(Token::Type::Close); | |||
|  | 
 | |||
|  |             // 2. Continue.
 | |||
|  |             continue; | |||
|  |         } | |||
|  | 
 | |||
|  |         // 1. If tokenizer’s code point is U+003A (:):
 | |||
|  |         if (tokenizer.m_code_point == ':') { | |||
|  |             // 1. Let name position be tokenizer’s next index.
 | |||
|  |             auto name_position = tokenizer.m_next_index; | |||
|  | 
 | |||
|  |             // 2. Let name start be name position.
 | |||
|  |             auto name_start = name_position; | |||
|  | 
 | |||
|  |             // 3. While name position is less than tokenizer’s input's code point length:
 | |||
|  |             while (name_position < tokenizer.m_input.length()) { | |||
|  |                 // 1. Run seek and get the next code point given tokenizer and name position.
 | |||
|  |                 tokenizer.seek_and_get_the_next_code_point(name_position); | |||
|  | 
 | |||
|  |                 // 2. Let first code point be true if name position equals name start and false otherwise.
 | |||
|  |                 bool first_code_point = name_position == name_start; | |||
|  | 
 | |||
|  |                 // 3. Let valid code point be the result of running is a valid name code point given tokenizer’s code point and first code point.
 | |||
|  |                 bool valid_code_point = is_a_valid_name_code_point(tokenizer.m_code_point, first_code_point); | |||
|  | 
 | |||
|  |                 // 4. If valid code point is false break.
 | |||
|  |                 if (!valid_code_point) | |||
|  |                     break; | |||
|  | 
 | |||
|  |                 // 5. Set name position to tokenizer’s next index.
 | |||
|  |                 name_position = tokenizer.m_next_index; | |||
|  |             } | |||
|  | 
 | |||
|  |             // 4. If name position is less than or equal to name start:
 | |||
|  |             if (name_position <= name_start) { | |||
|  |                 // 1. Run process a tokenizing error given tokenizer, name start, and tokenizer’s index.
 | |||
|  |                 TRY(tokenizer.process_a_tokenizing_error(name_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                 // 2. Continue.
 | |||
|  |                 continue; | |||
|  |             } | |||
|  | 
 | |||
|  |             // 5. Run add a token with default length given tokenizer, "name", name position, and name start.
 | |||
|  |             tokenizer.add_a_token_with_default_length(Token::Type::Name, name_position, name_start); | |||
|  | 
 | |||
|  |             // 6. Continue.
 | |||
|  |             continue; | |||
|  |         } | |||
|  | 
 | |||
|  |         // 8. If tokenizer’s code point is U+0028 (():
 | |||
|  |         if (tokenizer.m_code_point == '(') { | |||
|  |             // 1. Let depth be 1.
 | |||
|  |             u32 depth = 1; | |||
|  | 
 | |||
|  |             // 2. Let regexp position be tokenizer’s next index.
 | |||
|  |             auto regexp_position = tokenizer.m_next_index; | |||
|  | 
 | |||
|  |             // 3. Let regexp start be regexp position.
 | |||
|  |             auto regexp_start = regexp_position; | |||
|  | 
 | |||
|  |             // 4. Let error be false.
 | |||
|  |             bool error = false; | |||
|  | 
 | |||
|  |             // 5. While regexp position is less than tokenizer’s input's code point length:
 | |||
|  |             while (regexp_position < tokenizer.m_input.length()) { | |||
|  |                 // 1. Run seek and get the next code point given tokenizer and regexp position.
 | |||
|  |                 tokenizer.seek_and_get_the_next_code_point(regexp_position); | |||
|  | 
 | |||
|  |                 // 2. If the result of running is ASCII given tokenizer’s code point is false:
 | |||
|  |                 if (!is_ascii(tokenizer.m_code_point)) { | |||
|  |                     // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                     TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                     // 2. Set error to true.
 | |||
|  |                     error = true; | |||
|  | 
 | |||
|  |                     // 3. Break.
 | |||
|  |                     break; | |||
|  |                 } | |||
|  | 
 | |||
|  |                 // 3. If regexp position equals regexp start and tokenizer’s code point is U+003F (?):
 | |||
|  |                 if (regexp_position == regexp_start && tokenizer.m_code_point == '?') { | |||
|  |                     // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                     TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                     // 2. Set error to true.
 | |||
|  |                     error = true; | |||
|  | 
 | |||
|  |                     // 3. Break.
 | |||
|  |                     break; | |||
|  |                 } | |||
|  | 
 | |||
|  |                 // 4. If tokenizer’s code point is U+005C (\):
 | |||
|  |                 if (tokenizer.m_code_point == '\\') { | |||
|  |                     // 1. If regexp position equals tokenizer’s input's code point length − 1:
 | |||
|  |                     if (regexp_position == tokenizer.m_input.length() - 1) { | |||
|  |                         // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                         TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                         // 2. Set error to true.
 | |||
|  |                         error = true; | |||
|  | 
 | |||
|  |                         // 3. Break
 | |||
|  |                         break; | |||
|  |                     } | |||
|  | 
 | |||
|  |                     // 2. Run get the next code point given tokenizer.
 | |||
|  |                     tokenizer.get_the_next_code_point(); | |||
|  | 
 | |||
|  |                     // 3. If the result of running is ASCII given tokenizer’s code point is false:
 | |||
|  |                     if (!is_ascii(tokenizer.m_code_point)) { | |||
|  |                         // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                         TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                         // 2. Set error to true.
 | |||
|  |                         error = true; | |||
|  | 
 | |||
|  |                         // 3. Break.
 | |||
|  |                         break; | |||
|  |                     } | |||
|  | 
 | |||
|  |                     // 4. Set regexp position to tokenizer’s next index.
 | |||
|  |                     regexp_position = tokenizer.m_next_index; | |||
|  | 
 | |||
|  |                     // 5. Continue.
 | |||
|  |                     continue; | |||
|  |                 } | |||
|  | 
 | |||
|  |                 // 5. If tokenizer’s code point is U+0029 ()):
 | |||
|  |                 if (tokenizer.m_code_point == ')') { | |||
|  |                     // 1. Decrement depth by 1.
 | |||
|  |                     --depth; | |||
|  | 
 | |||
|  |                     // 1. If depth is 0:
 | |||
|  |                     if (depth == 0) { | |||
|  |                         // 1. Set regexp position to tokenizer’s next index.
 | |||
|  |                         regexp_position = tokenizer.m_next_index; | |||
|  | 
 | |||
|  |                         // 2. Break.
 | |||
|  |                         break; | |||
|  |                     } | |||
|  |                 } | |||
|  |                 // 6. Otherwise if tokenizer’s code point is U+0028 (():
 | |||
|  |                 else if (tokenizer.m_code_point == '(') { | |||
|  |                     // 1. Increment depth by 1.
 | |||
|  |                     ++depth; | |||
|  | 
 | |||
|  |                     // 2. If regexp position equals tokenizer’s input's code point length − 1:
 | |||
|  |                     if (regexp_position == tokenizer.m_input.length() - 1) { | |||
|  |                         // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                         TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                         // 2. Set error to true.
 | |||
|  |                         error = true; | |||
|  | 
 | |||
|  |                         // 3. Break
 | |||
|  |                         break; | |||
|  |                     } | |||
|  | 
 | |||
|  |                     // 3. Let temporary position be tokenizer’s next index.
 | |||
|  |                     auto temporary_position = tokenizer.m_next_index; | |||
|  | 
 | |||
|  |                     // 4. Run get the next code point given tokenizer.
 | |||
|  |                     tokenizer.get_the_next_code_point(); | |||
|  | 
 | |||
|  |                     // 5. If tokenizer’s code point is not U+003F (?):
 | |||
|  |                     if (tokenizer.m_code_point != '?') { | |||
|  |                         // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                         TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                         // 2. Set error to true.
 | |||
|  |                         error = true; | |||
|  | 
 | |||
|  |                         // 3. Break.
 | |||
|  |                         break; | |||
|  |                     } | |||
|  | 
 | |||
|  |                     // 6. Set tokenizer’s next index to temporary position.
 | |||
|  |                     tokenizer.m_next_index = temporary_position; | |||
|  |                 } | |||
|  | 
 | |||
|  |                 // 7. Set regexp position to tokenizer’s next index.
 | |||
|  |                 regexp_position = tokenizer.m_next_index; | |||
|  |             } | |||
|  | 
 | |||
|  |             // 6. If error is true continue.
 | |||
|  |             if (error) | |||
|  |                 continue; | |||
|  | 
 | |||
|  |             // 7. If depth is not zero:
 | |||
|  |             if (depth != 0) { | |||
|  |                 // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                 TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                 // 2. Continue.
 | |||
|  |                 continue; | |||
|  |             } | |||
|  | 
 | |||
|  |             // 8. Let regexp length be regexp position − regexp start − 1.
 | |||
|  |             auto regexp_length = regexp_position - regexp_start - 1; | |||
|  | 
 | |||
|  |             // 9. If regexp length is zero:
 | |||
|  |             if (regexp_length == 0) { | |||
|  |                 // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
 | |||
|  |                 TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index)); | |||
|  | 
 | |||
|  |                 // 2. Continue.
 | |||
|  |                 continue; | |||
|  |             } | |||
|  | 
 | |||
|  |             // 10. Run add a token given tokenizer, "regexp", regexp position, regexp start, and regexp length.
 | |||
|  |             tokenizer.add_a_token(Token::Type::Regexp, regexp_position, regexp_start, regexp_length); | |||
|  | 
 | |||
|  |             // 11. Continue.
 | |||
|  |             continue; | |||
|  |         } | |||
|  | 
 | |||
|  |         // 9. Run add a token with default position and length given tokenizer and "char".
 | |||
|  |         tokenizer.add_a_token_with_default_position_and_length(Token::Type::Char); | |||
|  |     } | |||
|  | 
 | |||
|  |     // 5. Run add a token with default length given tokenizer, "end", tokenizer’s index, and tokenizer’s index.
 | |||
|  |     tokenizer.add_a_token_with_default_length(Token::Type::End, tokenizer.m_index, tokenizer.m_index); | |||
|  | 
 | |||
|  |     // 6. Return tokenizer’s token list.
 | |||
|  |     if constexpr (URL_PATTERN_DEBUG) { | |||
|  |         for (auto const& token : tokenizer.m_token_list) | |||
|  |             dbgln("{}", token.to_string()); | |||
|  |     } | |||
|  | 
 | |||
|  |     return tokenizer.m_token_list; | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#get-the-next-code-point
 | |||
|  | void Tokenizer::get_the_next_code_point() | |||
|  | { | |||
|  |     // 1. Set tokenizer’s code point to the Unicode code point in tokenizer’s input at the position indicated by tokenizer’s next index.
 | |||
|  |     m_code_point = *m_input.unicode_substring_view(m_next_index, 1).begin(); | |||
|  | 
 | |||
|  |     // 2. Increment tokenizer’s next index by 1.
 | |||
|  |     ++m_next_index; | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
 | |||
|  | void Tokenizer::seek_and_get_the_next_code_point(u32 index) | |||
|  | { | |||
|  |     // 1. Set tokenizer’s next index to index.
 | |||
|  |     m_next_index = index; | |||
|  | 
 | |||
|  |     // 2. Run get the next code point given tokenizer.
 | |||
|  |     get_the_next_code_point(); | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#add-a-token
 | |||
|  | void Tokenizer::add_a_token(Token::Type type, u32 next_position, u32 value_position, u32 value_length) | |||
|  | { | |||
|  |     // 1. Let token be a new token.
 | |||
|  |     Token token; | |||
|  | 
 | |||
|  |     // 2. Set token’s type to type.
 | |||
|  |     token.type = type; | |||
|  | 
 | |||
|  |     // 3. Set token’s index to tokenizer’s index.
 | |||
|  |     token.index = m_index; | |||
|  | 
 | |||
|  |     // 4. Set token’s value to the code point substring from value position with length value length within tokenizer’s input.
 | |||
|  |     token.value = MUST(String::from_utf8(m_input.unicode_substring_view(value_position, value_length).as_string())); | |||
|  | 
 | |||
|  |     // 5. Append token to the back of tokenizer’s token list.
 | |||
|  |     m_token_list.append(move(token)); | |||
|  | 
 | |||
|  |     // 5. Set tokenizer’s index to next position.
 | |||
|  |     m_index = next_position; | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
 | |||
|  | void Tokenizer::add_a_token_with_default_length(Token::Type type, u32 next_position, u32 value_position) | |||
|  | { | |||
|  |     // 1. Let computed length be next position − value position.
 | |||
|  |     auto computed_length = next_position - value_position; | |||
|  | 
 | |||
|  |     // 2. Run add a token given tokenizer, type, next position, value position, and computed length.
 | |||
|  |     add_a_token(type, next_position, value_position, computed_length); | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
 | |||
|  | void Tokenizer::add_a_token_with_default_position_and_length(Token::Type type) | |||
|  | { | |||
|  |     // 1. Run add a token with default length given tokenizer, type, tokenizer’s next index, and tokenizer’s index.
 | |||
|  |     add_a_token_with_default_length(type, m_next_index, m_index); | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
 | |||
|  | PatternErrorOr<void> Tokenizer::process_a_tokenizing_error(u32 next_position, u32 value_position) | |||
|  | { | |||
|  |     // 1. If tokenizer’s policy is "strict", then throw a TypeError.
 | |||
|  |     if (m_policy == Policy::Strict) | |||
|  |         return ErrorInfo { "Error processing a token"_string }; // FIXME: Improve this error!
 | |||
|  | 
 | |||
|  |     // 2. Assert: tokenizer’s policy is "lenient".
 | |||
|  |     VERIFY(m_policy == Policy::Lenient); | |||
|  | 
 | |||
|  |     // 3. Run add a token with default length given tokenizer, "invalid-char", next position, and value position.
 | |||
|  |     add_a_token_with_default_length(Token::Type::InvalidChar, next_position, value_position); | |||
|  | 
 | |||
|  |     return {}; | |||
|  | } | |||
|  | 
 | |||
|  | // https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point
 | |||
|  | bool Tokenizer::is_a_valid_name_code_point(u32 code_point, bool first) | |||
|  | { | |||
|  |     // 1. If first is true return the result of checking if code point is contained in the IdentifierStart set of code points.
 | |||
|  |     if (first) | |||
|  |         return code_point == '$' || code_point == '_' || Unicode::code_point_has_identifier_start_property(code_point); | |||
|  | 
 | |||
|  |     // 2. Otherwise return the result of checking if code point is contained in the IdentifierPart set of code points.
 | |||
|  |     return code_point == '$' || Unicode::code_point_has_identifier_continue_property(code_point); | |||
|  | } | |||
|  | 
 | |||
|  | } |