2021-05-25 22:13:15 +02:00
/*
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 * Copyright (c) 2023-2024, Shannon Booth <shannon@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
2023-12-16 17:49:34 +03:30
# include <AK/ByteString.h>
2021-06-01 21:18:08 +02:00
# include <AK/CharacterTypes.h>
2021-05-25 22:13:15 +02:00
# include <AK/Debug.h>
2023-07-23 21:09:29 +12:00
# include <AK/IntegralMath.h>
2021-05-25 22:13:15 +02:00
# include <AK/Optional.h>
# include <AK/SourceLocation.h>
# include <AK/StringBuilder.h>
# include <AK/StringUtils.h>
# include <AK/Utf8View.h>
2024-08-05 16:03:53 +01:00
# include <LibTextCodec/Decoder.h>
# include <LibTextCodec/Encoder.h>
2024-03-18 16:22:27 +13:00
# include <LibURL/Parser.h>
2024-03-25 10:57:36 -04:00
# include <LibUnicode/IDNA.h>
2021-05-25 22:13:15 +02:00
2024-03-18 16:22:27 +13:00
namespace URL {
2021-05-25 22:13:15 +02:00
2021-06-03 12:43:08 +02:00
// NOTE: This is similar to the LibC macro EOF = -1.
// Sentinel meaning "no more input". 0xFFFFFFFF lies above the largest Unicode code point (U+10FFFF),
// so it cannot be confused with a code point decoded from the input.
constexpr u32 end_of_file = 0xFFFFFFFF ;
2024-08-05 01:17:14 +12:00
// https://url.spec.whatwg.org/#forbidden-host-code-point
// Returns true if code_point may not appear in any host.
static bool is_forbidden_host_code_point(u32 code_point)
{
    // A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR, U+0020 SPACE,
    // U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>), U+003F (?), U+0040 (@), U+005B ([),
    // U+005C (\), U+005D (]), U+005E (^), or U+007C (|).
    constexpr auto forbidden_host_code_points = "\0\t\n\r #/:<>?@[\\]^|"sv;
    return forbidden_host_code_points.contains(code_point);
}
// https://url.spec.whatwg.org/#forbidden-domain-code-point
// Returns true if code_point may not appear in a domain.
static bool is_forbidden_domain_code_point(u32 code_point)
{
    // A forbidden domain code point is a forbidden host code point, a C0 control, U+0025 (%), or U+007F DELETE.
    if (is_forbidden_host_code_point(code_point))
        return true;
    if (is_ascii_c0_control(code_point))
        return true;
    return code_point == '%' || code_point == 0x7F;
}
2023-10-04 00:01:56 -07:00
// https://url.spec.whatwg.org/#url-code-points
2022-09-12 18:32:52 +02:00
static bool is_url_code_point ( u32 code_point )
2021-05-25 22:13:15 +02:00
{
2023-10-04 00:01:56 -07:00
// The URL code points are ASCII alphanumeric, U+0021 (!), U+0024 ($), U+0026 (&),
// U+0027 ('), U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS, U+002A (*),
// U+002B (+), U+002C (,), U+002D (-), U+002E (.), U+002F (/), U+003A (:),
// U+003B (;), U+003D (=), U+003F (?), U+0040 (@), U+005F (_), U+007E (~), and code
// points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and
// noncharacters.
return is_ascii_alphanumeric ( code_point ) | | " !$&'()*+,-./: ; = ? @ _ ~ " sv.contains(code_point)
| | ( code_point > = 0x00A0 & & code_point < = 0x10FFFD & & ! is_unicode_surrogate ( code_point ) & & ! is_unicode_noncharacter ( code_point ) ) ;
2021-05-25 22:13:15 +02:00
}
2021-06-03 12:03:56 +02:00
// Logs a validation error together with the source location of the caller.
// Produces output only when URL_PARSER_DEBUG is enabled.
static void report_validation_error(SourceLocation const& location = SourceLocation::current())
{
    if constexpr (URL_PARSER_DEBUG)
        dbgln("URL::Parser::basic_parse: Validation error! {}", location);
}
2023-10-04 00:02:06 -07:00
// https://url.spec.whatwg.org/#concept-opaque-host-parser
2024-03-18 16:22:27 +13:00
static Optional < Host > parse_opaque_host ( StringView input )
2021-05-25 22:13:15 +02:00
{
2023-10-04 00:02:06 -07:00
// 1. If input contains a forbidden host code point, host-invalid-code-point validation error, return failure.
2024-08-05 01:17:14 +12:00
for ( auto code_point : Utf8View { input } ) {
if ( is_forbidden_host_code_point ( code_point ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
}
2023-10-04 00:02:06 -07:00
// 2. If input contains a code point that is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
// 3. If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, invalid-URL-unit validation error.
// NOTE: These steps are not implemented because they are not cheap checks and exist just to report validation errors. With how we
// currently report validation errors, they are only useful for debugging efforts in the URL parsing code.
// 4. Return the result of running UTF-8 percent-encode on input using the C0 control percent-encode set.
2024-08-10 13:12:19 +12:00
return percent_encode ( input , PercentEncodeSet : : C0Control ) ;
2021-05-25 22:13:15 +02:00
}
2023-07-23 21:09:29 +12:00
// Result tuple of the IPv4 number parser: (output, validationError).
struct ParsedIPv4Number {
    // The parsed numeric value.
    u32 number = 0;
    // Set when the parser flagged the part (non-decimal prefix, or nothing left after the prefix).
    bool validation_error = false;
};
// https://url.spec.whatwg.org/#ipv4-number-parser
// Parses one dot-separated part of an IPv4 address in decimal, octal ("0" prefix) or hexadecimal ("0x"/"0X" prefix).
// Returns an empty Optional on failure: empty input, a digit invalid for the radix, or a value that does not fit in a u32.
static Optional<ParsedIPv4Number> parse_ipv4_number(StringView input)
{
    // 1. If input is the empty string, then return failure.
    if (input.is_empty())
        return {};

    // 2. Let validationError be false.
    bool saw_non_decimal_prefix = false;

    // 3. Let R be 10.
    u8 base = 10;

    bool const has_two_code_points = input.length() >= 2;

    // 4. If input contains at least two code points and the first two code points are either "0X" or "0x", then:
    if (has_two_code_points && (input.starts_with("0X"sv) || input.starts_with("0x"sv))) {
        // 1. Set validationError to true.
        saw_non_decimal_prefix = true;

        // 2. Remove the first two code points from input.
        input = input.substring_view(2);

        // 3. Set R to 16.
        base = 16;
    }
    // 5. Otherwise, if input contains at least two code points and the first code point is U+0030 (0), then:
    else if (has_two_code_points && input[0] == '0') {
        // 1. Set validationError to true.
        saw_non_decimal_prefix = true;

        // 2. Remove the first code point from input.
        input = input.substring_view(1);

        // 3. Set R to 8.
        base = 8;
    }

    // 6. If input is the empty string, then return (0, true).
    if (input.is_empty())
        return ParsedIPv4Number { 0, true };

    // 7. If input contains a code point that is not a radix-R digit, then return failure.
    // 8. Let output be the mathematical integer value that is represented by input in radix-R notation,
    //    using ASCII hex digits for digits with values 0 through 15.
    Optional<u32> parsed;
    switch (base) {
    case 8:
        if (!all_of(input, [](auto ch) { return is_ascii_octal_digit(ch); }))
            return {};
        parsed = AK::StringUtils::convert_to_uint_from_octal(input);
        break;
    case 10:
        if (!all_of(input, [](auto ch) { return is_ascii_digit(ch); }))
            return {};
        parsed = input.to_number<u32>();
        break;
    case 16:
        if (!all_of(input, [](auto ch) { return is_ascii_hex_digit(ch); }))
            return {};
        parsed = AK::StringUtils::convert_to_uint_from_hex(input);
        break;
    default:
        VERIFY_NOT_REACHED();
    }

    // NOTE: Parsing may have failed due to overflow.
    if (!parsed.has_value())
        return {};

    // 9. Return (output, validationError).
    return ParsedIPv4Number { parsed.value(), saw_non_decimal_prefix };
}
// https://url.spec.whatwg.org/#concept-ipv4-parser
2024-03-18 16:22:27 +13:00
static Optional < IPv4Address > parse_ipv4_address ( StringView input )
2023-07-23 21:09:29 +12:00
{
// 1. Let parts be the result of strictly splitting input on U+002E (.).
auto parts = input . split_view ( " . " sv , SplitBehavior : : KeepEmpty ) ;
// 2. If the last item in parts is the empty string, then:
if ( parts . last ( ) . is_empty ( ) ) {
// 1. IPv4-empty-part validation error.
report_validation_error ( ) ;
// 2. If parts’ s size is greater than 1, then remove the last item from parts.
if ( parts . size ( ) > 1 )
parts . take_last ( ) ;
}
// 3. If parts’ s size is greater than 4, IPv4-too-many-parts validation error, return failure.
if ( parts . size ( ) > 4 ) {
report_validation_error ( ) ;
return { } ;
}
// 4. Let numbers be an empty list.
Vector < u32 , 4 > numbers ;
// 5. For each part of parts:
for ( auto const & part : parts ) {
// 1. Let result be the result of parsing part.
auto const result = parse_ipv4_number ( part ) ;
// 2. If result is failure, IPv4-non-numeric-part validation error, return failure.
if ( ! result . has_value ( ) ) {
report_validation_error ( ) ;
return { } ;
}
// 3. If result[1] is true, IPv4-non-decimal-part validation error.
if ( result - > validation_error )
report_validation_error ( ) ;
// 4. Append result[0] to numbers.
numbers . append ( result - > number ) ;
}
// 6. If any item in numbers is greater than 255, IPv4-out-of-range-part validation error.
// 7. If any but the last item in numbers is greater than 255, then return failure.
for ( size_t i = 0 ; i < numbers . size ( ) ; + + i ) {
if ( numbers [ i ] > 255 ) {
report_validation_error ( ) ;
if ( i ! = numbers . size ( ) - 1 )
return { } ;
}
}
// 8. If the last item in numbers is greater than or equal to 256^(5 − numbers’ s size), then return failure.
2024-03-18 16:22:27 +13:00
if ( numbers . last ( ) > = AK : : pow < size_t > ( 256 , 5 - numbers . size ( ) ) )
2023-07-23 21:09:29 +12:00
return { } ;
// 9. Let ipv4 be the last item in numbers.
auto ipv4 = numbers . last ( ) ;
// 10. Remove the last item from numbers.
numbers . take_last ( ) ;
// 11. Let counter be 0.
u8 counter = 0 ;
// 12. For each n of numbers:
for ( u32 n : numbers ) {
// 1. Increment ipv4 by n × 256^(3 − counter).
2024-03-18 16:22:27 +13:00
ipv4 + = n * AK : : pow < size_t > ( 256 , 3 - counter ) ;
2023-07-23 21:09:29 +12:00
// 2. Increment counter by 1.
+ + counter ;
}
// 13. Return ipv4.
return ipv4 ;
}
2023-07-17 06:52:29 +02:00
// https://url.spec.whatwg.org/#concept-ipv6-parser
2024-03-18 16:22:27 +13:00
static Optional < IPv6Address > parse_ipv6_address ( StringView input )
2023-07-17 06:52:29 +02:00
{
// 1. Let address be a new IPv6 address whose IPv6 pieces are all 0.
Array < u16 , 8 > address { } ;
// 2. Let pieceIndex be 0.
size_t piece_index = 0 ;
// 3. Let compress be null.
Optional < size_t > compress ;
Vector < u32 > code_points ;
code_points . ensure_capacity ( input . length ( ) ) ;
for ( auto code_point : Utf8View { input } ) {
code_points . append ( code_point ) ;
}
// 4. Let pointer be a pointer for input.
size_t pointer = 0 ;
auto c = [ & ] ( ) - > u32 {
if ( pointer > = code_points . size ( ) )
return end_of_file ;
return code_points [ pointer ] ;
} ;
auto remaining = [ & ] ( ) - > ReadonlySpan < u32 > {
if ( ( pointer + 1 ) > = code_points . size ( ) )
return { } ;
return code_points . span ( ) . slice ( pointer + 1 ) ;
} ;
// 5. If c is U+003A (:), then:
if ( c ( ) = = ' : ' ) {
// 1. If remaining does not start with U+003A (:), IPv6-invalid-compression validation error, return failure.
if ( remaining ( ) . is_empty ( ) | | remaining ( ) [ 0 ] ! = ' : ' ) {
report_validation_error ( ) ;
return { } ;
}
// 2. Increase pointer by 2.
pointer + = 2 ;
// 3. Increase pieceIndex by 1 and then set compress to pieceIndex.
+ + piece_index ;
compress = piece_index ;
}
// 6. While c is not the EOF code point:
while ( c ( ) ! = end_of_file ) {
// 1. If pieceIndex is 8, IPv6-too-many-pieces validation error, return failure.
if ( piece_index = = 8 ) {
report_validation_error ( ) ;
return { } ;
}
// 2. If c is U+003A (:), then:
if ( c ( ) = = ' : ' ) {
// 1. If compress is non-null, IPv6-multiple-compression validation error, return failure.
if ( compress . has_value ( ) ) {
report_validation_error ( ) ;
return { } ;
}
// 2. Increase pointer and pieceIndex by 1, set compress to pieceIndex, and then continue.
+ + pointer ;
+ + piece_index ;
compress = piece_index ;
continue ;
}
// 3. Let value and length be 0.
u32 value = 0 ;
size_t length = 0 ;
// 4. While length is less than 4 and c is an ASCII hex digit,
// set value to value × 0x10 + c interpreted as hexadecimal number,
// and increase pointer and length by 1.
while ( length < 4 & & is_ascii_hex_digit ( c ( ) ) ) {
value = value * 0x10 + parse_ascii_hex_digit ( c ( ) ) ;
+ + pointer ;
+ + length ;
}
// 5. If c is U+002E (.), then:
if ( c ( ) = = ' . ' ) {
// 1. If length is 0, IPv4-in-IPv6-invalid-code-point validation error, return failure.
if ( length = = 0 ) {
report_validation_error ( ) ;
return { } ;
}
// 2. Decrease pointer by length.
pointer - = length ;
// 3. If pieceIndex is greater than 6, IPv4-in-IPv6-too-many-pieces validation error, return failure.
if ( piece_index > 6 ) {
report_validation_error ( ) ;
return { } ;
}
// 4. Let numbersSeen be 0.
size_t numbers_seen = 0 ;
// 5. While c is not the EOF code point:
while ( c ( ) ! = end_of_file ) {
// 1. Let ipv4Piece be null.
Optional < u32 > ipv4_piece ;
// 2. If numbersSeen is greater than 0, then:
if ( numbers_seen > 0 ) {
// 1. If c is a U+002E (.) and numbersSeen is less than 4, then increase pointer by 1.
if ( c ( ) = = ' . ' & & numbers_seen < 4 ) {
+ + pointer ;
}
// 2. Otherwise, IPv4-in-IPv6-invalid-code-point validation error, return failure.
else {
report_validation_error ( ) ;
return { } ;
}
}
// 3. If c is not an ASCII digit, IPv4-in-IPv6-invalid-code-point validation error, return failure.
if ( ! is_ascii_digit ( c ( ) ) ) {
report_validation_error ( ) ;
return { } ;
}
// 4. While c is an ASCII digit:
while ( is_ascii_digit ( c ( ) ) ) {
// 1. Let number be c interpreted as decimal number.
u32 number = parse_ascii_digit ( c ( ) ) ;
// 2. If ipv4Piece is null, then set ipv4Piece to number.
if ( ! ipv4_piece . has_value ( ) ) {
ipv4_piece = number ;
}
// Otherwise, if ipv4Piece is 0, IPv4-in-IPv6-invalid-code-point validation error, return failure.
else if ( ipv4_piece . value ( ) = = 0 ) {
report_validation_error ( ) ;
return { } ;
}
// Otherwise, set ipv4Piece to ipv4Piece × 10 + number.
else {
ipv4_piece = ipv4_piece . value ( ) * 10 + number ;
}
// 3. If ipv4Piece is greater than 255, IPv4-in-IPv6-out-of-range-part validation error, return failure.
if ( ipv4_piece . value ( ) > 255 ) {
report_validation_error ( ) ;
return { } ;
}
// 4. Increase pointer by 1.
+ + pointer ;
}
// 5. Set address[pieceIndex] to address[pieceIndex] × 0x100 + ipv4Piece.
address [ piece_index ] = address [ piece_index ] * 0x100 + ipv4_piece . value ( ) ;
// 6. Increase numbersSeen by 1.
+ + numbers_seen ;
// 7. If numbersSeen is 2 or 4, then increase pieceIndex by 1.
if ( numbers_seen = = 2 | | numbers_seen = = 4 )
+ + piece_index ;
}
// 6. If numbersSeen is not 4, IPv4-in-IPv6-too-few-parts validation error, return failure.
if ( numbers_seen ! = 4 ) {
report_validation_error ( ) ;
return { } ;
}
// 7. Break.
break ;
}
// 6. Otherwise, if c is U+003A (:):
else if ( c ( ) = = ' : ' ) {
// 1. Increase pointer by 1.
+ + pointer ;
// 2. If c is the EOF code point, IPv6-invalid-code-point validation error, return failure.
if ( c ( ) = = end_of_file ) {
report_validation_error ( ) ;
return { } ;
}
}
// 7. Otherwise, if c is not the EOF code point, IPv6-invalid-code-point validation error, return failure.
else if ( c ( ) ! = end_of_file ) {
report_validation_error ( ) ;
return { } ;
}
// 8. Set address[pieceIndex] to value.
address [ piece_index ] = value ;
// 9. Increase pieceIndex by 1.
+ + piece_index ;
}
// 7. If compress is non-null, then:
if ( compress . has_value ( ) ) {
// 1. Let swaps be pieceIndex − compress.
size_t swaps = piece_index - compress . value ( ) ;
// 2. Set pieceIndex to 7.
piece_index = 7 ;
// 3. While pieceIndex is not 0 and swaps is greater than 0,
// swap address[pieceIndex] with address[compress + swaps − 1],
// and then decrease both pieceIndex and swaps by 1.
while ( piece_index ! = 0 & & swaps > 0 ) {
swap ( address [ piece_index ] , address [ compress . value ( ) + swaps - 1 ] ) ;
- - piece_index ;
- - swaps ;
}
}
// 8. Otherwise, if compress is null and pieceIndex is not 8, IPv6-too-few-pieces validation error, return failure.
else if ( ! compress . has_value ( ) & & piece_index ! = 8 ) {
report_validation_error ( ) ;
return { } ;
}
// 9. Return address.
return address ;
}
2023-07-25 19:43:00 +12:00
// https://url.spec.whatwg.org/#ends-in-a-number-checker
// Returns true if the last dot-separated label of input (ignoring one trailing dot) looks numeric:
// either all ASCII digits, or "0x"/"0X" followed by zero or more ASCII hex digits.
static bool ends_in_a_number_checker(StringView input)
{
    // 1. Let parts be the result of strictly splitting input on U+002E (.).
    auto labels = input.split_view("."sv, SplitBehavior::KeepEmpty);

    // 2. If the last item in parts is the empty string, then:
    if (labels.last().is_empty()) {
        // 1. If parts's size is 1, then return false.
        if (labels.size() == 1)
            return false;

        // 2. Remove the last item from parts.
        labels.take_last();
    }

    // 3. Let last be the last item in parts.
    auto const final_label = labels.last();

    // 4. If last is non-empty and contains only ASCII digits, then return true.
    bool const is_all_digits = !final_label.is_empty() && all_of(final_label, is_ascii_digit);
    if (is_all_digits)
        return true;

    // 5. If parsing last as an IPv4 number does not return failure, then return true.
    // NOTE: This is equivalent to checking that last is "0X" or "0x", followed by zero or more ASCII hex digits.
    // 6. Return false.
    return final_label.starts_with("0x"sv, CaseSensitivity::CaseInsensitive)
        && all_of(final_label.substring_view(2), is_ascii_hex_digit);
}
2024-03-25 10:57:36 -04:00
// https://url.spec.whatwg.org/#concept-domain-to-ascii
// Converts domain to its ASCII form. Returns an error if the conversion fails or yields the empty string.
static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
{
    // 1. Let result be the result of running Unicode ToASCII with domain_name set to domain,
    //    CheckHyphens set to beStrict,
    //    CheckBidi set to true,
    //    CheckJoiners set to true,
    //    UseSTD3ASCIIRules set to beStrict,
    //    Transitional_Processing set to false,
    //    VerifyDnsLength set to beStrict. [UTS46].
    // 2. If result is a failure value, domain-to-ASCII validation error, return failure.

    // OPTIMIZATION: If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.)
    //               does not produce any item that starts with an ASCII case-insensitive match for "xn--", this
    //               step is equivalent to ASCII lowercasing domain.
    if (!be_strict && all_of(domain, is_ascii)) {
        // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
        if (domain.is_empty())
            return Error::from_string_literal("Empty domain");

        bool const has_punycode_label = [&] {
            for (auto label : domain.split_view('.')) {
                if (label.starts_with("xn--"sv, CaseSensitivity::CaseInsensitive))
                    return true;
            }
            return false;
        }();

        if (!has_punycode_label) {
            auto lowercase_domain = domain.to_lowercase_string();
            return String::from_utf8_without_validation(lowercase_domain.bytes());
        }
    }

    auto const check_hyphens = be_strict ? Unicode::IDNA::CheckHyphens::Yes : Unicode::IDNA::CheckHyphens::No;
    auto const use_std3_ascii_rules = be_strict ? Unicode::IDNA::UseStd3AsciiRules::Yes : Unicode::IDNA::UseStd3AsciiRules::No;
    auto const verify_dns_length = be_strict ? Unicode::IDNA::VerifyDnsLength::Yes : Unicode::IDNA::VerifyDnsLength::No;

    Unicode::IDNA::ToAsciiOptions const options {
        check_hyphens,
        Unicode::IDNA::CheckBidi::Yes,
        Unicode::IDNA::CheckJoiners::Yes,
        use_std3_ascii_rules,
        Unicode::IDNA::TransitionalProcessing::No,
        verify_dns_length
    };
    auto result = TRY(Unicode::IDNA::to_ascii(Utf8View(domain), options));

    // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
    if (result.is_empty())
        return Error::from_string_literal("Empty domain");

    // 4. Return result.
    return result;
}
2021-05-25 22:13:15 +02:00
// https://url.spec.whatwg.org/#concept-host-parser
2024-03-18 16:22:27 +13:00
static Optional < Host > parse_host ( StringView input , bool is_opaque = false )
2021-05-25 22:13:15 +02:00
{
2023-07-17 06:52:29 +02:00
// 1. If input starts with U+005B ([), then:
2021-05-25 22:13:15 +02:00
if ( input . starts_with ( ' [ ' ) ) {
2023-07-17 06:52:29 +02:00
// 1. If input does not end with U+005D (]), IPv6-unclosed validation error, return failure.
2021-05-25 22:13:15 +02:00
if ( ! input . ends_with ( ' ] ' ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-17 06:52:29 +02:00
// 2. Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
auto address = parse_ipv6_address ( input . substring_view ( 1 , input . length ( ) - 2 ) ) ;
if ( ! address . has_value ( ) )
return { } ;
2024-11-27 15:12:17 +00:00
return Host { address . release_value ( ) } ;
2021-05-25 22:13:15 +02:00
}
2023-09-30 23:07:03 -07:00
// 2. If isOpaque is true, then return the result of opaque-host parsing input.
if ( is_opaque )
2021-05-25 22:13:15 +02:00
return parse_opaque_host ( input ) ;
2023-07-23 21:09:29 +12:00
// 3. Assert: input is not the empty string.
2021-05-25 22:13:15 +02:00
VERIFY ( ! input . is_empty ( ) ) ;
2023-07-23 21:09:29 +12:00
// FIXME: 4. Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
2024-03-18 16:22:27 +13:00
auto domain = percent_decode ( input ) ;
2023-07-23 21:09:29 +12:00
2024-03-25 10:57:36 -04:00
// 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
auto ascii_domain_or_error = domain_to_ascii ( domain , false ) ;
// 6. If asciiDomain is failure, then return failure.
2023-07-27 21:40:41 +12:00
if ( ascii_domain_or_error . is_error ( ) )
return { } ;
auto ascii_domain = ascii_domain_or_error . release_value ( ) ;
2021-05-25 22:13:15 +02:00
2023-07-23 21:09:29 +12:00
// 7. If asciiDomain contains a forbidden domain code point, domain-invalid-code-point validation error, return failure.
2024-08-05 01:17:14 +12:00
for ( auto character : ascii_domain . bytes_as_string_view ( ) ) {
if ( is_forbidden_domain_code_point ( character ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
}
2023-07-23 21:09:29 +12:00
// 8. If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain.
2023-07-25 19:43:00 +12:00
if ( ends_in_a_number_checker ( ascii_domain ) ) {
2023-07-23 21:09:29 +12:00
auto ipv4_host = parse_ipv4_address ( ascii_domain ) ;
if ( ! ipv4_host . has_value ( ) )
return { } ;
2024-11-27 15:12:17 +00:00
return Host { ipv4_host . release_value ( ) } ;
2023-07-23 21:09:29 +12:00
}
// 9. Return asciiDomain.
return ascii_domain ;
2021-05-25 22:13:15 +02:00
}
2022-09-20 15:38:53 +02:00
// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
2021-11-11 00:55:02 +01:00
constexpr bool starts_with_windows_drive_letter ( StringView input )
2021-05-25 22:13:15 +02:00
{
if ( input . length ( ) < 2 )
return false ;
2022-09-20 15:38:53 +02:00
if ( ! is_ascii_alpha ( input [ 0 ] ) | | ! ( input [ 1 ] = = ' : ' | | input [ 1 ] = = ' | ' ) )
2021-05-25 22:13:15 +02:00
return false ;
if ( input . length ( ) = = 2 )
return true ;
return " / \\ ?# " sv . contains ( input [ 2 ] ) ;
}
2021-11-11 00:55:02 +01:00
// True if input is exactly two characters: an ASCII letter followed by ':' or '|'.
constexpr bool is_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    if (!is_ascii_alpha(input[0]))
        return false;
    return input[1] == ':' || input[1] == '|';
}
2021-11-11 00:55:02 +01:00
// True if input is exactly two characters: an ASCII letter followed by ':'.
constexpr bool is_normalized_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    if (!is_ascii_alpha(input[0]))
        return false;
    return input[1] == ':';
}
2021-11-11 00:55:02 +01:00
// True if input is "." or its percent-encoded form "%2e" (ASCII case-insensitive).
constexpr bool is_single_dot_path_segment(StringView input)
{
    if (input == "."sv)
        return true;
    return input.equals_ignoring_ascii_case("%2e"sv);
}
2021-11-11 00:55:02 +01:00
// True if input is ".." or one of ".%2e", "%2e.", "%2e%2e" (ASCII case-insensitive).
constexpr bool is_double_dot_path_segment(StringView input)
{
    if (input == ".."sv)
        return true;
    if (input.equals_ignoring_ascii_case(".%2e"sv))
        return true;
    if (input.equals_ignoring_ascii_case("%2e."sv))
        return true;
    return input.equals_ignoring_ascii_case("%2e%2e"sv);
}
2023-09-17 13:15:52 +12:00
// https://url.spec.whatwg.org/#shorten-a-urls-path
2024-03-18 16:22:27 +13:00
void Parser : : shorten_urls_path ( URL & url )
2023-09-17 13:15:52 +12:00
{
// 1. Assert: url does not have an opaque path.
VERIFY ( ! url . cannot_be_a_base_url ( ) ) ;
// 2. Let path be url’ s path.
2024-08-02 15:23:49 +02:00
auto & path = url . m_data - > paths ;
2023-09-17 13:15:52 +12:00
// 3. If url’ s scheme is "file", path’ s size is 1, and path[0] is a normalized Windows drive letter, then return.
if ( url . scheme ( ) = = " file " & & path . size ( ) = = 1 & & is_normalized_windows_drive_letter ( path [ 0 ] ) )
return ;
// 4. Remove path’ s last item, if any.
if ( ! path . is_empty ( ) )
path . take_last ( ) ;
}
2022-04-10 00:48:15 +02:00
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
2024-08-11 00:05:22 +12:00
String Parser : : percent_encode_after_encoding ( TextCodec : : Encoder & encoder , StringView input , PercentEncodeSet percent_encode_set , bool space_as_plus )
2022-04-10 00:48:15 +02:00
{
2024-08-05 16:03:53 +01:00
// 1. Let encodeOutput be an empty I/O queue.
2022-04-10 00:48:15 +02:00
StringBuilder output ;
2024-08-06 11:06:05 +01:00
// 2. Set potentialError to the result of running encode or fail with inputQueue, encoder, and encodeOutput.
2024-08-11 00:05:22 +12:00
MUST ( encoder . process (
2024-08-06 11:06:05 +01:00
Utf8View ( input ) ,
// 3. For each byte of encodeOutput converted to a byte sequence:
[ & ] ( u8 byte ) - > ErrorOr < void > {
// 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
if ( space_as_plus & & byte = = ' ' ) {
output . append ( ' + ' ) ;
return { } ;
}
2022-04-10 00:48:15 +02:00
2024-08-06 11:06:05 +01:00
// 2. Let isomorph be a code point whose value is byte’ s value.
u32 isomorph = byte ;
2022-04-10 00:48:15 +02:00
2024-08-06 11:06:05 +01:00
// 3. Assert: percentEncodeSet includes all non-ASCII code points.
2022-04-10 00:48:15 +02:00
2024-08-06 11:06:05 +01:00
// 4. If isomorphic is not in percentEncodeSet, then append isomorph to output.
if ( ! code_point_is_in_percent_encode_set ( isomorph , percent_encode_set ) ) {
output . append_code_point ( isomorph ) ;
}
2022-04-10 00:48:15 +02:00
2024-08-06 11:06:05 +01:00
// 5. Otherwise, percent-encode byte and append the result to output.
else {
output . appendff ( " %{:02X} " , byte ) ;
}
2024-08-05 16:03:53 +01:00
2024-08-06 11:06:05 +01:00
return { } ;
} ,
// 4. If potentialError is non-null, then append "%26%23", followed by the shortest sequence of ASCII digits
// representing potentialError in base ten, followed by "%3B", to output.
[ & ] ( u32 error ) - > ErrorOr < void > {
output . appendff ( " %26%23{}%3B " , error ) ;
return { } ;
} ) ) ;
2022-04-10 00:48:15 +02:00
// 6. Return output.
2024-08-11 00:05:22 +12:00
return MUST ( output . to_string ( ) ) ;
2022-04-10 00:48:15 +02:00
}
2021-05-25 22:13:15 +02:00
// https://url.spec.whatwg.org/#concept-basic-url-parser
2024-11-25 13:23:31 +01:00
URL Parser : : basic_parse ( StringView raw_input , Optional < URL const & > base_url , URL * url , Optional < State > state_override , Optional < StringView > encoding )
2021-05-25 22:13:15 +02:00
{
2024-08-06 22:29:45 +12:00
dbgln_if ( URL_PARSER_DEBUG , " URL::Parser::basic_parse: Parsing '{}' " , raw_input ) ;
2021-05-25 22:13:15 +02:00
size_t start_index = 0 ;
size_t end_index = raw_input . length ( ) ;
2023-07-03 22:52:08 +12:00
// 1. If url is not given:
2024-08-13 19:18:50 +12:00
auto url_buffer = URL ( ) ;
if ( ! url ) {
2023-07-03 22:52:08 +12:00
// 1. Set url to a new URL.
2024-08-13 19:18:50 +12:00
url = & url_buffer ;
2021-09-13 22:34:14 +03:00
2023-07-03 22:52:08 +12:00
// 2. If input contains any leading or trailing C0 control or space, invalid-URL-unit validation error.
// 3. Remove any leading and trailing C0 control or space from input.
2021-09-13 22:34:14 +03:00
bool has_validation_error = false ;
2024-08-06 02:00:52 +12:00
for ( ; start_index < raw_input . length ( ) ; + + start_index ) {
if ( ! is_ascii_c0_control_or_space ( raw_input [ start_index ] ) )
2021-09-13 22:34:14 +03:00
break ;
2024-08-06 02:00:52 +12:00
has_validation_error = true ;
2021-05-25 22:13:15 +02:00
}
2024-08-06 02:00:52 +12:00
for ( ; end_index > start_index ; - - end_index ) {
if ( ! is_ascii_c0_control_or_space ( raw_input [ end_index - 1 ] ) )
2021-09-13 22:34:14 +03:00
break ;
2024-08-06 02:00:52 +12:00
has_validation_error = true ;
2021-05-25 22:13:15 +02:00
}
2024-08-06 02:00:52 +12:00
2021-09-13 22:34:14 +03:00
if ( has_validation_error )
report_validation_error ( ) ;
2021-05-25 22:13:15 +02:00
}
2023-12-16 17:49:34 +03:30
ByteString processed_input = raw_input . substring_view ( start_index , end_index - start_index ) ;
2021-05-25 22:13:15 +02:00
2023-07-03 22:52:08 +12:00
// 2. If input contains any ASCII tab or newline, invalid-URL-unit validation error.
// 3. Remove all ASCII tab or newline from input.
2023-12-29 18:33:10 +01:00
for ( auto const ch : processed_input ) {
2024-08-05 19:44:12 +12:00
if ( ch = = ' \t ' | | ch = = ' \n ' | | ch = = ' \r ' ) {
2023-12-29 18:33:10 +01:00
report_validation_error ( ) ;
2024-08-05 19:44:12 +12:00
processed_input = processed_input . replace ( " \t " sv , " " sv , ReplaceMode : : All ) . replace ( " \n " sv , " " sv , ReplaceMode : : All ) . replace ( " \r " sv , " " sv , ReplaceMode : : All ) ;
2023-12-29 18:33:10 +01:00
break ;
}
2021-05-25 22:13:15 +02:00
}
2023-07-03 22:52:08 +12:00
// 4. Let state be state override if given, or scheme start state otherwise.
2021-09-13 22:34:14 +03:00
State state = state_override . value_or ( State : : SchemeStart ) ;
2023-07-03 22:52:08 +12:00
2024-08-05 16:03:53 +01:00
// 5. Set encoding to the result of getting an output encoding from encoding.
2024-08-05 21:47:45 +01:00
Optional < TextCodec : : Encoder & > encoder = { } ;
if ( encoding . has_value ( ) )
encoder = TextCodec : : encoder_for ( TextCodec : : get_output_encoding ( * encoding ) ) ;
if ( ! encoder . has_value ( ) )
encoder = TextCodec : : encoder_for ( " utf-8 " sv ) ;
2024-08-05 16:03:53 +01:00
VERIFY ( encoder . has_value ( ) ) ;
2023-07-03 22:52:08 +12:00
// 6. Let buffer be the empty string.
2021-05-25 22:13:15 +02:00
StringBuilder buffer ;
2023-07-03 22:52:08 +12:00
// 7. Let atSignSeen, insideBrackets, and passwordTokenSeen be false.
2021-05-25 22:13:15 +02:00
bool at_sign_seen = false ;
bool inside_brackets = false ;
bool password_token_seen = false ;
Utf8View input ( processed_input ) ;
2023-07-03 22:52:08 +12:00
// 8. Let pointer be a pointer for input.
2021-06-01 09:45:52 +02:00
Utf8CodePointIterator iterator = input . begin ( ) ;
2021-05-25 22:13:15 +02:00
auto get_remaining = [ & input , & iterator ] {
2021-05-30 18:52:24 +02:00
return input . substring_view ( iterator - input . begin ( ) + iterator . underlying_code_point_length_in_bytes ( ) ) . as_string ( ) ;
2021-05-25 22:13:15 +02:00
} ;
2023-08-13 11:17:02 +12:00
auto remaining_starts_with_two_ascii_hex_digits = [ & ] ( ) {
return is_ascii_hex_digit ( iterator . peek ( 1 ) . value_or ( end_of_file ) ) & & is_ascii_hex_digit ( iterator . peek ( 2 ) . value_or ( end_of_file ) ) ;
} ;
2023-07-03 22:52:08 +12:00
// 9. Keep running the following state machine by switching on state. If after a run pointer points to the EOF code point, go to the next step. Otherwise, increase pointer by 1 and continue with the state machine.
2021-05-25 22:13:15 +02:00
// NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
// ++iterator : "increase pointer by 1"
// continue : "decrease pointer by 1"
for ( ; ; ) {
2021-06-03 12:43:08 +02:00
u32 code_point = end_of_file ;
2021-05-25 22:13:15 +02:00
if ( ! iterator . done ( ) )
code_point = * iterator ;
if constexpr ( URL_PARSER_DEBUG ) {
2021-06-03 12:43:08 +02:00
if ( code_point = = end_of_file )
2024-03-18 16:22:27 +13:00
dbgln ( " URL::Parser::basic_parse: {} state with EOF. " , state_name ( state ) ) ;
2021-06-03 12:40:04 +02:00
else if ( is_ascii_printable ( code_point ) )
2024-03-18 16:22:27 +13:00
dbgln ( " URL::Parser::basic_parse: {} state with code point U+{:04X} ({:c}). " , state_name ( state ) , code_point , code_point ) ;
2021-05-25 22:13:15 +02:00
else
2024-03-18 16:22:27 +13:00
dbgln ( " URL::Parser::basic_parse: {} state with code point U+{:04X}. " , state_name ( state ) , code_point ) ;
2021-05-25 22:13:15 +02:00
}
switch ( state ) {
2023-07-03 22:52:08 +12:00
// -> scheme start state, https://url.spec.whatwg.org/#scheme-start-state
2021-05-25 22:13:15 +02:00
case State : : SchemeStart :
2023-07-03 22:52:08 +12:00
// 1. If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state.
2021-05-25 22:13:15 +02:00
if ( is_ascii_alpha ( code_point ) ) {
buffer . append_as_lowercase ( code_point ) ;
state = State : : Scheme ;
2023-07-03 22:52:08 +12:00
}
2023-07-04 20:34:00 +12:00
// 2. Otherwise, if state override is not given, set state to no scheme state and decrease pointer by 1.
else if ( ! state_override . has_value ( ) ) {
2021-05-25 22:13:15 +02:00
state = State : : NoScheme ;
continue ;
}
2023-07-04 20:34:00 +12:00
// 3. Otherwise, return failure.
else {
return { } ;
}
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> scheme state, https://url.spec.whatwg.org/#scheme-state
2021-05-25 22:13:15 +02:00
case State : : Scheme :
2023-07-03 22:52:08 +12:00
// 1. If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer.
2021-05-25 22:13:15 +02:00
if ( is_ascii_alphanumeric ( code_point ) | | code_point = = ' + ' | | code_point = = ' - ' | | code_point = = ' . ' ) {
buffer . append_as_lowercase ( code_point ) ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if c is U+003A (:), then:
else if ( code_point = = ' : ' ) {
2023-07-14 12:58:16 +12:00
// 1. If state override is given, then:
if ( state_override . has_value ( ) ) {
// 1. If url’ s scheme is a special scheme and buffer is not a special scheme, then return.
2024-03-18 16:22:27 +13:00
if ( is_special_scheme ( url - > scheme ( ) ) & & ! is_special_scheme ( buffer . string_view ( ) ) )
2023-07-14 12:58:16 +12:00
return * url ;
// 2. If url’ s scheme is not a special scheme and buffer is a special scheme, then return.
2024-03-18 16:22:27 +13:00
if ( ! is_special_scheme ( url - > scheme ( ) ) & & is_special_scheme ( buffer . string_view ( ) ) )
2023-07-14 12:58:16 +12:00
return * url ;
// 3. If url includes credentials or has a non-null port, and buffer is "file", then return.
if ( ( url - > includes_credentials ( ) | | url - > port ( ) . has_value ( ) ) & & buffer . string_view ( ) = = " file " sv )
return * url ;
// 4. If url’ s scheme is "file" and its host is an empty host, then return.
2024-11-27 15:12:17 +00:00
if ( url - > scheme ( ) = = " file " sv & & url - > host ( ) . has_value ( ) & & url - > host ( ) - > is_empty_host ( ) )
2023-07-14 12:58:16 +12:00
return * url ;
2023-07-03 22:52:08 +12:00
}
// 2. Set url’ s scheme to buffer.
2024-08-02 15:23:49 +02:00
url - > m_data - > scheme = buffer . to_string_without_validation ( ) ;
2023-07-03 22:52:08 +12:00
2023-07-14 12:58:16 +12:00
// 3. If state override is given, then:
if ( state_override . has_value ( ) ) {
// 1. If url’ s port is url’ s scheme’ s default port, then set url’ s port to null.
2024-03-18 16:22:27 +13:00
if ( url - > port ( ) = = default_port_for_scheme ( url - > scheme ( ) ) )
2024-08-02 15:23:49 +02:00
url - > m_data - > port = { } ;
2023-07-14 12:58:16 +12:00
// 2. Return.
return * url ;
2023-07-03 22:52:08 +12:00
}
// 4. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
// 5. If url’ s scheme is "file", then:
2021-09-13 22:34:14 +03:00
if ( url - > scheme ( ) = = " file " ) {
2023-07-03 22:52:08 +12:00
// 1. If remaining does not start with "//", special-scheme-missing-following-solidus validation error.
2022-07-11 17:32:29 +00:00
if ( ! get_remaining ( ) . starts_with ( " // " sv ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
}
2023-07-03 22:52:08 +12:00
// 2. Set state to file state.
2021-05-25 22:13:15 +02:00
state = State : : File ;
2023-07-03 22:52:08 +12:00
}
// 6. Otherwise, if url is special, base is non-null, and base’ s scheme is url’ s scheme:
2024-08-02 15:23:49 +02:00
else if ( url - > is_special ( ) & & base_url . has_value ( ) & & base_url - > scheme ( ) = = url - > m_data - > scheme ) {
2023-08-13 11:08:12 +12:00
// 1. Assert: base is special (and therefore does not have an opaque path).
VERIFY ( base_url - > is_special ( ) ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to special relative or authority state.
2023-08-13 11:08:12 +12:00
state = State : : SpecialRelativeOrAuthority ;
}
// 7. Otherwise, if url is special, set state to special authority slashes state.
else if ( url - > is_special ( ) ) {
state = State : : SpecialAuthoritySlashes ;
2023-07-03 22:52:08 +12:00
}
// 8. Otherwise, if remaining starts with an U+002F (/), set state to path or authority state and increase pointer by 1.
else if ( get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 22:13:15 +02:00
state = State : : PathOrAuthority ;
+ + iterator ;
2023-07-03 22:52:08 +12:00
}
// 9. Otherwise, set url’ s path to the empty string and set state to opaque path state.
else {
2024-08-02 15:23:49 +02:00
url - > m_data - > cannot_be_a_base_url = true ;
2023-04-09 14:21:00 +01:00
url - > append_slash ( ) ;
2021-05-25 22:13:15 +02:00
state = State : : CannotBeABaseUrlPath ;
}
2023-07-03 22:52:08 +12:00
}
2023-07-04 20:34:00 +12:00
// 3. Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, and start over (from the first code point in input).
else if ( ! state_override . has_value ( ) ) {
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : NoScheme ;
iterator = input . begin ( ) ;
continue ;
}
2023-07-04 20:34:00 +12:00
// 4. Otherwise, return failure.
else {
return { } ;
}
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> no scheme state, https://url.spec.whatwg.org/#no-scheme-state
2021-05-25 22:13:15 +02:00
case State : : NoScheme :
2023-07-03 22:52:08 +12:00
// 1. If base is null, or base has an opaque path and c is not U+0023 (#), missing-scheme-non-relative-URL validation error, return failure.
2024-08-02 15:23:49 +02:00
if ( ! base_url . has_value ( ) | | ( base_url - > m_data - > cannot_be_a_base_url & & code_point ! = ' # ' ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if base has an opaque path and c is U+0023 (#), set url’s scheme to base’s scheme, url’s path to base’s path, url’s query
// to base’s query, url’s fragment to the empty string, and set state to fragment state.
2024-08-02 15:23:49 +02:00
else if ( base_url - > m_data - > cannot_be_a_base_url & & code_point = = ' # ' ) {
url - > m_data - > scheme = base_url - > m_data - > scheme ;
url - > m_data - > paths = base_url - > m_data - > paths ;
url - > m_data - > query = base_url - > m_data - > query ;
url - > m_data - > fragment = String { } ;
url - > m_data - > cannot_be_a_base_url = true ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if base’ s scheme is not "file", set state to relative state and decrease pointer by 1.
2024-08-02 15:23:49 +02:00
else if ( base_url - > m_data - > scheme ! = " file " ) {
2021-05-25 22:13:15 +02:00
state = State : : Relative ;
continue ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, set state to file state and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
state = State : : File ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> special relative or authority state, https://url.spec.whatwg.org/#special-relative-or-authority-state
2021-05-25 22:13:15 +02:00
case State : : SpecialRelativeOrAuthority :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
2022-07-11 17:32:29 +00:00
if ( code_point = = ' / ' & & get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 22:13:15 +02:00
state = State : : SpecialAuthorityIgnoreSlashes ;
+ + iterator ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to relative state and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : Relative ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> path or authority state, https://url.spec.whatwg.org/#path-or-authority-state
2021-05-25 22:13:15 +02:00
case State : : PathOrAuthority :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/), then set state to authority state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' ) {
state = State : : Authority ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, set state to path state, and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> relative state, https://url.spec.whatwg.org/#relative-state
2021-05-25 22:13:15 +02:00
case State : : Relative :
2023-07-04 21:11:42 +12:00
// 1. Assert: base’ s scheme is not "file".
VERIFY ( base_url - > scheme ( ) ! = " file " ) ;
2023-07-03 22:52:08 +12:00
// 2. Set url’ s scheme to base’ s scheme.
2024-08-02 15:23:49 +02:00
url - > m_data - > scheme = base_url - > m_data - > scheme ;
2023-07-03 22:52:08 +12:00
// 3. If c is U+002F (/), then set state to relative slash state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' ) {
state = State : : RelativeSlash ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if url is special and c is U+005C (\), invalid-reverse-solidus validation error, set state to relative slash state.
else if ( url - > is_special ( ) & & code_point = = ' \\ ' ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : RelativeSlash ;
2023-07-03 22:52:08 +12:00
}
// 5. Otherwise:
else {
// 1. Set url’ s username to base’ s username, url’ s password to base’ s password, url’ s host to base’ s host, url’ s port to base’ s port, url’ s path to a clone of base’ s path, and url’ s query to base’ s query.
2024-08-02 15:23:49 +02:00
url - > m_data - > username = base_url - > m_data - > username ;
url - > m_data - > password = base_url - > m_data - > password ;
url - > m_data - > host = base_url - > m_data - > host ;
url - > m_data - > port = base_url - > m_data - > port ;
url - > m_data - > paths = base_url - > m_data - > paths ;
url - > m_data - > query = base_url - > m_data - > query ;
2021-05-25 22:13:15 +02:00
2023-07-03 22:52:08 +12:00
// 2. If c is U+003F (?), then set url’ s query to the empty string, and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > query = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > fragment = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set url’ s query to null.
2024-08-02 15:23:49 +02:00
url - > m_data - > query = { } ;
2023-07-03 22:52:08 +12:00
// 2. Shorten url’ s path.
2023-09-17 13:15:52 +12:00
shorten_urls_path ( * url ) ;
2023-07-03 22:52:08 +12:00
// 3. Set state to path state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
}
break ;
2023-07-03 22:52:08 +12:00
// -> relative slash state, https://url.spec.whatwg.org/#relative-slash-state
2021-05-25 22:13:15 +02:00
case State : : RelativeSlash :
2023-07-03 22:52:08 +12:00
// 1. If url is special and c is U+002F (/) or U+005C (\), then:
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & ( code_point = = ' / ' | | code_point = = ' \\ ' ) ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to special authority ignore slashes state.
2021-05-25 22:13:15 +02:00
state = State : : SpecialAuthorityIgnoreSlashes ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if c is U+002F (/), then set state to authority state.
else if ( code_point = = ' / ' ) {
2021-05-25 22:13:15 +02:00
state = State : : Authority ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, set url’ s username to base’ s username, url’ s password to base’ s password, url’ s host to base’ s host, url’ s port to base’ s port, state to path state, and then, decrease pointer by 1.
else {
2024-08-02 15:23:49 +02:00
url - > m_data - > username = base_url - > m_data - > username ;
url - > m_data - > password = base_url - > m_data - > password ;
url - > m_data - > host = base_url - > m_data - > host ;
url - > m_data - > port = base_url - > m_data - > port ;
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> special authority slashes state, https://url.spec.whatwg.org/#special-authority-slashes-state
2021-05-25 22:13:15 +02:00
case State : : SpecialAuthoritySlashes :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
2022-07-11 17:32:29 +00:00
if ( code_point = = ' / ' & & get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 22:13:15 +02:00
state = State : : SpecialAuthorityIgnoreSlashes ;
+ + iterator ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to special authority ignore slashes state and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : SpecialAuthorityIgnoreSlashes ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> special authority ignore slashes state, https://url.spec.whatwg.org/#special-authority-ignore-slashes-state
2021-05-25 22:13:15 +02:00
case State : : SpecialAuthorityIgnoreSlashes :
2023-07-03 22:52:08 +12:00
// 1. If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
if ( code_point ! = ' / ' & & code_point ! = ' \\ ' ) {
state = State : : Authority ;
continue ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> authority state, https://url.spec.whatwg.org/#authority-state
2021-05-25 22:13:15 +02:00
case State : : Authority :
2023-07-03 22:52:08 +12:00
// 1. If c is U+0040 (@), then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' @ ' ) {
2023-07-03 22:52:08 +12:00
// 1. Invalid-credentials validation error.
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. If atSignSeen is true, then prepend "%40" to buffer.
2021-05-25 22:13:15 +02:00
if ( at_sign_seen ) {
2023-12-16 17:49:34 +03:30
auto content = buffer . to_byte_string ( ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2022-07-11 17:32:29 +00:00
buffer . append ( " %40 " sv ) ;
2021-05-25 22:13:15 +02:00
buffer . append ( content ) ;
}
2023-07-03 22:52:08 +12:00
// 3. Set atSignSeen to true.
2021-05-25 22:13:15 +02:00
at_sign_seen = true ;
2023-07-03 22:52:08 +12:00
2023-10-28 06:26:20 +01:00
StringBuilder username_builder ;
StringBuilder password_builder ;
2023-07-03 22:52:08 +12:00
2023-07-04 22:22:01 +12:00
// 4. For each codePoint in buffer:
for ( auto c : Utf8View ( buffer . string_view ( ) ) ) {
2023-07-03 22:52:08 +12:00
// 1. If codePoint is U+003A (:) and passwordTokenSeen is false, then set passwordTokenSeen to true and continue.
2021-05-25 22:13:15 +02:00
if ( c = = ' : ' & & ! password_token_seen ) {
password_token_seen = true ;
continue ;
}
2023-07-03 22:52:08 +12:00
// 2. Let encodedCodePoints be the result of running UTF-8 percent-encode codePoint using the userinfo percent-encode set.
// NOTE: This is done inside of step 3 and 4 implementation
// 3. If passwordTokenSeen is true, then append encodedCodePoints to url’ s password.
2021-05-25 22:13:15 +02:00
if ( password_token_seen ) {
2023-10-28 06:26:20 +01:00
if ( password_builder . is_empty ( ) )
2024-08-02 15:23:49 +02:00
password_builder . append ( url - > m_data - > password ) ;
2023-10-28 06:26:20 +01:00
2024-03-18 16:22:27 +13:00
append_percent_encoded_if_necessary ( password_builder , c , PercentEncodeSet : : Userinfo ) ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, append encodedCodePoints to url’ s username.
else {
2023-10-28 06:26:20 +01:00
if ( username_builder . is_empty ( ) )
2024-08-02 15:23:49 +02:00
username_builder . append ( url - > m_data - > username ) ;
2023-10-28 06:26:20 +01:00
2024-03-18 16:22:27 +13:00
append_percent_encoded_if_necessary ( username_builder , c , PercentEncodeSet : : Userinfo ) ;
2021-05-25 22:13:15 +02:00
}
}
2023-07-03 22:52:08 +12:00
2024-08-02 15:23:49 +02:00
if ( username_builder . string_view ( ) . length ( ) > url - > m_data - > username . bytes ( ) . size ( ) )
url - > m_data - > username = username_builder . to_string ( ) . release_value_but_fixme_should_propagate_errors ( ) ;
if ( password_builder . string_view ( ) . length ( ) > url - > m_data - > password . bytes ( ) . size ( ) )
url - > m_data - > password = password_builder . to_string ( ) . release_value_but_fixme_should_propagate_errors ( ) ;
2023-10-28 06:26:20 +01:00
2023-07-03 22:52:08 +12:00
// 5. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
// then:
2024-08-10 15:44:41 +12:00
// 1. If atSignSeen is true and buffer is the empty string, host-missing validation error, return failure.
2021-05-25 22:13:15 +02:00
if ( at_sign_seen & & buffer . is_empty ( ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
// 2. Decrease pointer by buffer’ s code point length + 1, set buffer to the empty string, and set state to host state.
2021-05-25 22:13:15 +02:00
iterator = input . iterator_at_byte_offset ( iterator - input . begin ( ) - buffer . length ( ) - 1 ) ;
buffer . clear ( ) ;
state = State : : Host ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, append c to buffer.
else {
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> host state, https://url.spec.whatwg.org/#host-state
// -> hostname state, https://url.spec.whatwg.org/#hostname-state
2021-05-25 22:13:15 +02:00
case State : : Host :
case State : : Hostname :
2023-07-04 20:34:00 +12:00
// 1. If state override is given and url’ s scheme is "file", then decrease pointer by 1 and set state to file host state.
if ( state_override . has_value ( ) & & url - > scheme ( ) = = " file " ) {
state = State : : FileHost ;
continue ;
}
2023-07-03 22:52:08 +12:00
// 2. Otherwise, if c is U+003A (:) and insideBrackets is false, then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' : ' & & ! inside_brackets ) {
2023-07-03 22:52:08 +12:00
// 1. If buffer is the empty string, host-missing validation error, return failure.
2021-05-25 22:13:15 +02:00
if ( buffer . is_empty ( ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
2023-07-14 12:58:16 +12:00
// 2. If state override is given and state override is hostname state, then return.
if ( state_override . has_value ( ) & & * state_override = = State : : Hostname )
return * url ;
2023-07-03 22:52:08 +12:00
// 3. Let host be the result of host parsing buffer with url is not special.
2021-09-13 22:34:14 +03:00
auto host = parse_host ( buffer . string_view ( ) , ! url - > is_special ( ) ) ;
2023-07-03 22:52:08 +12:00
// 4. If host is failure, then return failure.
2021-05-25 22:13:15 +02:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 22:52:08 +12:00
// 5. Set url’ s host to host, buffer to the empty string, and state to port state.
2024-08-02 15:23:49 +02:00
url - > m_data - > host = host . release_value ( ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : Port ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
// then decrease pointer by 1, and then:
// NOTE: pointer decrement is done by the continue below
// 1. If url is special and buffer is the empty string, host-missing validation error, return failure.
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & buffer . is_empty ( ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
2023-07-14 12:58:16 +12:00
// 2. Otherwise, if state override is given, buffer is the empty string, and either url includes credentials or url’ s port is non-null, return.
if ( state_override . has_value ( ) & & buffer . is_empty ( ) & & ( url - > includes_credentials ( ) | | url - > port ( ) . has_value ( ) ) )
return * url ;
2023-07-03 22:52:08 +12:00
// 3. Let host be the result of host parsing buffer with url is not special.
2021-09-13 22:34:14 +03:00
auto host = parse_host ( buffer . string_view ( ) , ! url - > is_special ( ) ) ;
2023-07-03 22:52:08 +12:00
// 4. If host is failure, then return failure.
2021-05-25 22:13:15 +02:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 22:52:08 +12:00
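// 5. Set url’s host to host, buffer to the empty string, and state to path start state.
// NOTE: The code below sets the port state rather than the path start state. Since buffer was just cleared and c is not an
//       ASCII digit here, the port state's second branch sets state to path start state without advancing the pointer.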
2024-08-02 15:23:49 +02:00
url - > m_data - > host = host . value ( ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : Port ;
2023-07-03 22:52:08 +12:00
2023-07-14 12:58:16 +12:00
// 6. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2021-05-25 22:13:15 +02:00
continue ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise:
else {
2023-07-04 21:57:05 +12:00
// 1. If c is U+005B ([), then set insideBrackets to true.
if ( code_point = = ' [ ' ) {
inside_brackets = true ;
}
// 2. If c is U+005D (]), then set insideBrackets to false.
else if ( code_point = = ' ] ' ) {
inside_brackets = false ;
}
// 3. Append c to buffer.
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> port state, https://url.spec.whatwg.org/#port-state
2021-05-25 22:13:15 +02:00
case State : : Port :
2023-07-03 22:52:08 +12:00
// 1. If c is an ASCII digit, append c to buffer.
2021-05-25 22:13:15 +02:00
if ( is_ascii_digit ( code_point ) ) {
buffer . append_code_point ( code_point ) ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
2023-07-04 20:34:00 +12:00
// * state override is given
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' )
| | state_override . has_value ( ) ) {
2023-07-03 22:52:08 +12:00
// then:
// 1. If buffer is not the empty string, then:
2021-05-25 22:13:15 +02:00
if ( ! buffer . is_empty ( ) ) {
2023-07-03 22:52:08 +12:00
// 1. Let port be the mathematical integer value that is represented by buffer in radix-10 using ASCII digits for digits with values 0 through 9.
2023-12-23 15:59:14 +13:00
auto port = buffer . string_view ( ) . to_number < u16 > ( ) ;
2023-07-03 22:52:08 +12:00
// 2. If port is greater than 2^16 − 1, port-out-of-range validation error, return failure.
2023-12-23 15:59:14 +13:00
// NOTE: This is done by to_number.
if ( ! port . has_value ( ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
// 3. Set url’ s port to null, if port is url’ s scheme’ s default port; otherwise to port.
2024-03-18 16:22:27 +13:00
if ( port . value ( ) = = default_port_for_scheme ( url - > scheme ( ) ) )
2024-08-02 15:23:49 +02:00
url - > m_data - > port = { } ;
2021-05-25 22:13:15 +02:00
else
2024-08-02 15:23:49 +02:00
url - > m_data - > port = port . value ( ) ;
2023-07-03 22:52:08 +12:00
// 4. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
}
2023-07-03 22:52:08 +12:00
2023-07-14 12:58:16 +12:00
// 2. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2023-07-03 22:52:08 +12:00
// 3. Set state to path start state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
state = State : : PathStart ;
continue ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, port-invalid validation error, return failure.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> file state, https://url.spec.whatwg.org/#file-state
2021-05-25 22:13:15 +02:00
case State : : File :
2023-07-03 22:52:08 +12:00
// 1. Set url’ s scheme to "file".
2024-08-02 15:23:49 +02:00
url - > m_data - > scheme = " file " _string ;
2023-07-03 22:52:08 +12:00
// 2. Set url’ s host to the empty string.
2024-08-02 15:23:49 +02:00
url - > m_data - > host = String { } ;
2023-07-03 22:52:08 +12:00
// 3. If c is U+002F (/) or U+005C (\), then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' | | code_point = = ' \\ ' ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to file slash state.
2021-05-25 22:13:15 +02:00
state = State : : FileSlash ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if base is non-null and base’ s scheme is "file":
2024-08-02 15:23:49 +02:00
else if ( base_url . has_value ( ) & & base_url - > m_data - > scheme = = " file " ) {
2023-07-03 22:52:08 +12:00
// 1. Set url’ s host to base’ s host, url’ s path to a clone of base’ s path, and url’ s query to base’ s query.
2024-08-02 15:23:49 +02:00
url - > m_data - > host = base_url - > m_data - > host ;
url - > m_data - > paths = base_url - > m_data - > paths ;
url - > m_data - > query = base_url - > m_data - > query ;
2023-07-03 22:52:08 +12:00
// 2. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > query = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > fragment = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set url’ s query to null.
2024-08-02 15:23:49 +02:00
url - > m_data - > query = { } ;
2023-07-03 22:52:08 +12:00
// 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter, then shorten url’ s path.
2021-05-25 22:13:15 +02:00
auto substring_from_pointer = input . substring_view ( iterator - input . begin ( ) ) . as_string ( ) ;
if ( ! starts_with_windows_drive_letter ( substring_from_pointer ) ) {
2023-09-17 13:15:52 +12:00
shorten_urls_path ( * url ) ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise:
else {
// 1. File-invalid-Windows-drive-letter validation error.
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set url’ s path to « ».
2024-08-02 15:23:49 +02:00
url - > m_data - > paths . clear ( ) ;
2021-05-25 22:13:15 +02:00
}
2023-07-03 22:52:08 +12:00
// 4. Set state to path state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
}
2023-07-04 21:06:58 +12:00
// 5. Otherwise, set state to path state, and decrease pointer by 1.
else {
state = State : : Path ;
continue ;
}
2023-07-03 22:52:08 +12:00
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> file slash state, https://url.spec.whatwg.org/#file-slash-state
2021-05-25 22:13:15 +02:00
case State : : FileSlash :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/) or U+005C (\), then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' | | code_point = = ' \\ ' ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to file host state.
2021-05-25 22:13:15 +02:00
state = State : : FileHost ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise:
2023-07-04 21:12:33 +12:00
else {
// 1. If base is non-null and base’ s scheme is "file", then:
2024-08-02 15:23:49 +02:00
if ( base_url . has_value ( ) & & base_url - > m_data - > scheme = = " file " ) {
2023-07-04 21:12:33 +12:00
// 1. Set url’ s host to base’ s host.
2024-08-02 15:23:49 +02:00
url - > m_data - > host = base_url - > m_data - > host ;
2023-09-17 13:47:29 +12:00
2023-07-04 21:12:33 +12:00
// 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter and base’ s path[0] is a normalized Windows drive letter, then append base’ s path[0] to url’ s path.
auto substring_from_pointer = input . substring_view ( iterator - input . begin ( ) ) . as_string ( ) ;
2024-08-02 15:23:49 +02:00
if ( ! starts_with_windows_drive_letter ( substring_from_pointer ) & & is_normalized_windows_drive_letter ( base_url - > m_data - > paths [ 0 ] ) )
url - > m_data - > paths . append ( base_url - > m_data - > paths [ 0 ] ) ;
2023-07-04 21:12:33 +12:00
}
2023-07-03 22:52:08 +12:00
2023-07-04 21:12:33 +12:00
// 2. Set state to path state, and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> file host state, https://url.spec.whatwg.org/#file-host-state
2021-05-25 22:13:15 +02:00
case State : : FileHost :
2023-07-03 22:52:08 +12:00
// 1. If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by 1 and then:
// NOTE: decreasing the pointer is done at the bottom of this block.
2021-06-03 12:43:08 +02:00
if ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' \\ ' | | code_point = = ' ? ' | | code_point = = ' # ' ) {
2023-07-03 22:52:08 +12:00
// 1. If state override is not given and buffer is a Windows drive letter, file-invalid-Windows-drive-letter-host validation error, set state to path state.
2023-07-04 20:34:00 +12:00
if ( ! state_override . has_value ( ) & & is_windows_drive_letter ( buffer . string_view ( ) ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : Path ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if buffer is the empty string, then:
else if ( buffer . is_empty ( ) ) {
// 1. Set url’ s host to the empty string.
2024-08-02 15:23:49 +02:00
url - > m_data - > host = String { } ;
2023-07-03 22:52:08 +12:00
2023-07-14 12:58:16 +12:00
// 2. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2023-07-03 22:52:08 +12:00
// 3. Set state to path start state.
2021-05-25 22:13:15 +02:00
state = State : : PathStart ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, run these steps:
else {
// 1. Let host be the result of host parsing buffer with url is not special.
2024-08-04 13:33:09 +12:00
auto host = parse_host ( buffer . string_view ( ) , ! url - > is_special ( ) ) ;
2023-07-03 22:52:08 +12:00
// 2. If host is failure, then return failure.
2021-05-25 22:13:15 +02:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 22:52:08 +12:00
// 3. If host is "localhost", then set host to the empty string.
2023-07-27 21:40:41 +12:00
if ( host . value ( ) . has < String > ( ) & & host . value ( ) . get < String > ( ) = = " localhost " sv )
host = String { } ;
2023-07-03 22:52:08 +12:00
// 4. Set url’ s host to host.
2024-08-02 15:23:49 +02:00
url - > m_data - > host = host . release_value ( ) ;
2023-07-03 22:52:08 +12:00
2023-07-14 12:58:16 +12:00
// 5. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2023-07-03 22:52:08 +12:00
// 6. Set buffer to the empty string and state to path start state.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : PathStart ;
}
2023-07-03 22:52:08 +12:00
// NOTE: Decrement specified at the top of this 'if' statement.
2021-05-25 22:13:15 +02:00
continue ;
} else {
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> path start state, https://url.spec.whatwg.org/#path-start-state
2021-05-25 22:13:15 +02:00
case State : : PathStart :
2023-07-03 22:52:08 +12:00
// 1. If url is special, then:
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to path state.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
2023-07-03 22:52:08 +12:00
// 3. If c is neither U+002F (/) nor U+005C (\), then decrease pointer by 1.
2021-05-25 22:13:15 +02:00
if ( code_point ! = ' / ' & & code_point ! = ' \\ ' )
continue ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if state override is not given and c is U+003F (?), set url’ s query to the empty string and state to query state.
2023-07-04 20:34:00 +12:00
else if ( ! state_override . has_value ( ) & & code_point = = ' ? ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > query = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if state override is not given and c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
2023-07-04 20:34:00 +12:00
else if ( ! state_override . has_value ( ) & & code_point = = ' # ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > fragment = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set state to path state.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
2023-07-03 22:52:08 +12:00
// 2. If c is not U+002F (/), then decrease pointer by 1.
2021-05-25 22:13:15 +02:00
if ( code_point ! = ' / ' )
continue ;
}
2023-07-04 20:34:00 +12:00
// 5. Otherwise, if state override is given and url’ s host is null, append the empty string to url’ s path.
2024-11-27 12:48:28 +00:00
else if ( state_override . has_value ( ) & & ! url - > host ( ) . has_value ( ) ) {
2023-07-04 20:34:00 +12:00
url - > append_slash ( ) ;
}
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> path state, https://url.spec.whatwg.org/#path-state
2021-05-25 22:13:15 +02:00
case State : : Path :
2023-07-03 22:52:08 +12:00
// 1. If one of the following is true:
// * c is the EOF code point or U+002F (/)
// * url is special and c is U+005C (\)
2023-07-04 20:34:00 +12:00
// * state override is not given and c is U+003F (?) or U+0023 (#)
if ( ( code_point = = end_of_file | | code_point = = ' / ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' )
| | ( ! state_override . has_value ( ) & & ( code_point = = ' ? ' | | code_point = = ' # ' ) ) ) {
2023-07-03 22:52:08 +12:00
// then:
// 1. If url is special and c is U+005C (\), invalid-reverse-solidus validation error.
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & code_point = = ' \\ ' )
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. If buffer is a double-dot URL path segment, then:
2021-06-08 15:22:02 +02:00
if ( is_double_dot_path_segment ( buffer . string_view ( ) ) ) {
2023-07-04 21:21:33 +12:00
// 1. Shorten url’ s path.
2023-09-17 13:15:52 +12:00
shorten_urls_path ( * url ) ;
2023-07-03 22:52:08 +12:00
// 2. If neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’ s path.
2021-09-13 22:34:14 +03:00
if ( code_point ! = ' / ' & & ! ( url - > is_special ( ) & & code_point = = ' \\ ' ) )
2023-04-09 14:21:00 +01:00
url - > append_slash ( ) ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if buffer is a single-dot URL path segment and if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’ s path.
else if ( is_single_dot_path_segment ( buffer . string_view ( ) ) & & code_point ! = ' / ' & & ! ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
2023-04-09 14:21:00 +01:00
url - > append_slash ( ) ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if buffer is not a single-dot URL path segment, then:
else if ( ! is_single_dot_path_segment ( buffer . string_view ( ) ) ) {
// 1. If url’ s scheme is "file", url’ s path is empty, and buffer is a Windows drive letter, then replace the second code point in buffer with U+003A (:).
2024-08-02 15:23:49 +02:00
if ( url - > m_data - > scheme = = " file " & & url - > m_data - > paths . is_empty ( ) & & is_windows_drive_letter ( buffer . string_view ( ) ) ) {
2021-06-08 15:22:02 +02:00
auto drive_letter = buffer . string_view ( ) [ 0 ] ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
buffer . append ( drive_letter ) ;
buffer . append ( ' : ' ) ;
}
2023-07-03 22:52:08 +12:00
// 2. Append buffer to url’ s path.
2024-08-02 15:23:49 +02:00
url - > m_data - > paths . append ( buffer . to_string_without_validation ( ) ) ;
2021-05-25 22:13:15 +02:00
}
2023-07-03 22:52:08 +12:00
// 5. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
// 6. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > query = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 7. If c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > fragment = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
}
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, run these steps
else {
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 22:13:15 +02:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
2023-08-13 11:17:02 +12:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 3. UTF-8 percent-encode c using the path percent-encode set and append the result to buffer.
2024-03-18 16:22:27 +13:00
append_percent_encoded_if_necessary ( buffer , code_point , PercentEncodeSet : : Path ) ;
2021-05-25 22:13:15 +02:00
}
break ;
2023-07-03 22:52:08 +12:00
// -> opaque path state, https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
2021-05-25 22:13:15 +02:00
case State : : CannotBeABaseUrlPath :
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path when leaving this state (on U+003F (?), U+0023 (#), or EOF).
2024-08-02 15:23:49 +02:00
VERIFY ( url - > m_data - > paths . size ( ) = = 1 & & url - > m_data - > paths [ 0 ] . is_empty ( ) ) ;
2023-07-03 22:52:08 +12:00
// 1. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > paths [ 0 ] = buffer . to_string_without_validation ( ) ;
url - > m_data - > query = String { } ;
2023-07-06 19:13:42 +02:00
buffer . clear ( ) ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2021-05-25 22:13:15 +02:00
// NOTE: The buffered path is stored as-is (already percent-encoded); it is not percent-decoded here.
2024-08-02 15:23:49 +02:00
url - > m_data - > paths [ 0 ] = buffer . to_string_without_validation ( ) ;
url - > m_data - > fragment = String { } ;
2023-07-06 19:13:42 +02:00
buffer . clear ( ) ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise:
else {
// 1. If c is not the EOF code point, not a URL code point, and not U+0025 (%), invalid-URL-unit validation error.
2021-06-03 12:43:08 +02:00
if ( code_point ! = end_of_file & & ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
2023-08-13 11:17:02 +12:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 3. If c is not the EOF code point, UTF-8 percent-encode c using the C0 control percent-encode set and append the result to url’ s path.
2021-06-03 12:43:08 +02:00
if ( code_point ! = end_of_file ) {
2024-03-18 16:22:27 +13:00
append_percent_encoded_if_necessary ( buffer , code_point , PercentEncodeSet : : C0Control ) ;
2021-05-25 22:13:15 +02:00
} else {
2024-08-02 15:23:49 +02:00
url - > m_data - > paths [ 0 ] = buffer . to_string_without_validation ( ) ;
2023-07-06 19:13:42 +02:00
buffer . clear ( ) ;
2021-05-25 22:13:15 +02:00
}
}
break ;
2023-07-03 22:52:08 +12:00
// -> query state, https://url.spec.whatwg.org/#query-state
2021-05-25 22:13:15 +02:00
case State : : Query :
2024-10-20 19:42:10 +11:00
// 1. If encoding is not UTF-8 and one of the following is true:
// * url is not special
// * url’ s scheme is "ws" or "wss"
// then set encoding to UTF-8.
if ( ! url - > is_special ( ) | | url - > m_data - > scheme = = " ws " | | url - > m_data - > scheme = = " wss " )
encoder = TextCodec : : encoder_for ( " utf-8 " sv ) ;
2023-07-03 22:52:08 +12:00
// 2. If one of the following is true:
2023-07-04 20:34:00 +12:00
// * state override is not given and c is U+0023 (#)
2023-07-03 22:52:08 +12:00
// * c is the EOF code point
2023-07-04 20:34:00 +12:00
if ( ( ! state_override . has_value ( ) & & code_point = = ' # ' )
| | code_point = = end_of_file ) {
2023-07-03 22:52:08 +12:00
// then:
// 1. Let queryPercentEncodeSet be the special-query percent-encode set if url is special; otherwise the query percent-encode set.
2024-03-18 16:22:27 +13:00
auto query_percent_encode_set = url - > is_special ( ) ? PercentEncodeSet : : SpecialQuery : PercentEncodeSet : : Query ;
2023-07-03 22:52:08 +12:00
// 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url’ s query.
2024-08-11 00:05:22 +12:00
url - > m_data - > query = percent_encode_after_encoding ( * encoder , buffer . string_view ( ) , query_percent_encode_set ) ;
2023-07-03 22:52:08 +12:00
// 3. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
// 4. If c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' # ' ) {
2024-08-02 15:23:49 +02:00
url - > m_data - > fragment = String { } ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
}
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 22:13:15 +02:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
2023-08-13 11:17:02 +12:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 3. Append c to buffer.
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> fragment state, https://url.spec.whatwg.org/#fragment-state
2021-05-25 22:13:15 +02:00
case State : : Fragment :
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
2023-07-03 22:52:08 +12:00
// 1. If c is not the EOF code point, then:
2021-06-03 12:43:08 +02:00
if ( code_point ! = end_of_file ) {
2023-07-03 22:52:08 +12:00
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 22:13:15 +02:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
2023-08-13 11:17:02 +12:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
2023-08-14 16:25:21 +12:00
// 3. UTF-8 percent-encode c using the fragment percent-encode set and append the result to url’ s fragment.
// NOTE: The percent-encode is done on EOF on the entire buffer.
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
} else {
2024-10-20 19:42:10 +11:00
url - > m_data - > fragment = percent_encode_after_encoding ( * TextCodec : : encoder_for ( " utf-8 " sv ) , buffer . string_view ( ) , PercentEncodeSet : : Fragment ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
}
break ;
default :
VERIFY_NOT_REACHED ( ) ;
}
if ( iterator . done ( ) )
break ;
+ + iterator ;
}
2024-08-02 15:23:49 +02:00
url - > m_data - > valid = true ;
2024-08-06 22:29:45 +12:00
dbgln_if ( URL_PARSER_DEBUG , " URL::Parser::basic_parse: Parsed URL to be '{}'. " , url - > serialize ( ) ) ;
2023-07-03 22:52:08 +12:00
// 10. Return url.
2024-08-13 19:18:50 +12:00
return * url ;
2021-05-25 22:13:15 +02:00
}
}