2020-01-18 09:38:21 +01:00
/*
2024-10-04 13:19:50 +02:00
* Copyright ( c ) 2018 - 2020 , Andreas Kling < andreas @ ladybird . org >
2021-05-23 23:31:16 +02:00
* Copyright ( c ) 2021 , Max Wipfli < mail @ maxwipfli . ch >
2024-11-26 16:27:08 +00:00
* Copyright ( c ) 2024 , Sam Atkins < sam @ ladybird . org >
2025-02-22 21:52:44 +13:00
* Copyright ( c ) 2023 - 2025 , Shannon Booth < shannon @ serenityos . org >
2020-01-18 09:38:21 +01:00
*
2021-04-22 01:24:48 -07:00
* SPDX - License - Identifier : BSD - 2 - Clause
2020-01-18 09:38:21 +01:00
*/
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took its own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URL.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 19:11:58 +02:00
# include <AK/Base64.h>
2021-06-01 21:18:08 +02:00
# include <AK/CharacterTypes.h>
2021-05-27 21:05:07 +02:00
# include <AK/Debug.h>
2020-05-26 14:52:44 +03:00
# include <AK/LexicalPath.h>
2019-08-10 17:27:56 +02:00
# include <AK/StringBuilder.h>
2021-05-25 13:50:03 +02:00
# include <AK/Utf8View.h>
2024-03-18 16:22:27 +13:00
# include <LibURL/Parser.h>
2025-06-28 20:50:17 +12:00
# include <LibURL/PublicSuffixData.h>
2024-03-18 16:22:27 +13:00
# include <LibURL/URL.h>
2019-08-10 17:27:56 +02:00
2024-03-18 16:22:27 +13:00
namespace URL {
2019-08-10 17:27:56 +02:00
2025-02-15 22:55:46 +13:00
// Parses relative_url with the basic URL parser, using this URL as the base URL.
// The parser's result is returned unchanged.
Optional<URL> URL::complete_url(StringView relative_url) const
{
    auto const& base_url = *this;
    return Parser::basic_parse(relative_url, base_url);
}
2023-12-16 17:49:34 +03:30
// Returns the percent-decoded path segment stored at the given index.
// The index must be smaller than path_segment_count(); this is enforced with VERIFY.
ByteString URL::path_segment_at_index(size_t index) const
{
    VERIFY(index < path_segment_count());

    auto const& encoded_segment = m_data->paths[index];
    return percent_decode(encoded_segment);
}
2023-12-16 17:49:34 +03:30
// Returns the percent-decoded last path segment, or an empty string when the URL has no path segments.
ByteString URL::basename() const
{
    auto const& segments = m_data->paths;
    if (!segments.is_empty())
        return percent_decode(segments.last());

    return {};
}
2023-08-12 16:52:41 +12:00
void URL : : set_scheme ( String scheme )
2020-04-11 23:07:23 +02:00
{
2024-08-02 15:23:49 +02:00
m_data - > scheme = move ( scheme ) ;
2020-04-11 23:07:23 +02:00
}
2023-08-06 16:32:44 +12:00
// https://url.spec.whatwg.org/#set-the-username
2024-08-10 13:12:19 +12:00
void URL : : set_username ( StringView username )
2021-05-25 21:32:20 +02:00
{
2023-08-06 16:32:44 +12:00
// To set the username given a url and username, set url’ s username to the result of running UTF-8 percent-encode on username using the userinfo percent-encode set.
2024-08-10 13:12:19 +12:00
m_data - > username = percent_encode ( username , PercentEncodeSet : : Userinfo ) ;
2021-05-25 21:32:20 +02:00
}
2023-08-06 16:32:44 +12:00
// https://url.spec.whatwg.org/#set-the-password
2024-08-10 13:12:19 +12:00
void URL : : set_password ( StringView password )
2021-05-25 21:32:20 +02:00
{
2023-08-06 16:32:44 +12:00
// To set the password given a url and password, set url’ s password to the result of running UTF-8 percent-encode on password using the userinfo percent-encode set.
2024-08-10 13:12:19 +12:00
m_data - > password = percent_encode ( password , PercentEncodeSet : : Userinfo ) ;
2021-05-25 21:32:20 +02:00
}
2023-07-27 21:40:41 +12:00
void URL : : set_host ( Host host )
2020-04-11 23:07:23 +02:00
{
2024-08-02 15:23:49 +02:00
m_data - > host = move ( host ) ;
2020-04-11 23:07:23 +02:00
}
2023-07-27 21:40:41 +12:00
// https://url.spec.whatwg.org/#concept-host-serializer
2024-11-28 14:32:07 +00:00
String URL : : serialized_host ( ) const
2023-07-27 21:40:41 +12:00
{
2024-11-27 15:12:17 +00:00
return m_data - > host - > serialize ( ) ;
2023-07-27 21:40:41 +12:00
}
2021-09-13 23:12:16 +03:00
void URL : : set_port ( Optional < u16 > port )
2020-11-04 06:20:20 +00:00
{
2024-08-02 15:23:49 +02:00
if ( port = = default_port_for_scheme ( m_data - > scheme ) ) {
m_data - > port = { } ;
2021-05-25 21:32:20 +02:00
return ;
}
2024-08-02 15:23:49 +02:00
m_data - > port = move ( port ) ;
2020-11-04 06:20:20 +00:00
}
2023-12-16 17:49:34 +03:30
void URL : : set_paths ( Vector < ByteString > const & paths )
2021-05-25 21:32:20 +02:00
{
2024-08-02 15:23:49 +02:00
m_data - > paths . clear_with_capacity ( ) ;
m_data - > paths . ensure_capacity ( paths . size ( ) ) ;
2023-08-06 16:32:44 +12:00
for ( auto const & segment : paths )
2024-08-10 13:12:19 +12:00
m_data - > paths . unchecked_append ( percent_encode ( segment , PercentEncodeSet : : Path ) ) ;
2021-05-25 21:32:20 +02:00
}
2025-08-16 16:34:43 +03:00
void URL : : set_raw_paths ( Vector < String > paths )
{
m_data - > paths = move ( paths ) ;
}
2023-08-06 16:32:44 +12:00
void URL : : append_path ( StringView path )
2023-04-09 14:21:00 +01:00
{
2024-08-10 13:12:19 +12:00
m_data - > paths . append ( percent_encode ( path , PercentEncodeSet : : Path ) ) ;
2023-04-09 14:21:00 +01:00
}
2023-07-26 20:54:36 +12:00
// https://url.spec.whatwg.org/#cannot-have-a-username-password-port
bool URL : : cannot_have_a_username_or_password_or_port ( ) const
{
// A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
2024-11-27 15:12:17 +00:00
return ! m_data - > host . has_value ( ) | | m_data - > host - > is_empty_host ( ) | | m_data - > scheme = = " file " sv ;
2023-07-26 20:54:36 +12:00
}
2023-07-31 20:23:53 +12:00
// https://url.spec.whatwg.org/#default-port
2024-03-18 16:22:27 +13:00
Optional < u16 > default_port_for_scheme ( StringView scheme )
2020-11-04 06:20:20 +00:00
{
2023-07-31 20:23:53 +12:00
// Spec defined mappings with port:
if ( scheme = = " ftp " )
return 21 ;
2021-05-23 23:31:16 +02:00
if ( scheme = = " http " )
2020-11-04 06:20:20 +00:00
return 80 ;
2021-05-23 23:31:16 +02:00
if ( scheme = = " https " )
2020-11-04 06:20:20 +00:00
return 443 ;
2023-07-31 20:23:53 +12:00
if ( scheme = = " ws " )
return 80 ;
if ( scheme = = " wss " )
return 443 ;
// NOTE: not in spec, but we support these too
2021-05-23 23:31:16 +02:00
if ( scheme = = " irc " )
2020-11-04 06:20:20 +00:00
return 6667 ;
2021-05-23 23:31:16 +02:00
if ( scheme = = " ircs " )
2020-11-04 06:20:20 +00:00
return 6697 ;
2023-07-31 20:23:53 +12:00
2023-08-13 11:00:56 +12:00
return { } ;
2020-11-04 06:20:20 +00:00
}
2025-04-19 16:43:17 +12:00
// Builds a file:// URL from an absolute filesystem path, an optional fragment and an optional hostname.
//
// - path:     Must be absolute (as judged by LexicalPath); otherwise an empty Optional is returned.
// - fragment: Appended after a '#' when non-empty.
// - hostname: Inserted verbatim between "file://" and the path.
//
// The URL is assembled as a string and run through the basic URL parser, so the result is
// whatever the parser produces for that string.
Optional<URL> create_with_file_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
{
    LexicalPath lexical_path(path);
    if (!lexical_path.is_absolute())
        return {};

    auto const& canonical_path = lexical_path.string();

    StringBuilder url_builder;
    url_builder.append("file://"sv);
    url_builder.append(hostname);
    url_builder.append(canonical_path);

    // Keep a trailing slash from the input so directory paths stay directory paths.
    // Skip this when the canonical path already ends in '/' (e.g. the root directory "/"):
    // appending a second slash would yield "file:////", which the URL parser turns into
    // an extra empty path segment ("//") instead of the root path.
    if (path.ends_with('/') && !canonical_path.ends_with('/'))
        url_builder.append('/');

    if (!fragment.is_empty()) {
        url_builder.append('#');
        url_builder.append(fragment);
    }

    return Parser::basic_parse(url_builder.string_view());
}
2025-04-19 16:43:17 +12:00
// Interprets the input as a URL first; if it does not parse as one, treats it as a
// filesystem path, canonicalizes it and builds a file:// URL from it.
Optional<URL> create_with_url_or_path(ByteString const& url_or_path)
{
    if (auto parsed_url = Parser::basic_parse(url_or_path); parsed_url.has_value())
        return parsed_url.release_value();

    ByteString canonical_path = LexicalPath::canonicalized_path(url_or_path);
    return create_with_file_scheme(canonical_path);
}
2024-03-18 16:22:27 +13:00
URL create_with_data ( StringView mime_type , StringView payload , bool is_base64 )
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 19:11:58 +02:00
{
URL url ;
2025-03-07 19:08:44 +13:00
url . set_has_an_opaque_path ( true ) ;
2023-08-12 16:52:41 +12:00
url . set_scheme ( " data " _string ) ;
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 19:11:58 +02:00
StringBuilder builder ;
builder . append ( mime_type ) ;
if ( is_base64 )
builder . append ( " ;base64 " sv ) ;
builder . append ( ' , ' ) ;
builder . append ( payload ) ;
2023-12-16 17:49:34 +03:30
url . set_paths ( { builder . to_byte_string ( ) } ) ;
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 19:11:58 +02:00
return url ;
}
2021-05-25 22:05:01 +02:00
// https://url.spec.whatwg.org/#special-scheme
2025-03-18 19:22:16 +13:00
ReadonlySpan < StringView > special_schemes ( )
{
static auto const schemes = to_array < StringView > ( {
" ftp " sv ,
" file " sv ,
" http " sv ,
" https " sv ,
" ws " sv ,
" wss " sv ,
} ) ;
return schemes ;
}
// https://url.spec.whatwg.org/#is-special
2024-03-18 16:22:27 +13:00
bool is_special_scheme ( StringView scheme )
2021-05-25 22:05:01 +02:00
{
2025-03-18 19:22:16 +13:00
return special_schemes ( ) . contains_slow ( scheme ) ;
2021-05-25 22:05:01 +02:00
}
2023-09-12 08:50:15 -07:00
// https://url.spec.whatwg.org/#url-path-serializer
2024-08-05 16:55:39 +12:00
String URL : : serialize_path ( ) const
2023-04-14 20:12:03 +01:00
{
2023-09-19 09:45:12 -07:00
// 1. If url has an opaque path, then return url’ s path.
2025-03-07 19:08:44 +13:00
if ( has_an_opaque_path ( ) )
2024-08-05 16:55:39 +12:00
return m_data - > paths [ 0 ] ;
2023-09-12 08:50:15 -07:00
2023-09-19 09:45:12 -07:00
// 2. Let output be the empty string.
2023-09-12 08:50:15 -07:00
StringBuilder output ;
2023-09-19 09:45:12 -07:00
// 3. For each segment of url’ s path: append U+002F (/) followed by segment to output.
2024-08-02 15:23:49 +02:00
for ( auto const & segment : m_data - > paths ) {
2023-09-12 08:50:15 -07:00
output . append ( ' / ' ) ;
2024-08-05 16:55:39 +12:00
output . append ( segment ) ;
2023-04-14 20:12:03 +01:00
}
2023-09-12 08:50:15 -07:00
2023-09-19 09:45:12 -07:00
// 4. Return output.
2024-08-05 16:55:39 +12:00
return output . to_string_without_validation ( ) ;
2023-04-14 20:12:03 +01:00
}
2025-02-14 15:31:43 +05:00
// Returns the percent-decoded serialized path, for use when the actual file on disk has to be accessed.
// On Windows, serialize_path() can produce a path like /C:/path/to/tst.htm, so the leading slash
// is stripped there to obtain a valid path.
ByteString URL::file_path() const
{
    auto decoded_path = percent_decode(serialize_path());
#ifdef AK_OS_WINDOWS
    if (decoded_path.starts_with('/'))
        return decoded_path.substring(1);
#endif
    return decoded_path;
}
2021-05-25 22:32:39 +02:00
// https://url.spec.whatwg.org/#concept-url-serializer
2024-12-03 22:31:33 +13:00
String URL : : serialize ( ExcludeFragment exclude_fragment ) const
2021-05-25 22:32:39 +02:00
{
2023-07-25 20:04:09 +12:00
// 1. Let output be url’ s scheme and U+003A (:) concatenated.
StringBuilder output ;
2024-08-02 15:23:49 +02:00
output . append ( m_data - > scheme ) ;
2023-07-25 20:04:09 +12:00
output . append ( ' : ' ) ;
// 2. If url’ s host is non-null:
2024-11-27 12:48:28 +00:00
if ( m_data - > host . has_value ( ) ) {
2023-07-25 20:04:09 +12:00
// 1. Append "//" to output.
output . append ( " // " sv ) ;
2021-05-25 22:32:39 +02:00
2023-07-25 20:04:09 +12:00
// 2. If url includes credentials, then:
2021-05-25 22:32:39 +02:00
if ( includes_credentials ( ) ) {
2023-07-25 20:04:09 +12:00
// 1. Append url’ s username to output.
2024-08-02 15:23:49 +02:00
output . append ( m_data - > username ) ;
2023-07-25 20:04:09 +12:00
// 2. If url’ s password is not the empty string, then append U+003A (:), followed by url’ s password, to output.
2024-08-02 15:23:49 +02:00
if ( ! m_data - > password . is_empty ( ) ) {
2023-07-25 20:04:09 +12:00
output . append ( ' : ' ) ;
2024-08-02 15:23:49 +02:00
output . append ( m_data - > password ) ;
2021-05-25 22:32:39 +02:00
}
2023-07-25 20:04:09 +12:00
// 3. Append U+0040 (@) to output.
output . append ( ' @ ' ) ;
2021-05-25 22:32:39 +02:00
}
2023-07-25 20:04:09 +12:00
// 3. Append url’ s host, serialized, to output.
2024-11-28 14:32:07 +00:00
output . append ( serialized_host ( ) ) ;
2023-07-25 20:04:09 +12:00
// 4. If url’ s port is non-null, append U+003A (:) followed by url’ s port, serialized, to output.
2024-08-02 15:23:49 +02:00
if ( m_data - > port . has_value ( ) )
output . appendff ( " :{} " , * m_data - > port ) ;
2021-05-25 22:32:39 +02:00
}
2023-07-25 20:04:09 +12:00
// 3. If url’ s host is null, url does not have an opaque path, url’ s path’ s size is greater than 1, and url’ s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
// 4. Append the result of URL path serializing url to output.
// FIXME: Implement this closer to spec steps.
2025-03-07 19:08:44 +13:00
if ( has_an_opaque_path ( ) ) {
2024-08-02 15:23:49 +02:00
output . append ( m_data - > paths [ 0 ] ) ;
2021-05-25 22:32:39 +02:00
} else {
2024-11-27 12:48:28 +00:00
if ( ! m_data - > host . has_value ( ) & & m_data - > paths . size ( ) > 1 & & m_data - > paths [ 0 ] . is_empty ( ) )
2023-07-25 20:04:09 +12:00
output . append ( " /. " sv ) ;
2024-08-02 15:23:49 +02:00
for ( auto & segment : m_data - > paths ) {
2023-07-25 20:04:09 +12:00
output . append ( ' / ' ) ;
output . append ( segment ) ;
2021-05-25 22:32:39 +02:00
}
}
2023-07-25 20:04:09 +12:00
// 5. If url’ s query is non-null, append U+003F (?), followed by url’ s query, to output.
2024-08-02 15:23:49 +02:00
if ( m_data - > query . has_value ( ) ) {
2023-07-25 20:04:09 +12:00
output . append ( ' ? ' ) ;
2024-08-02 15:23:49 +02:00
output . append ( * m_data - > query ) ;
2021-05-25 22:32:39 +02:00
}
2023-07-25 20:04:09 +12:00
// 6. If exclude fragment is false and url’ s fragment is non-null, then append U+0023 (#), followed by url’ s fragment, to output.
2024-08-02 15:23:49 +02:00
if ( exclude_fragment = = ExcludeFragment : : No & & m_data - > fragment . has_value ( ) ) {
2023-07-25 20:04:09 +12:00
output . append ( ' # ' ) ;
2024-08-02 15:23:49 +02:00
output . append ( * m_data - > fragment ) ;
2021-05-25 22:32:39 +02:00
}
2023-07-25 20:04:09 +12:00
// 7. Return output.
2024-12-03 22:31:33 +13:00
return output . to_string_without_validation ( ) ;
2021-05-25 22:32:39 +02:00
}
// https://url.spec.whatwg.org/#url-rendering
// NOTE: This does e.g. not display credentials.
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
// resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
2023-12-16 17:49:34 +03:30
ByteString URL : : serialize_for_display ( ) const
2021-05-25 22:32:39 +02:00
{
StringBuilder builder ;
2024-08-02 15:23:49 +02:00
builder . append ( m_data - > scheme ) ;
2021-05-25 22:32:39 +02:00
builder . append ( ' : ' ) ;
2024-11-27 12:48:28 +00:00
if ( m_data - > host . has_value ( ) ) {
2022-07-11 17:32:29 +00:00
builder . append ( " // " sv ) ;
2024-11-28 14:32:07 +00:00
builder . append ( serialized_host ( ) ) ;
2024-08-02 15:23:49 +02:00
if ( m_data - > port . has_value ( ) )
builder . appendff ( " :{} " , * m_data - > port ) ;
2021-05-25 22:32:39 +02:00
}
2025-03-07 19:08:44 +13:00
if ( has_an_opaque_path ( ) ) {
2024-08-02 15:23:49 +02:00
builder . append ( m_data - > paths [ 0 ] ) ;
2021-05-25 22:32:39 +02:00
} else {
2024-11-27 12:48:28 +00:00
if ( ! m_data - > host . has_value ( ) & & m_data - > paths . size ( ) > 1 & & m_data - > paths [ 0 ] . is_empty ( ) )
2022-07-11 17:32:29 +00:00
builder . append ( " /. " sv ) ;
2024-08-02 15:23:49 +02:00
for ( auto & segment : m_data - > paths ) {
2021-05-27 21:40:02 +02:00
builder . append ( ' / ' ) ;
2023-04-09 14:21:00 +01:00
builder . append ( segment ) ;
2021-05-25 22:32:39 +02:00
}
}
2024-08-02 15:23:49 +02:00
if ( m_data - > query . has_value ( ) ) {
2021-05-25 22:32:39 +02:00
builder . append ( ' ? ' ) ;
2024-08-02 15:23:49 +02:00
builder . append ( * m_data - > query ) ;
2021-05-25 22:32:39 +02:00
}
2024-08-02 15:23:49 +02:00
if ( m_data - > fragment . has_value ( ) ) {
2021-05-25 22:32:39 +02:00
builder . append ( ' # ' ) ;
2024-08-02 15:23:49 +02:00
builder . append ( * m_data - > fragment ) ;
2021-05-25 22:32:39 +02:00
}
2023-12-16 17:49:34 +03:30
return builder . to_byte_string ( ) ;
2021-05-25 22:32:39 +02:00
}
2021-09-13 22:18:14 +03:00
// https://url.spec.whatwg.org/#concept-url-origin
2024-10-05 17:03:51 +13:00
Origin URL : : origin ( ) const
2021-09-13 22:18:14 +03:00
{
2024-10-05 17:03:51 +13:00
// The origin of a URL url is the origin returned by running these steps, switching on url’ s scheme:
// -> "blob"
if ( scheme ( ) = = " blob " sv ) {
// 1. If url’ s blob URL entry is non-null, then return url’ s blob URL entry’ s environment’ s origin.
if ( blob_url_entry ( ) . has_value ( ) )
2025-01-19 18:12:46 +13:00
return blob_url_entry ( ) - > environment . origin ;
2024-10-05 17:03:51 +13:00
// 2. Let pathURL be the result of parsing the result of URL path serializing url.
auto path_url = Parser : : basic_parse ( serialize_path ( ) ) ;
2021-09-13 22:18:14 +03:00
2024-10-05 17:03:51 +13:00
// 3. If pathURL is failure, then return a new opaque origin.
2025-01-10 04:50:34 +13:00
if ( ! path_url . has_value ( ) )
2025-06-15 19:08:58 +12:00
return Origin : : create_opaque ( ) ;
2024-10-05 17:03:51 +13:00
// 4. If pathURL’ s scheme is "http", "https", or "file", then return pathURL’ s origin.
2025-01-10 04:50:34 +13:00
if ( path_url - > scheme ( ) . is_one_of ( " http " sv , " https " sv , " file " sv ) )
return path_url - > origin ( ) ;
2024-10-05 17:03:51 +13:00
// 5. Return a new opaque origin.
2025-06-15 19:08:58 +12:00
return Origin : : create_opaque ( ) ;
2021-09-13 22:18:14 +03:00
}
2024-10-05 17:03:51 +13:00
// -> "ftp"
// -> "http"
// -> "https"
// -> "ws"
// -> "wss"
if ( scheme ( ) . is_one_of ( " ftp " sv , " http " sv , " https " sv , " ws " sv , " wss " sv ) ) {
// Return the tuple origin (url’ s scheme, url’ s host, url’ s port, null).
2024-11-27 16:18:42 +00:00
return Origin ( scheme ( ) , host ( ) . value ( ) , port ( ) ) ;
2024-10-05 17:03:51 +13:00
}
// -> "file"
// AD-HOC: Our resource:// is basically an alias to file://
if ( scheme ( ) = = " file " sv | | scheme ( ) = = " resource " sv ) {
// Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin.
// Note: We must return an origin with the `file://' protocol for `file://' iframes to work from `file://' pages.
2024-11-27 16:18:42 +00:00
return Origin ( scheme ( ) , String { } , { } ) ;
2024-10-05 17:03:51 +13:00
}
// -> Otherwise
// Return a new opaque origin.
2025-06-15 19:08:58 +12:00
return Origin : : create_opaque ( ) ;
2021-09-13 22:18:14 +03:00
}
2021-06-01 10:58:27 +02:00
bool URL : : equals ( URL const & other , ExcludeFragment exclude_fragments ) const
2021-05-27 21:38:16 +02:00
{
2021-06-01 11:14:30 +02:00
if ( this = = & other )
return true ;
2021-05-27 21:38:16 +02:00
return serialize ( exclude_fragments ) = = other . serialize ( exclude_fragments ) ;
}
2024-03-18 16:22:27 +13:00
// Appends the percent-encoded form of code_point to builder: the code point is encoded as
// UTF-8 and every resulting byte is written as "%XX" with two uppercase hex digits.
// Code points above U+10FFFF are rejected with VERIFY_NOT_REACHED().
void append_percent_encoded(StringBuilder& builder, u32 code_point)
{
    // Work out the UTF-8 byte sequence first (1 to 4 bytes, depending on the code point's magnitude).
    u32 utf8_bytes[4] = {};
    size_t byte_count = 0;

    if (code_point <= 0x7f) {
        utf8_bytes[byte_count++] = code_point;
    } else if (code_point <= 0x07ff) {
        utf8_bytes[byte_count++] = ((code_point >> 6) & 0x1f) | 0xc0;
        utf8_bytes[byte_count++] = (code_point & 0x3f) | 0x80;
    } else if (code_point <= 0xffff) {
        utf8_bytes[byte_count++] = ((code_point >> 12) & 0x0f) | 0xe0;
        utf8_bytes[byte_count++] = ((code_point >> 6) & 0x3f) | 0x80;
        utf8_bytes[byte_count++] = (code_point & 0x3f) | 0x80;
    } else if (code_point <= 0x10ffff) {
        utf8_bytes[byte_count++] = ((code_point >> 18) & 0x07) | 0xf0;
        utf8_bytes[byte_count++] = ((code_point >> 12) & 0x3f) | 0x80;
        utf8_bytes[byte_count++] = ((code_point >> 6) & 0x3f) | 0x80;
        utf8_bytes[byte_count++] = (code_point & 0x3f) | 0x80;
    } else {
        VERIFY_NOT_REACHED();
    }

    for (size_t i = 0; i < byte_count; ++i)
        builder.appendff("%{:02X}", utf8_bytes[i]);
}
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
2024-03-18 16:22:27 +13:00
bool code_point_is_in_percent_encode_set ( u32 code_point , PercentEncodeSet set )
2021-05-25 13:50:03 +02:00
{
2023-12-29 17:15:11 +01:00
// NOTE: Once we've checked for presence in the C0Control set, we know that the code point is
// a valid ASCII character in the range 0x20..0x7E, so we can safely cast it to char.
2021-05-25 13:50:03 +02:00
switch ( set ) {
2024-03-18 16:22:27 +13:00
case PercentEncodeSet : : C0Control :
2021-05-25 13:50:03 +02:00
return code_point < 0x20 | | code_point > 0x7E ;
2024-03-18 16:22:27 +13:00
case PercentEncodeSet : : Fragment :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : C0Control ) | | " \" <>` " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : Query :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : C0Control ) | | " \" #<> " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : SpecialQuery :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Query ) | | code_point = = ' \' ' ;
case PercentEncodeSet : : Path :
2025-03-10 14:24:15 +05:30
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Query ) | | " ?^` { } " sv.contains(static_cast<char>(code_point));
2024-03-18 16:22:27 +13:00
case PercentEncodeSet : : Userinfo :
2025-03-10 14:24:15 +05:30
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Path ) | | " /: ; = @ [ \ \ ] | " sv.contains(static_cast<char>(code_point));
2024-03-18 16:22:27 +13:00
case PercentEncodeSet : : Component :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Userinfo ) | | " $%&+, " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : ApplicationXWWWFormUrlencoded :
return code_point_is_in_percent_encode_set ( code_point , PercentEncodeSet : : Component ) | | " !'()~ " sv . contains ( static_cast < char > ( code_point ) ) ;
case PercentEncodeSet : : EncodeURI :
2021-05-25 13:50:03 +02:00
// NOTE: This is the same percent encode set that JS encodeURI() uses.
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
2022-12-25 14:25:34 -05:00
return code_point > 0x7E | | ( ! is_ascii_alphanumeric ( code_point ) & & ! " ;,/?:@&=+$-_.!~*'()# " sv . contains ( static_cast < char > ( code_point ) ) ) ;
2021-05-25 13:50:03 +02:00
default :
VERIFY_NOT_REACHED ( ) ;
}
}
2024-03-18 16:22:27 +13:00
// Appends code_point to builder, percent-encoding it if it is a member of the given
// percent-encode set and appending it unchanged otherwise.
void append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, PercentEncodeSet set)
{
    if (!code_point_is_in_percent_encode_set(code_point, set)) {
        builder.append_code_point(code_point);
        return;
    }

    append_percent_encoded(builder, code_point);
}
2024-08-10 13:12:19 +12:00
// Percent-encodes input, iterated as UTF-8 code points, using the given percent-encode set.
// With SpaceAsPlus::Yes, a space is written as '+' instead of going through the percent-encode set.
String percent_encode(StringView input, PercentEncodeSet set, SpaceAsPlus space_as_plus)
{
    auto const encode_space_as_plus = space_as_plus == SpaceAsPlus::Yes;

    StringBuilder encoded;
    for (auto code_point : Utf8View(input)) {
        if (encode_space_as_plus && code_point == ' ') {
            encoded.append('+');
            continue;
        }
        append_percent_encoded_if_necessary(encoded, code_point, set);
    }
    return MUST(encoded.to_string());
}
2025-02-15 23:45:40 +13:00
// Creates an "about:" URL whose opaque path is the given path, stored as-is.
URL URL::about(String path)
{
    URL about_url;
    about_url.m_data->has_an_opaque_path = true;
    about_url.m_data->scheme = "about"_string;
    about_url.m_data->paths = { move(path) };
    return about_url;
}
2024-12-01 21:37:29 +13:00
// https://url.spec.whatwg.org/#percent-decode
2024-03-18 16:22:27 +13:00
ByteString percent_decode ( StringView input )
2021-05-25 13:50:03 +02:00
{
if ( ! input . contains ( ' % ' ) )
return input ;
2024-12-01 21:37:29 +13:00
// 1. Let output be an empty byte sequence.
2021-05-25 13:50:03 +02:00
StringBuilder builder ;
2024-12-01 21:37:29 +13:00
// 2. For each byte byte in input:
for ( size_t i = 0 ; i < input . length ( ) ; + + i ) {
// 1. If byte is not 0x25 (%), then append byte to output.
if ( input [ i ] ! = ' % ' ) {
builder . append ( input [ i ] ) ;
}
// 2. Otherwise, if byte is 0x25 (%) and the next two bytes after byte in input are not in the ranges 0x30 (0)
// to 0x39 (9), 0x41 (A) to 0x46 (F), and 0x61 (a) to 0x66 (f), all inclusive, append byte to output.
else if ( i + 2 > = input . length ( ) | | ! is_ascii_hex_digit ( input [ i + 1 ] ) | | ! is_ascii_hex_digit ( input [ i + 2 ] ) ) {
builder . append ( input [ i ] ) ;
}
// 3. Otherwise:
else {
// 1. Let bytePoint be the two bytes after byte in input, decoded, and then interpreted as hexadecimal number.
u8 byte_point = ( parse_ascii_hex_digit ( input [ i + 1 ] ) < < 4 ) | parse_ascii_hex_digit ( input [ i + 2 ] ) ;
// 2. Append a byte whose value is bytePoint to output.
builder . append ( byte_point ) ;
// 3. Skip the next two bytes in input.
i + = 2 ;
2021-05-25 13:50:03 +02:00
}
}
2023-12-16 17:49:34 +03:30
return builder . to_byte_string ( ) ;
2021-05-25 13:50:03 +02:00
}
2025-06-28 20:50:17 +12:00
// Asks the PublicSuffixData singleton whether host is a public suffix.
bool is_public_suffix(StringView host)
{
    auto* public_suffix_data = PublicSuffixData::the();
    return public_suffix_data->is_public_suffix(host);
}
2025-03-09 11:11:35 -04:00
// https://github.com/publicsuffix/list/wiki/Format#algorithm
// Returns the registrable domain of host, or an empty Optional if there is none
// (no public suffix found, host is itself a public suffix, or no label precedes the suffix).
Optional<String> get_registrable_domain(StringView host)
{
    // The registered or registrable domain is the public suffix plus one additional label.
    auto public_suffix = PublicSuffixData::the()->get_public_suffix(host);
    if (!public_suffix.has_value())
        return {};
    if (!host.ends_with(*public_suffix) || host == *public_suffix)
        return {};

    // Everything in front of the public suffix, without the dot(s) separating it from the suffix.
    auto const suffix_length = public_suffix->bytes_as_string_view().length();
    auto const labels_before_suffix = host.substring_view(0, host.length() - suffix_length).trim("."sv, TrimMode::Right);
    if (labels_before_suffix.is_empty())
        return {};

    // The registrable domain starts right after the last dot in front of the suffix,
    // or at the very beginning of the host if there is no such dot.
    auto const last_dot_index = labels_before_suffix.find_last('.');
    size_t const start_index = last_dot_index.has_value() ? *last_dot_index + 1 : 0;

    return MUST(String::from_utf8(host.substring_view(start_index)));
}
2019-08-10 17:27:56 +02:00
}