2023-07-23 19:38:04 +03:00
/*
 * Copyright (c) 2023, Jonatan Klemets <jonatan.r.klemets@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
# include <AK/GenericLexer.h>
2025-06-26 19:06:46 -04:00
# include <AK/StringConversions.h>
2025-07-26 12:19:56 -04:00
# include <AK/Utf16String.h>
2023-07-23 19:38:04 +03:00
# include <LibWeb/HTML/Numbers.h>
# include <LibWeb/Infra/CharacterTypes.h>
2023-11-15 19:54:01 +01:00
# include <math.h>
2023-07-23 19:38:04 +03:00
namespace Web : : HTML {
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-integers
2024-12-01 00:04:32 +00:00
// Extracts the textual form of an integer (optional sign followed by ASCII digits) from the
// start of `string`, after skipping leading ASCII whitespace. Returns a view into the input,
// or an empty Optional if no integer is present. Numeric conversion is left to the caller.
Optional<StringView> parse_integer_digits(StringView string)
{
    // 1. Let input be the string being parsed.
    // 2. Let position be a pointer into input, initially pointing at the start of the string.
    GenericLexer lexer { string };

    // 3. Let sign have the value "positive".
    // NOTE: Not tracked separately; the sign character is kept as part of the returned view (see step 6).

    // 4. Skip ASCII whitespace within input given position.
    lexer.ignore_while(Web::Infra::is_ascii_whitespace);

    // 5. If position is past the end of input, return an error.
    if (lexer.is_eof())
        return OptionalNone {};

    // 6. If the character indicated by position (the first character) is a U+002D HYPHEN-MINUS character (-):
    //
    // NOTE: Rather than recording the sign, we remember where the (possibly signed) number begins
    //       and let the caller's integer conversion interpret the leading '-' or '+'.
    auto const number_start = lexer.tell();
    if (lexer.next_is('-') || lexer.next_is('+'))
        lexer.consume();

    // 7. If the character indicated by position is not an ASCII digit, then return an error.
    if (!lexer.next_is(is_ascii_digit))
        return OptionalNone {};

    // 8. Collect a sequence of code points that are ASCII digits from input given position, and interpret the
    //    resulting sequence as a base-ten integer. Let value be that integer.
    // NOTE: Integer conversion is performed by the caller.
    lexer.consume_while(is_ascii_digit);
    auto const number_length = lexer.tell() - number_start;

    // 9. If sign is "positive", return value, otherwise return the result of subtracting value from zero.
    // NOTE: Not needed here; the sign travels with the returned view.
    return lexer.input().substring_view(number_start, number_length);
}
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-integers
// Parses `string` as a signed 32-bit integer: the sign/digit run is located by
// parse_integer_digits() and then converted. Returns an empty Optional when no integer is
// found or when the conversion itself yields no value.
Optional<i32> parse_integer(StringView string)
{
    if (auto digits = parse_integer_digits(string); digits.has_value())
        return digits->to_number<i32>(TrimWhitespace::No);

    return {};
}
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-non-negative-integers
2024-12-01 00:04:32 +00:00
// Extracts the textual form of a non-negative integer from `string`. Returns a view into the
// input (which may still carry a leading '+', or a leading '-' when the value is zero), or an
// empty Optional if no integer is present or the integer is negative. Numeric conversion is
// left to the caller.
Optional<StringView> parse_non_negative_integer_digits(StringView string)
{
    // 1. Let input be the string being parsed.
    // 2. Let value be the result of parsing input using the rules for parsing integers.
    //
    // NOTE: We reuse `parse_integer_digits`, so we work on the signed textual form instead of a
    //       converted number.
    auto optional_integer_digits = parse_integer_digits(string);

    // 3. If value is an error, return an error.
    if (!optional_integer_digits.has_value())
        return OptionalNone {};

    // 4. If value is less than zero, return an error.
    //
    // NOTE: A leading '-' only makes the value negative if some digit after it is non-zero:
    //       "-0" and "-000" are zero and therefore accepted, while "-1", "-01" and "-007" are
    //       negative. Every digit has to be inspected; looking only at the first one would let
    //       "-01" through.
    if (optional_integer_digits->starts_with('-')) {
        auto const bytes = optional_integer_digits->bytes();
        for (size_t i = 1; i < optional_integer_digits->length(); ++i) {
            if (bytes.at(i) != '0')
                return OptionalNone {};
        }
    }

    // 5. Return value.
    // NOTE: Integer conversion is performed by the caller.
    return optional_integer_digits;
}
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-non-negative-integers
// Parses `string` as a non-negative integer that fits in a u32. Returns an empty Optional when
// no non-negative integer is found, when the conversion yields no value, or when the value
// lies outside [0, NumericLimits<u32>::max()].
Optional<u32> parse_non_negative_integer(StringView string)
{
    auto optional_digits = parse_non_negative_integer_digits(string);
    if (!optional_digits.has_value())
        return {};

    // Convert through i64 so that every u32 value, as well as a signed textual form, is representable.
    auto optional_value = optional_digits->to_number<i64>(TrimWhitespace::No);
    if (!optional_value.has_value())
        return {};

    // Range-check on both sides before narrowing: a negative i64 would otherwise pass the upper
    // bound check and wrap around to a large u32 in the cast below.
    if (*optional_value < 0 || *optional_value > NumericLimits<u32>::max())
        return {};

    return static_cast<u32>(*optional_value);
}
2023-11-15 19:54:01 +01:00
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-floating-point-number-values
// Parses a floating-point number from the start of `string` following the HTML "rules for
// parsing floating-point number values". Leading ASCII whitespace is skipped and trailing
// garbage after the number is ignored. Returns an empty Optional on error (no number present,
// or the result is not a finite double); negative zero is returned as positive zero.
Optional<double> parse_floating_point_number(StringView string)
{
    // 1. Let input be the string being parsed.
    // 2. Let position be a pointer into input, initially pointing at the start of the string.
    GenericLexer lexer { string };

    // 3. Let value have the value 1.
    double value = 1;

    // 4. Let divisor have the value 1.
    double divisor = 1;

    // 5. Let exponent have the value 1.
    // NOTE: This is an i32 so that multiplying it by the parsed i32 exponent digits cannot be
    //       narrowed into a wrong (possibly sign-flipped) exponent.
    i32 exponent = 1;

    // 6. Skip ASCII whitespace within input given position.
    lexer.ignore_while(Web::Infra::is_ascii_whitespace);

    // 7. If position is past the end of input, return an error.
    if (lexer.is_eof())
        return {};

    // 8. If the character indicated by position is a U+002D HYPHEN-MINUS character (-):
    if (lexer.next_is('-')) {
        // 8.1. Change value and divisor to −1.
        value = -1;
        divisor = -1;

        // 8.2. Advance position to the next character.
        lexer.consume();

        // 8.3. If position is past the end of input, return an error.
        if (lexer.is_eof())
            return {};
    }
    // Otherwise, if the character indicated by position (the first character) is a U+002B PLUS SIGN character (+):
    else if (lexer.next_is('+')) {
        // 8.1. Advance position to the next character. (The "+" is ignored, but it is not conforming.)
        lexer.consume();

        // 8.2. If position is past the end of input, return an error.
        if (lexer.is_eof())
            return {};
    }

    // 9. If the character indicated by position is a U+002E FULL STOP (.),
    //    and that is not the last character in input,
    //    and the character after the character indicated by position is an ASCII digit,
    //    then set value to zero and jump to the step labeled fraction.
    if (lexer.next_is('.') && (lexer.tell_remaining() > 1) && is_ascii_digit(lexer.peek(1))) {
        value = 0;
        goto fraction;
    }

    // 10. If the character indicated by position is not an ASCII digit, then return an error.
    if (!lexer.next_is(is_ascii_digit))
        return {};

    // 11. Collect a sequence of code points that are ASCII digits from input given position, and interpret the
    //     resulting sequence as a base-ten integer. Multiply value by that integer.
    // NOTE: The locals live in their own scope because `goto fraction` above jumps past them.
    {
        auto integer_digits = lexer.consume_while(is_ascii_digit);
        auto optional_value = integer_digits.to_number<double>(TrimWhitespace::No);

        // If the digit run cannot be converted to a double at all, report an error rather than
        // unwrapping an empty Optional.
        if (!optional_value.has_value())
            return {};
        value *= optional_value.value();
    }

    // 12. If position is past the end of input, jump to the step labeled conversion.
    if (lexer.is_eof())
        goto conversion;

fraction:
    // 13. Fraction: If the character indicated by position is a U+002E FULL STOP (.), run these substeps:
    if (lexer.next_is('.')) {
        // 13.1. Advance position to the next character.
        lexer.consume();

        // 13.2. If position is past the end of input,
        //       or if the character indicated by position is not an ASCII digit,
        //       U+0065 LATIN SMALL LETTER E (e), or U+0045 LATIN CAPITAL LETTER E (E),
        //       then jump to the step labeled conversion.
        if (lexer.is_eof() || (!lexer.next_is(is_ascii_digit) && !lexer.next_is('e') && !lexer.next_is('E')))
            goto conversion;

        // 13.3. If the character indicated by position is a U+0065 LATIN SMALL LETTER E character (e) or a
        //       U+0045 LATIN CAPITAL LETTER E character (E), skip the remainder of these substeps.
        if (!lexer.next_is('e') && !lexer.next_is('E')) {
            // fraction_loop:
            while (true) {
                // 13.4. Fraction loop: Multiply divisor by ten.
                divisor *= 10;

                // 13.5. Add the value of the character indicated by position, interpreted as a base-ten digit
                //       (0..9) and divided by divisor, to value.
                value += (lexer.peek() - '0') / divisor;

                // 13.6. Advance position to the next character.
                lexer.consume();

                // 13.7. If position is past the end of input, then jump to the step labeled conversion.
                if (lexer.is_eof())
                    goto conversion;

                // 13.8. If the character indicated by position is an ASCII digit, jump back to the step labeled
                //       fraction loop in these substeps.
                if (!lexer.next_is(is_ascii_digit))
                    break;
            }
        }
    }

    // 14. If the character indicated by position is U+0065 (e) or a U+0045 (E), then:
    if (lexer.next_is('e') || lexer.next_is('E')) {
        // 14.1. Advance position to the next character.
        lexer.consume();

        // 14.2. If position is past the end of input, then jump to the step labeled conversion.
        if (lexer.is_eof())
            goto conversion;

        // 14.3. If the character indicated by position is a U+002D HYPHEN-MINUS character (-):
        if (lexer.next_is('-')) {
            // 14.3.1. Change exponent to −1.
            exponent = -1;

            // 14.3.2. Advance position to the next character.
            lexer.consume();

            // 14.3.3. If position is past the end of input, then jump to the step labeled conversion.
            if (lexer.is_eof())
                goto conversion;
        }
        // Otherwise, if the character indicated by position is a U+002B PLUS SIGN character (+):
        else if (lexer.next_is('+')) {
            // 14.3.1. Advance position to the next character.
            lexer.consume();

            // 14.3.2. If position is past the end of input, then jump to the step labeled conversion.
            if (lexer.is_eof())
                goto conversion;
        }

        // 14.4. If the character indicated by position is not an ASCII digit, then jump to the step labeled conversion.
        if (!lexer.next_is(is_ascii_digit))
            goto conversion;

        // 14.5. Collect a sequence of code points that are ASCII digits from input given position, and interpret
        //       the resulting sequence as a base-ten integer. Multiply exponent by that integer.
        {
            auto exponent_digits = lexer.consume_while(is_ascii_digit);
            auto optional_value = exponent_digits.to_number<i32>();

            // A digit run that does not fit in an i32 is saturated instead of unwrapping an empty
            // Optional. Any exponent of that magnitude already scales a non-zero value to
            // infinity (positive exponent) or to zero (negative exponent), so the result is the same.
            // exponent is still ±1 here, so the product cannot overflow.
            exponent *= optional_value.value_or(NumericLimits<i32>::max());
        }

        // 14.6. Multiply value by ten raised to the exponentth power.
        // NOTE: Zero stays zero for every exponent; skipping the multiplication avoids computing
        //       0 * infinity (NaN) when the power of ten overflows.
        if (value != 0)
            value *= pow(10.0, static_cast<double>(exponent));
    }

conversion:
    // 15. Conversion: Let S be the set of finite IEEE 754 double-precision floating-point values except −0,
    //     but with two special values added: 2^1024 and −2^1024.
    if (!isfinite(value))
        return {};

    // −0 is not a member of S; return positive zero instead.
    if ((value == 0) && signbit(value))
        return 0;

    // 16. Let rounded-value be the number in S that is closest to value, selecting the number with an even
    //     significand if there are two equally close values.
    //     (The two special values 2^1024 and −2^1024 are considered to have even significands for this purpose.)
    double rounded_value = value;

    // 17. If rounded-value is 2^1024 or −2^1024, return an error.
    // NOTE(review): 2^1024 is not representable as a finite double, so this is presumably already
    //               covered by the isfinite() check above; kept to mirror the spec step.
    if (abs(rounded_value) >= pow(2, 1024))
        return {};

    // 18. Return rounded-value.
    return rounded_value;
}
2025-07-26 12:19:56 -04:00
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-floating-point-number-values
// UTF-16 overload: forwards to the StringView parser when the string has ASCII storage and
// reports an error (empty Optional) otherwise.
Optional<double> parse_floating_point_number(Utf16String const& string)
{
    // FIXME: Implement a UTF-16 GenericLexer.
    if (string.has_ascii_storage())
        return parse_floating_point_number(string.ascii_view());

    return {};
}
2024-08-27 18:25:01 +09:00
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#valid-floating-point-number
// Returns true if the whole of `string` matches the HTML "valid floating-point number" grammar:
// an optional '-', then digits and/or a '.' followed by digits, then an optional exponent part.
bool is_valid_floating_point_number(StringView string)
{
    GenericLexer lexer { string };

    // 1. Optionally, a U+002D HYPHEN-MINUS character (-).
    lexer.consume_specific('-');

    // 2. One or both of the following, in the given order:
    // 2.1. A series of one or more ASCII digits.
    auto const integer_digits = lexer.consume_while(is_ascii_digit);

    // 2.2. Both of the following, in the given order:
    // 2.2.1. A single U+002E FULL STOP character (.).
    if (lexer.consume_specific('.')) {
        // 2.2.2. A series of one or more ASCII digits.
        auto const fraction_digits = lexer.consume_while(is_ascii_digit);
        if (fraction_digits.is_empty())
            return false;
    } else if (integer_digits.is_empty()) {
        // Doesn't begin with digits, doesn't begin with a full stop followed by digits.
        return false;
    }

    // 3. Optionally:
    // 3.1. Either a U+0065 LATIN SMALL LETTER E character (e) or a U+0045 LATIN CAPITAL
    //      LETTER E character (E).
    bool const has_exponent_marker = lexer.consume_specific('e') || lexer.consume_specific('E');
    if (has_exponent_marker) {
        // 3.2. Optionally, a U+002D HYPHEN-MINUS character (-) or U+002B PLUS SIGN
        //      character (+).
        if (!lexer.consume_specific('-'))
            lexer.consume_specific('+');

        // 3.3. A series of one or more ASCII digits.
        if (lexer.consume_while(is_ascii_digit).is_empty())
            return false;
    }

    // Anything left over means the string is not a valid floating-point number.
    return lexer.is_eof();
}
2025-07-26 12:19:56 -04:00
// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#valid-floating-point-number
// UTF-16 overload: a string without ASCII storage is reported as invalid; otherwise the
// StringView validator decides.
bool is_valid_floating_point_number(Utf16String const& string)
{
    // FIXME: Implement a UTF-16 GenericLexer.
    return string.has_ascii_storage() && is_valid_floating_point_number(string.ascii_view());
}
2024-03-01 08:49:04 +01:00
// Converts `value` to its decimal string form. Negative values are rejected with an
// IndexSizeError created in `realm`.
WebIDL::ExceptionOr<String> convert_non_negative_integer_to_string(JS::Realm& realm, WebIDL::Long value)
{
    if (value >= 0)
        return String::number(value);

    return WebIDL::IndexSizeError::create(realm, "The attribute is limited to only non-negative numbers"_utf16);
}
2023-07-23 19:38:04 +03:00
}