2020-06-03 16:05:49 -07:00
/*
2021-04-22 16:53:07 -07:00
* Copyright ( c ) 2020 , Matthew Olsson < mattco @ serenityos . org >
2020-06-03 16:05:49 -07:00
*
2021-04-22 01:24:48 -07:00
* SPDX - License - Identifier : BSD - 2 - Clause
2020-06-03 16:05:49 -07:00
*/
2020-11-19 01:50:00 +03:30
# include <AK/Function.h>
2022-10-16 14:57:29 +02:00
# include <LibJS/Runtime/AbstractOperations.h>
2020-06-03 16:05:49 -07:00
# include <LibJS/Runtime/GlobalObject.h>
# include <LibJS/Runtime/PrimitiveString.h>
2022-10-16 14:57:29 +02:00
# include <LibJS/Runtime/RegExpConstructor.h>
2020-06-03 16:05:49 -07:00
# include <LibJS/Runtime/RegExpObject.h>
2021-07-22 08:04:31 -04:00
# include <LibJS/Runtime/StringPrototype.h>
2020-06-03 16:05:49 -07:00
# include <LibJS/Runtime/Value.h>
2021-10-05 18:33:28 +01:00
# include <LibJS/Token.h>
2020-06-03 16:05:49 -07:00
namespace JS {
2022-12-04 18:02:33 +00:00
// Translates a RegExp flags string into the LibRegex option set.
//
// Starts from RegExpObject::default_flags and folds in one option per recognised flag character.
// On success the accumulated options are returned. If a flag character occurs twice, or is not one
// of d, g, i, m, s, u, y, v, the returned error is the corresponding formatted message.
Result<regex::RegexOptions<ECMAScriptFlags>, DeprecatedString> regex_flags_from_string(StringView flags)
{
    bool seen_d = false;
    bool seen_g = false;
    bool seen_i = false;
    bool seen_m = false;
    bool seen_s = false;
    bool seen_u = false;
    bool seen_y = false;
    bool seen_v = false;

    // Marks a flag as seen and reports whether it had already been seen before this call.
    auto was_already_seen = [](bool& seen_marker) {
        bool const previously_seen = seen_marker;
        seen_marker = true;
        return previously_seen;
    };

    // Builds the error message for a flag that appears more than once.
    auto repeated_flag_error = [](auto flag) {
        return DeprecatedString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), flag);
    };

    auto options = RegExpObject::default_flags;

    for (auto ch : flags) {
        switch (ch) {
        case 'd':
            // 'd' is only tracked for duplicate detection; it contributes no option here.
            if (was_already_seen(seen_d))
                return repeated_flag_error(ch);
            break;
        case 'g':
            if (was_already_seen(seen_g))
                return repeated_flag_error(ch);
            options |= regex::ECMAScriptFlags::Global;
            break;
        case 'i':
            if (was_already_seen(seen_i))
                return repeated_flag_error(ch);
            options |= regex::ECMAScriptFlags::Insensitive;
            break;
        case 'm':
            if (was_already_seen(seen_m))
                return repeated_flag_error(ch);
            options |= regex::ECMAScriptFlags::Multiline;
            break;
        case 's':
            if (was_already_seen(seen_s))
                return repeated_flag_error(ch);
            options |= regex::ECMAScriptFlags::SingleLine;
            break;
        case 'u':
            if (was_already_seen(seen_u))
                return repeated_flag_error(ch);
            options |= regex::ECMAScriptFlags::Unicode;
            break;
        case 'y':
            if (was_already_seen(seen_y))
                return repeated_flag_error(ch);
            // Sticky is the odd one out: it clears Global and then turns on the internal
            // "stateful" option together with Sticky. Per the original author's note, the other
            // flags imply 'global' and 'global' implies 'stateful', whereas sticky implies only
            // 'stateful' - hence the explicit reset followed by the two explicit sets.
            options.reset_flag(regex::ECMAScriptFlags::Global);
            options |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful;
            options |= regex::ECMAScriptFlags::Sticky;
            break;
        case 'v':
            if (was_already_seen(seen_v))
                return repeated_flag_error(ch);
            options |= regex::ECMAScriptFlags::UnicodeSets;
            break;
        default:
            return DeprecatedString::formatted(ErrorType::RegExpObjectBadFlag.message(), ch);
        }
    }

    return options;
}
2022-12-04 18:02:33 +00:00
// Rewrites a UTF-8 pattern into the text form handed to LibRegex.
//
// - `unicode` and `unicode_sets` together are rejected with the "incompatible flags" message.
// - With either flag set, the pattern is re-emitted code point by code point.
// - Otherwise it is re-emitted code unit by code unit, with every unit above 0x7f written as a
//   \uXXXX escape (unlike the spec, LibRegex needs such units escaped in order to parse them).
//
// A failed UTF-8 -> UTF-16 conversion is reported as an "Out of memory" error.
ErrorOr<DeprecatedString, ParseRegexPatternError> parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets)
{
    if (unicode && unicode_sets)
        return ParseRegexPatternError { DeprecatedString::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v') };

    auto conversion_result = AK::utf8_to_utf16(pattern);
    if (conversion_result.is_error())
        return ParseRegexPatternError { "Out of memory"sv };

    auto utf16_data = conversion_result.release_value();
    Utf16View utf16_view { utf16_data };

    bool const emit_code_points = unicode || unicode_sets;
    StringBuilder output;

    size_t index = 0;
    while (index < utf16_view.length_in_code_units()) {
        if (emit_code_points) {
            // Unicode modes: decode a full code point (possibly a surrogate pair) and advance past it.
            auto decoded = code_point_at(utf16_view, index);
            output.append_code_point(decoded.code_point);
            index += decoded.code_unit_count;
        } else {
            // Non-Unicode mode: handle exactly one UTF-16 code unit per iteration.
            u16 unit = utf16_view.code_unit_at(index);
            ++index;

            if (unit <= 0x7f)
                output.append_code_point(unit);
            else
                output.appendff("\\u{:04x}", unit);
        }
    }

    return output.to_deprecated_string();
}
2022-12-04 18:02:33 +00:00
// VM-aware wrapper around the non-throwing parse_regex_pattern() overload: a parse error is turned
// into a thrown SyntaxError carrying the error's message, a success is passed through unchanged.
ThrowCompletionOr<DeprecatedString> parse_regex_pattern(VM& vm, StringView pattern, bool unicode, bool unicode_sets)
{
    auto parse_result = parse_regex_pattern(pattern, unicode, unicode_sets);
    if (!parse_result.is_error())
        return parse_result.release_value();

    return vm.throw_completion<JS::SyntaxError>(parse_result.release_error().error);
}
2022-12-13 20:49:50 +00:00
// Allocates a RegExpObject on the realm's heap whose prototype is the realm's intrinsic RegExp
// prototype. No pattern, flags or compiled regex are supplied by this overload.
NonnullGCPtr<RegExpObject> RegExpObject::create(Realm& realm)
{
    auto& regexp_prototype = *realm.intrinsics().regexp_prototype();
    return realm.heap()
        .allocate<RegExpObject>(realm, regexp_prototype)
        .release_allocated_value_but_fixme_should_propagate_errors();
}
2022-12-13 20:49:50 +00:00
// Allocates a RegExpObject on the realm's heap from an already compiled regex plus its original
// pattern and flags strings, using the realm's intrinsic RegExp prototype. All three values are
// moved into the new object.
NonnullGCPtr<RegExpObject> RegExpObject::create(Realm& realm, Regex<ECMA262> regex, DeprecatedString pattern, DeprecatedString flags)
{
    auto& regexp_prototype = *realm.intrinsics().regexp_prototype();
    return realm.heap()
        .allocate<RegExpObject>(realm, move(regex), move(pattern), move(flags), regexp_prototype)
        .release_allocated_value_but_fixme_should_propagate_errors();
}
2021-08-20 09:14:27 -04:00
// Constructs a RegExpObject with the given prototype and nothing else; pattern, flags and the
// compiled regex are left at their default-initialised state.
RegExpObject::RegExpObject(Object& prototype)
    : Object(ConstructWithPrototypeTag::Tag, prototype)
{
}
2022-12-04 18:02:33 +00:00
// Constructs a RegExpObject around an already compiled regex, taking ownership of the regex and of
// the original pattern and flags strings.
RegExpObject::RegExpObject(Regex<ECMA262> regex, DeprecatedString pattern, DeprecatedString flags, Object& prototype)
    : Object(ConstructWithPrototypeTag::Tag, prototype)
    , m_pattern(move(pattern))
    , m_flags(move(flags))
    , m_regex(move(regex))
{
    // Callers must only hand over a regex that parsed without error.
    VERIFY(m_regex->parser_result.error == regex::Error::NoError);
}
2023-01-28 12:33:35 -05:00
// Runs the base-class initialisation and then installs the own "lastIndex" property with an
// initial value of 0 and only the Writable attribute set.
ThrowCompletionOr<void> RegExpObject::initialize(Realm& realm)
{
    MUST_OR_THROW_OOM(Base::initialize(realm));

    define_direct_property(vm().names.lastIndex, Value(0), Attribute::Writable);

    return {};
}
2021-08-20 09:14:27 -04:00
// 22.2.3.2.2 RegExpInitialize ( obj, pattern, flags ), https://tc39.es/ecma262/#sec-regexpinitialize
2022-10-16 15:17:01 +02:00
// Initialises this object from a pattern value and a flags value: both are stringified (undefined
// becomes the empty string), the flags are validated, the pattern is compiled, the results are
// stored on the object and "lastIndex" is reset to 0. Throws a SyntaxError for invalid flags or a
// pattern that fails to compile; returns this object on success.
ThrowCompletionOr<NonnullGCPtr<RegExpObject>> RegExpObject::regexp_initialize(VM& vm, Value pattern_value, Value flags_value)
{
    // NOTE: This also contains changes adapted from https://arai-a.github.io/ecma262-compare/?pr=2418, which doesn't match the upstream spec anymore.

    // 1. If pattern is undefined, let P be the empty String.
    // 2. Else, let P be ? ToString(pattern).
    auto pattern = DeprecatedString::empty();
    if (!pattern_value.is_undefined())
        pattern = TRY(pattern_value.to_deprecated_string(vm));

    // 3. If flags is undefined, let F be the empty String.
    // 4. Else, let F be ? ToString(flags).
    auto flags = DeprecatedString::empty();
    if (!flags_value.is_undefined())
        flags = TRY(flags_value.to_deprecated_string(vm));

    // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", or "y" or if it contains the same code unit more than once, throw a SyntaxError exception.
    //    (The step text above omits it, but regex_flags_from_string() accepts "v" as well.)
    // 6. If F contains "i", let i be true; else let i be false.
    // 7. If F contains "m", let m be true; else let m be false.
    // 8. If F contains "s", let s be true; else let s be false.
    // 9. If F contains "u", let u be true; else let u be false.
    // 10. If F contains "v", let v be true; else let v be false.
    auto flag_parse_result = regex_flags_from_string(flags);
    if (flag_parse_result.is_error())
        return vm.throw_completion<SyntaxError>(flag_parse_result.release_error());
    auto regex_options = flag_parse_result.release_value();

    // An empty pattern skips the rewriting step entirely and is compiled as the empty string.
    auto pattern_text = DeprecatedString::empty();
    if (!pattern.is_empty()) {
        bool const u_flag = regex_options.has_flag_set(regex::ECMAScriptFlags::Unicode);
        bool const v_flag = regex_options.has_flag_set(regex::ECMAScriptFlags::UnicodeSets);

        // 11. If u is true, then
        //     a. Let patternText be StringToCodePoints(P).
        // 12. Else,
        //     a. Let patternText be the result of interpreting each of P's 16-bit elements as a Unicode BMP code point. UTF-16 decoding is not applied to the elements.
        // 13. Let parseResult be ParsePattern(patternText, u, v).
        pattern_text = TRY(parse_regex_pattern(vm, pattern, u_flag, v_flag));
    }

    // 14. If parseResult is a non-empty List of SyntaxError objects, throw a SyntaxError exception.
    Regex<ECMA262> compiled_regex(move(pattern_text), regex_options);
    if (compiled_regex.parser_result.error != regex::Error::NoError)
        return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, compiled_regex.error_string());

    // 15. Assert: parseResult is a Pattern Parse Node.
    VERIFY(compiled_regex.parser_result.error == regex::Error::NoError);

    // 16. Set obj.[[OriginalSource]] to P.
    m_pattern = move(pattern);

    // 17. Set obj.[[OriginalFlags]] to F.
    m_flags = move(flags);

    // 18. Let capturingGroupsCount be CountLeftCapturingParensWithin(parseResult).
    // 19. Let rer be the RegExp Record { [[IgnoreCase]]: i, [[Multiline]]: m, [[DotAll]]: s, [[Unicode]]: u, [[CapturingGroupsCount]]: capturingGroupsCount }.
    // 20. Set obj.[[RegExpRecord]] to rer.
    // 21. Set obj.[[RegExpMatcher]] to CompilePattern of parseResult with argument rer.
    m_regex = move(compiled_regex);

    // 22. Perform ? Set(obj, "lastIndex", +0𝔽, true).
    TRY(set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));

    // 23. Return obj.
    return NonnullGCPtr { *this };
}
2021-10-05 18:33:28 +01:00
// 22.2.3.2.5 EscapeRegExpPattern ( P, F ), https://tc39.es/ecma262/#sec-escaperegexppattern
2022-12-04 18:02:33 +00:00
// Produces the "source" text for this regular expression: the stored pattern with '/' and every
// LineTerminator (LF, CR, U+2028, U+2029) replaced by an escape sequence, so that "/" + S + "/" +
// flags can be parsed back as a RegularExpressionLiteral. An empty pattern yields "(?:)".
//
// Escape sequences already present in the pattern are copied through untouched, with one exception:
// a backslash followed by a *raw* line terminator is rewritten to the named escape (e.g. "\n"),
// because a raw line terminator may not appear inside a regular expression literal even after a
// backslash. The rewritten escape matches the same character as the original.
DeprecatedString RegExpObject::escape_regexp_pattern() const
{
    // 1. Let S be a String in the form of a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") equivalent
    //    to P interpreted as UTF-16 encoded Unicode code points (6.1.4), in which certain code points are escaped as
    //    described below. S may or may not be identical to P; however, the Abstract Closure that would result from
    //    evaluating S as a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") must behave identically to
    //    the Abstract Closure given by the constructed object's [[RegExpMatcher]] internal slot. Multiple calls to
    //    this abstract operation using the same values for P and F must produce identical results.
    // 2. The code points / or any LineTerminator occurring in the pattern shall be escaped in S as necessary to ensure
    //    that the string-concatenation of "/", S, "/", and F can be parsed (in an appropriate lexical context) as a
    //    RegularExpressionLiteral that behaves identically to the constructed regular expression. For example, if P is
    //    "/", then S could be "\/" or "\u002F", among other possibilities, but not "/", because /// followed by F
    //    would be parsed as a SingleLineComment rather than a RegularExpressionLiteral. If P is the empty String, this
    //    specification can be met by letting S be "(?:)".
    // 3. Return S.
    if (m_pattern.is_empty())
        return "(?:)";

    // FIXME: Check the 'u' and 'v' flags and escape accordingly
    StringBuilder builder;

    // Appends the escape sequence for '/' or a LineTerminator and returns true; returns false
    // (appending nothing) for every other code point.
    auto append_escape_sequence_if_needed = [&builder](u32 code_point) {
        switch (code_point) {
        case '/':
            builder.append("\\/"sv);
            return true;
        case '\n':
            builder.append("\\n"sv);
            return true;
        case '\r':
            builder.append("\\r"sv);
            return true;
        case LINE_SEPARATOR:
            builder.append("\\u2028"sv);
            return true;
        case PARAGRAPH_SEPARATOR:
            builder.append("\\u2029"sv);
            return true;
        default:
            return false;
        }
    };

    auto pattern = Utf8View { m_pattern };
    auto escaped = false;

    for (auto code_point : pattern) {
        if (escaped) {
            // The previous code point was a backslash that has not been emitted yet.
            escaped = false;

            // "\/" stays "\/"; a backslash followed by a raw line terminator becomes the named
            // escape. Anything else is an existing escape sequence and is copied verbatim.
            if (!append_escape_sequence_if_needed(code_point)) {
                builder.append_code_point('\\');
                builder.append_code_point(code_point);
            }
            continue;
        }

        if (code_point == '\\') {
            // Hold the backslash back until we know what it escapes.
            escaped = true;
            continue;
        }

        if (!append_escape_sequence_if_needed(code_point))
            builder.append_code_point(code_point);
    }

    return builder.to_deprecated_string();
}
2021-08-20 09:14:27 -04:00
// 22.2.3.2.4 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
2022-10-16 14:57:29 +02:00
// Allocates a RegExp object using the current realm's intrinsic %RegExp% constructor as the new
// target and initialises it from the given pattern and flags values. Any exception raised during
// initialisation is propagated to the caller.
ThrowCompletionOr<NonnullGCPtr<RegExpObject>> regexp_create(VM& vm, Value pattern, Value flags)
{
    auto& realm = *vm.current_realm();
    auto& regexp_constructor = *realm.intrinsics().regexp_constructor();

    // 1. Let obj be ! RegExpAlloc(%RegExp%).
    auto regexp_object = MUST(regexp_alloc(vm, regexp_constructor));

    // 2. Return ? RegExpInitialize(obj, P, F).
    return regexp_object->regexp_initialize(vm, pattern, flags);
}
2022-10-16 14:57:29 +02:00
// 22.2.3.2 RegExpAlloc ( newTarget ), https://tc39.es/ecma262/#sec-regexpalloc
2022-10-17 08:59:27 +08:00
// 22.2.3.2 RegExpAlloc ( newTarget ), https://github.com/tc39/proposal-regexp-legacy-features#regexpalloc--newtarget-
2022-10-16 14:57:29 +02:00
// Creates an uninitialised RegExp object for the given new target: records the current realm on it,
// enables legacy features only when the new target is the current realm's own %RegExp% constructor,
// and defines the writable, non-enumerable, non-configurable "lastIndex" property.
ThrowCompletionOr<NonnullGCPtr<RegExpObject>> regexp_alloc(VM& vm, FunctionObject& new_target)
{
    // 1. Let obj be ? OrdinaryCreateFromConstructor(newTarget, "%RegExp.prototype%", « [[OriginalSource]], [[OriginalFlags]], [[RegExpRecord]], [[RegExpMatcher]] »).
    auto regexp_object = TRY(ordinary_create_from_constructor<RegExpObject>(vm, new_target, &Intrinsics::regexp_prototype));

    // 2. Let thisRealm be the current Realm Record.
    auto& this_realm = *vm.current_realm();

    // 3. Set the value of obj's [[Realm]] internal slot to thisRealm.
    regexp_object->set_realm(this_realm);

    // 4. If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true, then
    //     i. Set the value of obj's [[LegacyFeaturesEnabled]] internal slot to true.
    // 5. Else,
    //     i. Set the value of obj's [[LegacyFeaturesEnabled]] internal slot to false.
    auto* regexp_constructor = this_realm.intrinsics().regexp_constructor();
    bool const new_target_is_intrinsic_regexp = same_value(&new_target, regexp_constructor);
    regexp_object->set_legacy_features_enabled(new_target_is_intrinsic_regexp);

    // 6. Perform ! DefinePropertyOrThrow(obj, "lastIndex", PropertyDescriptor { [[Writable]]: true, [[Enumerable]]: false, [[Configurable]]: false }).
    PropertyDescriptor last_index_descriptor { .writable = true, .enumerable = false, .configurable = false };
    MUST(regexp_object->define_property_or_throw(vm.names.lastIndex, last_index_descriptor));

    // 7. Return obj.
    return regexp_object;
}
2020-06-03 16:05:49 -07:00
}