diff --git a/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Libraries/LibJS/Runtime/RegExpPrototype.cpp index 10d7613ab83..c6c308fa8da 100644 --- a/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -271,21 +271,23 @@ static ThrowCompletionOr regexp_builtin_exec(VM& vm, RegExpObject& regexp // 22. Perform ! CreateDataPropertyOrThrow(A, "index", ð”½(lastIndex)). MUST(array->create_data_property_or_throw(vm.names.index, Value(match_index))); - // 24. Let match be the Match { [[StartIndex]]: lastIndex, [[EndIndex]]: e }. + // 23. Perform ! CreateDataPropertyOrThrow(A, "input", S). + MUST(array->create_data_property_or_throw(vm.names.input, string)); + + // 24. Let match be the Match Record { [[StartIndex]]: lastIndex, [[EndIndex]]: e }. auto match_indices = Match::create(match); // 25. Let indices be a new empty List. Vector> indices; - Vector captured_values; // 26. Let groupNames be a new empty List. - HashMap group_names; + Vector group_names; - // 27. Add match as the last element of indices. + // 27. Append match to indices. indices.append(move(match_indices)); - // 28. Let matchedValue be ! GetMatchString(S, match). - // 29. Perform ! CreateDataPropertyOrThrow(A, "0", matchedValue). + // 28. Let matchedSubstr be GetMatchString(S, match). + // 29. Perform ! CreateDataPropertyOrThrow(A, "0", matchedSubstr). MUST(array->create_data_property_or_throw(0, PrimitiveString::create(vm, match.view.u16_view()))); // 30. If R contains any GroupName, then @@ -295,11 +297,19 @@ static ThrowCompletionOr regexp_builtin_exec(VM& vm, RegExpObject& regexp // a. Let groups be undefined. // b. Let hasGroups be false. bool has_groups = result.n_named_capture_groups != 0; - auto groups_object = has_groups ? Object::create(realm, nullptr) : GC::Ptr {}; + auto groups = has_groups ? Object::create(realm, nullptr) : js_undefined(); - // 33. For each integer i such that i ≥ 1 and i ≤ n, in ascending order, do + // 32. Perform ! CreateDataPropertyOrThrow(A, "groups", groups). + MUST(array->create_data_property_or_throw(vm.names.groups, groups)); + + // 33. Let matchedGroupNames be a new empty List. + Vector matched_group_names; + Vector captured_values; + + // 34. For each integer i such that 1 ≤ i ≤ n, in ascending order, do for (size_t i = 1; i <= result.n_capture_groups; ++i) { - // a. Let captureI be ith element of r's captures List. + + // a. Let captureI be ith element of r.[[Captures]]. auto& capture = result.capture_group_matches[0][i - 1]; Value captured_value; @@ -310,23 +320,21 @@ static ThrowCompletionOr regexp_builtin_exec(VM& vm, RegExpObject& regexp captured_value = js_undefined(); // ii. Append undefined to indices. indices.append({}); - // iii. Append capture to indices. captured_values.append({}); } // c. Else, else { - // i. Let captureStart be captureI's startIndex. - // ii. Let captureEnd be captureI's endIndex. + // i. Let captureStart be captureI.[[StartIndex]]. + // ii. Let captureEnd be captureI.[[EndIndex]]. // iii. If fullUnicode is true, then - // 1. Set captureStart to ! GetStringIndex(S, Input, captureStart). - // 2. Set captureEnd to ! GetStringIndex(S, Input, captureEnd). - // iv. Let capture be the Match { [[StartIndex]]: captureStart, [[EndIndex]: captureEnd }. - // v. Let capturedValue be ! GetMatchString(S, capture). + // 1. Set captureStart to GetStringIndex(S, captureStart). + // 2. Set captureEnd to GetStringIndex(S, captureEnd). + // iv. Let capture be the Match Record { [[StartIndex]]: captureStart, [[EndIndex]]: captureEnd }. + // v. Let capturedValue be GetMatchString(S, capture). auto capture_as_utf16_string = Utf16String::from_utf16(capture.view.u16_view()); captured_value = PrimitiveString::create(vm, capture_as_utf16_string); // vi. Append capture to indices. indices.append(Match::create(capture)); - // vii. Append capturedValue to the end of capturedValues. captured_values.append(capture_as_utf16_string); } @@ -335,22 +343,51 @@ static ThrowCompletionOr regexp_builtin_exec(VM& vm, RegExpObject& regexp // e. If the ith capture of R was defined with a GroupName, then if (capture.capture_group_name >= 0) { - // i. Let s be the CapturingGroupName of the corresponding RegExpIdentifierName. + // i. Let s be the CapturingGroupName of that GroupName. auto group_name = Utf16FlyString::from_utf8(regex.parser_result.bytecode.get_string(capture.capture_group_name)); - // ii. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue). - MUST(groups_object->create_data_property_or_throw(group_name, captured_value)); - - // iii. Append s to groupNames. - group_names.set(move(group_name), Match::create(capture)); + // ii. If matchedGroupNames contains s, then + if (matched_group_names.contains_slow(group_name)) { + // 1. Assert: capturedValue is undefined. + VERIFY(captured_value.is_undefined()); + // 2. Append undefined to groupNames. + group_names.append({}); + } + // iii. Else, + else { + // 1. If capturedValue is not undefined, append s to matchedGroupNames. + if (!captured_value.is_undefined()) + matched_group_names.append(group_name); + // 2. NOTE: If there are multiple groups named s, groups may already have an s property at this point. + // However, because groups is an ordinary object whose properties are all writable data properties, + // the call to CreateDataPropertyOrThrow is nevertheless guaranteed to succeed. + // 3. Perform ! CreateDataPropertyOrThrow(groups, s, capturedValue). + MUST(groups.as_object().create_data_property_or_throw(group_name, captured_value)); + // 4. Append s to groupNames. + group_names.append(group_name.to_utf16_string()); + } } // f. Else, else { // i. Append undefined to groupNames. - // See the note in MakeIndicesArray for why this step is skipped. + group_names.append({}); } } + // Ensure named groups are enumerated in source order + if (has_groups) { + auto original_groups = groups; + groups = Object::create(realm, nullptr); + + for (auto const& group_name_str : regex.parser_result.capture_groups) { + auto group_name = Utf16FlyString::from_utf8(group_name_str); + auto value = original_groups.as_object().get_without_side_effects(group_name); + MUST(groups.as_object().create_data_property_or_throw(group_name, value)); + } + + MUST(array->set(vm.names.groups, groups, Object::ShouldThrowExceptions::Yes)); + } + // https://github.com/tc39/proposal-regexp-legacy-features#regexpbuiltinexec--r-s- // 5. Let thisRealm be the current Realm Record. auto* this_realm = &realm; @@ -370,24 +407,39 @@ static ThrowCompletionOr regexp_builtin_exec(VM& vm, RegExpObject& regexp } } - // 32. Perform ! CreateDataPropertyOrThrow(A, "groups", groups). - // NOTE: This step must be performed after the above loop in order for groups to be populated. - Value groups = has_groups ? groups_object : js_undefined(); - MUST(array->create_data_property_or_throw(vm.names.groups, groups)); - - // 34. If hasIndices is true, then + // 35. If hasIndices is true, then if (has_indices) { // a. Let indicesArray be MakeMatchIndicesIndexPairArray(S, indices, groupNames, hasGroups). - auto indices_array = make_match_indices_index_pair_array(vm, string->utf16_string_view(), indices, group_names, has_groups); - // b. Perform ! CreateDataProperty(A, "indices", indicesArray). - MUST(array->create_data_property(vm.names.indices, indices_array)); + HashMap indices_group_names; + for (size_t i = 0; i < group_names.size(); ++i) { + if (!group_names[i].is_empty()) { + auto& capture = result.capture_group_matches[0][i]; + if (!capture.view.is_null()) { + indices_group_names.set(Utf16FlyString { group_names[i] }, Match::create(capture)); + } + } + } + auto indices_array = make_match_indices_index_pair_array(vm, string->utf16_string_view(), indices, indices_group_names, has_groups); + + // Make sure indices.groups includes all named groups in source order + if (has_groups) { + auto& indices_groups_object = indices_array.as_object().get_without_side_effects(vm.names.groups).as_object(); + auto ordered_indices_groups_object = Object::create(realm, nullptr); + + for (auto const& group_name_str : regex.parser_result.capture_groups) { + auto group_name = Utf16FlyString::from_utf8(group_name_str); + auto value = indices_groups_object.get_without_side_effects(group_name); + MUST(ordered_indices_groups_object->create_data_property_or_throw(group_name, value)); + } + + MUST(indices_array.as_object().set(vm.names.groups, ordered_indices_groups_object, Object::ShouldThrowExceptions::Yes)); + } + + // b. Perform ! CreateDataPropertyOrThrow(A, "indices", indicesArray). + MUST(array->create_data_property_or_throw(vm.names.indices, indices_array)); } - // 23. Perform ! CreateDataPropertyOrThrow(A, "input", S). - // NOTE: This step is performed last to allow the string to be moved into the PrimitiveString::create() invocation. - MUST(array->create_data_property_or_throw(vm.names.input, string)); - - // 35. Return A. + // 36. Return A. return array; } diff --git a/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js index e861ace858d..982062fe855 100644 --- a/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js +++ b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js @@ -226,3 +226,105 @@ test("cached UTF-16 code point length", () => { expect(match.codePointAt(0)).toBe(0x1f600); }); + +test("named groups source order", () => { + // Test that named groups appear in source order, not match order + let re = /(?a)(?a)|(?b)(?b)/; + + let result1 = re.exec("aa"); + expect(Object.keys(result1.groups)).toEqual(["y", "x"]); + expect(result1.groups.y).toBe("a"); + expect(result1.groups.x).toBe("a"); + + let result2 = re.exec("bb"); + expect(Object.keys(result2.groups)).toEqual(["y", "x"]); + expect(result2.groups.y).toBe("b"); + expect(result2.groups.x).toBe("b"); +}); + +test("named groups all present in groups object", () => { + // Test that all named groups appear in groups object, even unmatched ones + let re = /(?.)|(?.)/u; + + let result = re.exec("abcd"); + expect(Object.getOwnPropertyNames(result.groups)).toEqual(["fst", "snd"]); + expect(result.groups.fst).toBe("a"); + expect(result.groups.snd).toBe(undefined); +}); + +test("named groups with hasIndices flag", () => { + // Test that indices.groups also contains all named groups in source order + let re = /(?.)|(?.)/du; + + let result = re.exec("abcd"); + expect(Object.getOwnPropertyNames(result.indices.groups)).toEqual(["fst", "snd"]); + expect(result.indices.groups.fst).toEqual([0, 1]); + expect(result.indices.groups.snd).toBe(undefined); +}); + +test("complex named groups ordering", () => { + // Test multiple groups in different order + let re = /(?c)|(?a)|(?b)/; + + let result1 = re.exec("a"); + expect(Object.keys(result1.groups)).toEqual(["third", "first", "second"]); + expect(result1.groups.third).toBe(undefined); + expect(result1.groups.first).toBe("a"); + expect(result1.groups.second).toBe(undefined); + + let result2 = re.exec("b"); + expect(Object.keys(result2.groups)).toEqual(["third", "first", "second"]); + expect(result2.groups.third).toBe(undefined); + expect(result2.groups.first).toBe(undefined); + expect(result2.groups.second).toBe("b"); + + let result3 = re.exec("c"); + expect(Object.keys(result3.groups)).toEqual(["third", "first", "second"]); + expect(result3.groups.third).toBe("c"); + expect(result3.groups.first).toBe(undefined); + expect(result3.groups.second).toBe(undefined); +}); + +test("forward references to named groups", () => { + // Self-reference inside group + let result1 = /(?\k\w)../.exec("bab"); + expect(result1).not.toBe(null); + expect(result1[0]).toBe("bab"); + expect(result1[1]).toBe("b"); + expect(result1.groups.a).toBe("b"); + + // Reference before group definition + let result2 = /\k(?b)\w\k/.exec("bab"); + expect(result2).not.toBe(null); + expect(result2[0]).toBe("bab"); + expect(result2[1]).toBe("b"); + expect(result2.groups.a).toBe("b"); + + let result3 = /(?b)\k(?a)\k/.exec("bab"); + expect(result3).not.toBe(null); + expect(result3[0]).toBe("bab"); + expect(result3[1]).toBe("b"); + expect(result3[2]).toBe("a"); + expect(result3.groups.a).toBe("a"); + expect(result3.groups.b).toBe("b"); + + // Backward reference + let result4 = /(?a)(?b)\k/.exec("aba"); + expect(result4).not.toBe(null); + expect(result4[0]).toBe("aba"); + expect(result4.groups.a).toBe("a"); + expect(result4.groups.b).toBe("b"); + + // Mixed forward/backward with alternation + let result5 = /(?a)(?b)\k|(?c)/.exec("aba"); + expect(result5).not.toBe(null); + expect(result5.groups.a).toBe("a"); + expect(result5.groups.b).toBe("b"); + expect(result5.groups.c).toBe(undefined); +}); + +test("invalid named group references", () => { + expect(() => { + new RegExp("(?x)\\k"); + }).toThrow(); +}); diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp index 74604826cd9..4a2d1cacbfc 100644 --- a/Libraries/LibRegex/RegexByteCode.cpp +++ b/Libraries/LibRegex/RegexByteCode.cpp @@ -609,12 +609,21 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M } case CharacterCompareType::Reference: { auto reference_number = ((size_t)m_bytecode->at(offset++)) - 1; - if (input.match_index >= state.capture_group_matches_size()) - return ExecutionResult::Failed_ExecuteLowPrioForks; + if (input.match_index >= state.capture_group_matches_size()) { + had_zero_length_match = true; + if (current_inversion_state()) + inverse_matched = true; + break; + } auto groups = state.capture_group_matches(input.match_index); - if (groups.size() <= reference_number) - return ExecutionResult::Failed_ExecuteLowPrioForks; + + if (groups.size() <= reference_number) { + had_zero_length_match = true; + if (current_inversion_state()) + inverse_matched = true; + break; + } auto str = groups.at(reference_number).view; @@ -628,6 +637,59 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M } break; } + case CharacterCompareType::NamedReference: { + auto reference_number = ((size_t)m_bytecode->at(offset++)) - 1; + + if (input.match_index >= state.capture_group_matches_size()) { + had_zero_length_match = true; + if (current_inversion_state()) + inverse_matched = true; + break; + } + + auto groups = state.capture_group_matches(input.match_index); + + if (groups.size() <= reference_number) { + had_zero_length_match = true; + if (current_inversion_state()) + inverse_matched = true; + break; + } + + RegexStringView str {}; + + auto reference_name_index = m_bytecode->get_group_name_index(reference_number); + + if (reference_name_index.has_value()) { + auto target_name_string = m_bytecode->get_string(reference_name_index.value()); + + for (size_t i = 0; i < groups.size(); ++i) { + if (groups[i].view.is_null()) + continue; + + auto group_name_index = m_bytecode->get_group_name_index(i); + + if (group_name_index.has_value()) { + auto group_name_string = m_bytecode->get_string(group_name_index.value()); + + if (group_name_string == target_name_string) { + str = groups[i].view; + break; + } + } + } + } + + if (input.view.length() < state.string_position + str.length()) { + return ExecutionResult::Failed_ExecuteLowPrioForks; + } + + if (compare_string(input, state, str, had_zero_length_match)) { + if (current_inversion_state()) + inverse_matched = true; + } + break; + } case CharacterCompareType::Property: { auto property = static_cast(m_bytecode->at(offset++)); compare_property(input, state, property, current_inversion_state(), inverse_matched); @@ -946,6 +1008,9 @@ Vector OpCode_Compare::flat_compares() const } else if (compare_type == CharacterCompareType::Reference) { auto ref = m_bytecode->at(offset++); result.append({ compare_type, ref }); + } else if (compare_type == CharacterCompareType::NamedReference) { + auto ref = m_bytecode->at(offset++); + result.append({ compare_type, ref }); } else if (compare_type == CharacterCompareType::String) { auto& length = m_bytecode->at(offset++); for (size_t k = 0; k < length; ++k) @@ -1028,6 +1093,24 @@ Vector OpCode_Compare::variable_arguments_to_byte_string(Optionalmatch_index, state().capture_group_matches_size() - 1)); } } + } else if (compare_type == CharacterCompareType::NamedReference) { + auto ref = m_bytecode->at(offset++); + result.empend(ByteString::formatted(" named_number={}", ref)); + if (input.has_value()) { + if (state().capture_group_matches_size() > input->match_index) { + auto match = state().capture_group_matches(input->match_index); + if (match.size() > ref) { + auto& group = match[ref]; + result.empend(ByteString::formatted(" left={}", group.left_column)); + result.empend(ByteString::formatted(" right={}", group.left_column + group.view.length_in_code_units())); + result.empend(ByteString::formatted(" contents='{}'", group.view)); + } else { + result.empend(ByteString::formatted(" (invalid ref {}, max={})", ref, match.size() - 1)); + } + } else { + result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches_size() - 1)); + } + } } else if (compare_type == CharacterCompareType::String) { auto& length = m_bytecode->at(offset++); StringBuilder str_builder; diff --git a/Libraries/LibRegex/RegexByteCode.h b/Libraries/LibRegex/RegexByteCode.h index 8cf285bd0c2..458a1a370e5 100644 --- a/Libraries/LibRegex/RegexByteCode.h +++ b/Libraries/LibRegex/RegexByteCode.h @@ -69,6 +69,7 @@ enum class OpCodeId : ByteCodeValueType { __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ @@ -261,6 +262,11 @@ public: FlyString get_string(size_t index) const { return m_string_table.get(index); } auto const& string_table() const { return m_string_table; } + Optional get_group_name_index(size_t group_index) const + { + return m_group_name_mappings.get(group_index); + } + void last_chunk() const = delete; void first_chunk() const = delete; @@ -279,6 +285,10 @@ public: m_string_table.m_table.set(entry.key, entry.value); } m_string_table.m_inverse_table.update(other.m_string_table.m_inverse_table); + + for (auto const& mapping : other.m_group_name_mappings) { + m_group_name_mappings.set(mapping.key, mapping.value); + } } } @@ -326,8 +336,11 @@ public: void insert_bytecode_group_capture_right(size_t capture_groups_count, FlyString name) { empend(static_cast(OpCodeId::SaveRightNamedCaptureGroup)); - empend(m_string_table.set(move(name))); + auto name_string_index = m_string_table.set(move(name)); + empend(name_string_index); empend(capture_groups_count); + + m_group_name_mappings.set(capture_groups_count - 1, name_string_index); } enum class LookAroundType { @@ -618,6 +631,7 @@ private: static bool s_opcodes_initialized; static size_t s_next_checkpoint_serial_id; StringTable m_string_table; + HashMap m_group_name_mappings; }; #define ENUMERATE_EXECUTION_RESULTS \ diff --git a/Libraries/LibRegex/RegexOptimizer.cpp b/Libraries/LibRegex/RegexOptimizer.cpp index 4a4738cad7c..00651711c8c 100644 --- a/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Libraries/LibRegex/RegexOptimizer.cpp @@ -131,6 +131,7 @@ static bool interpret_compares(Vector const& lhs, Stati // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it. return false; case CharacterCompareType::Reference: + case CharacterCompareType::NamedReference: // We've handled this before coming here. break; case CharacterCompareType::Property: @@ -512,6 +513,7 @@ static bool has_overlap(Vector const& lhs, Vector regex_options) @@ -182,10 +183,15 @@ Parser::Result Parser::parse(Optional regex_options) reset(); if (regex_options.has_value()) m_parser_state.regex_options = regex_options.value(); - if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) + if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) { consume(TokenType::Eof, Error::InvalidPattern); - else + if (!resolve_forward_named_references()) + set_error(Error::InvalidNameForCaptureGroup); + } else { set_error(Error::InvalidPattern); + } + + auto capture_groups = m_parser_state.named_capture_groups.keys(); dbgln_if(REGEX_DEBUG, "[PARSER] Produced bytecode with {} entries (opcodes + arguments)", m_parser_state.bytecode.size()); return { @@ -195,7 +201,7 @@ Parser::Result Parser::parse(Optional regex_options) move(m_parser_state.match_length_minimum), move(m_parser_state.error), move(m_parser_state.error_token), - m_parser_state.named_capture_groups.keys(), + move(capture_groups), m_parser_state.regex_options, }; } @@ -496,7 +502,6 @@ bool PosixBasicParser::parse_nonduplicating_re(ByteCode& bytecode, size_t& match if (try_skip({ backref_name, 2 })) { if (!m_capture_group_seen[i - 1]) return set_error(Error::InvalidNumber); - match_length_minimum += m_capture_group_minimum_lengths[i - 1]; bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)i } }); return true; } @@ -1640,24 +1645,32 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini } auto it = m_parser_state.named_capture_groups.find(name); - if (it == m_parser_state.named_capture_groups.end()) { - set_error(Error::InvalidNameForCaptureGroup); - return false; + if (it != m_parser_state.named_capture_groups.end()) { + + // Use the first occurrence of the named group for the backreference + // This follows ECMAScript behavior where \k refers to the first + // group with that name in left-to-right order, regardless of alternative + auto group_index = it->value.first().group_index; + auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index); + if (maybe_length.has_value()) { + // Backward reference + stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast(group_index) } }); + } else { + // Self-reference or forward reference + auto placeholder_index = 0; + auto bytecode_offset = stack.size(); + stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast(placeholder_index) } }); + + m_parser_state.unresolved_named_references.append({ name, bytecode_offset + 1 }); + } + } else { + // Forward reference + auto placeholder_index = 0; + auto bytecode_offset = stack.size(); + stack.insert_bytecode_compare_values({ { CharacterCompareType::NamedReference, static_cast(placeholder_index) } }); + + m_parser_state.unresolved_named_references.append({ name, bytecode_offset + 1 }); } - - // Use the first occurrence of the named group for the backreference - // This follows ECMAScript behavior where \k refers to the first - // group with that name in left-to-right order, regardless of alternative - auto group_index = it->value.first().group_index; - - auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index); - if (!maybe_length.has_value()) { - set_error(Error::InvalidNameForCaptureGroup); - return false; - } - - match_length_minimum += maybe_length.value(); - stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)group_index } }); return true; } @@ -2706,7 +2719,8 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi return false; } - m_parser_state.named_capture_groups.ensure(name).append({ group_index, m_current_alternative_id }); + auto& group_vector = m_parser_state.named_capture_groups.ensure(name); + group_vector.append({ group_index, m_current_alternative_id }); ByteCode capture_group_bytecode; size_t length = 0; @@ -2816,4 +2830,20 @@ size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis() return count; } +bool Parser::resolve_forward_named_references() +{ + for (auto const& unresolved_ref : m_parser_state.unresolved_named_references) { + auto it = m_parser_state.named_capture_groups.find(unresolved_ref.name); + if (it == m_parser_state.named_capture_groups.end()) { + return false; + } + + auto group_index = it->value.first().group_index; + + m_parser_state.bytecode.at(unresolved_ref.bytecode_offset) = (ByteCodeValueType)group_index; + } + + return true; +} + } diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h index f9d80081a39..9b2210a9bc5 100644 --- a/Libraries/LibRegex/RegexParser.h +++ b/Libraries/LibRegex/RegexParser.h @@ -90,6 +90,7 @@ public: protected: virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0; + bool resolve_forward_named_references(); ALWAYS_INLINE bool match(TokenType type) const; ALWAYS_INLINE bool match(char ch) const; @@ -120,7 +121,13 @@ protected: size_t repetition_mark_count { 0 }; AllOptions regex_options; HashMap capture_group_minimum_lengths; - HashMap> named_capture_groups; + OrderedHashMap> named_capture_groups; + + struct UnresolvedNamedReference { + FlyString name; + size_t bytecode_offset; + }; + Vector unresolved_named_references; explicit ParserState(Lexer& lexer) : lexer(lexer) diff --git a/Libraries/LibWeb/CSS/CSSImportRule.cpp b/Libraries/LibWeb/CSS/CSSImportRule.cpp index 975c9215f56..5e7e2ce18bf 100644 --- a/Libraries/LibWeb/CSS/CSSImportRule.cpp +++ b/Libraries/LibWeb/CSS/CSSImportRule.cpp @@ -2,6 +2,7 @@ * Copyright (c) 2021, the SerenityOS developers. * Copyright (c) 2021-2024, Sam Atkins * Copyright (c) 2022-2024, Andreas Kling + * Copyright (c) 2025, Lorenz Ackermann * * SPDX-License-Identifier: BSD-2-Clause */ @@ -146,18 +147,17 @@ void CSSImportRule::fetch() // 4. Let importedStylesheet be the result of parsing byteStream given parsedUrl. // FIXME: Tidy up our parsing API. For now, do the decoding here. - // FIXME: Get the encoding from the response somehow. - auto encoding = "utf-8"sv; - auto maybe_decoder = TextCodec::decoder_for(encoding); - if (!maybe_decoder.has_value()) { - dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Unsupported encoding: {}", parsed_url, encoding); - return; + Optional mime_type_charset; + if (auto extracted_mime_type = response->header_list()->extract_mime_type(); extracted_mime_type.has_value()) { + if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value()) + mime_type_charset = charset.value(); } - auto& decoder = maybe_decoder.release_value(); - - auto decoded_or_error = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(decoder, *byte_stream); + // The environment encoding of an imported style sheet is the encoding of the style sheet that imported it. [css-syntax-3] + // FIXME: Save encoding on Stylesheet to get it here + Optional environment_encoding; + auto decoded_or_error = css_decode_bytes(environment_encoding, mime_type_charset, *byte_stream); if (decoded_or_error.is_error()) { - dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {} Encoding was: {}", parsed_url, encoding); + dbgln_if(CSS_LOADER_DEBUG, "CSSImportRule: Failed to decode CSS file: {}", parsed_url); return; } auto decoded = decoded_or_error.release_value(); diff --git a/Libraries/LibWeb/CSS/Parser/Helpers.cpp b/Libraries/LibWeb/CSS/Parser/Helpers.cpp index cc34e08feba..1a8c31cd9fd 100644 --- a/Libraries/LibWeb/CSS/Parser/Helpers.cpp +++ b/Libraries/LibWeb/CSS/Parser/Helpers.cpp @@ -4,10 +4,12 @@ * Copyright (c) 2021-2024, Sam Atkins * Copyright (c) 2021, Tobias Christiansen * Copyright (c) 2022, MacDue + * Copyright (c) 2025, Lorenz Ackermann * * SPDX-License-Identifier: BSD-2-Clause */ +#include #include #include #include @@ -138,4 +140,75 @@ Vector parse_component_values_list(CSS::Parser::Par return CSS::Parser::Parser::create(parsing_params, string).parse_as_list_of_component_values(); } +// https://drafts.csswg.org/css-syntax/#css-decode-bytes +ErrorOr css_decode_bytes(Optional const& environment_encoding, Optional mime_type_charset, ByteBuffer const& encoded_string) +{ + // https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding + auto determine_the_fallback_encoding = [&mime_type_charset, &environment_encoding, &encoded_string]() -> StringView { + // 1. If HTTP or equivalent protocol provides an encoding label (e.g. via the charset parameter of the Content-Type header) for the stylesheet, + // get an encoding from encoding label. If that does not return failure, return it. + if (mime_type_charset.has_value()) { + if (auto encoding = TextCodec::get_standardized_encoding(mime_type_charset.value()); encoding.has_value()) + return encoding.value(); + } + // 2. Otherwise, check stylesheet’s byte stream. If the first 1024 bytes of the stream begin with the hex sequence + // 40 63 68 61 72 73 65 74 20 22 XX* 22 3B + // where each XX byte is a value between 0x16 and 0x21 inclusive or a value between 0x23 and 0x7F inclusive, + // then get an encoding from a string formed out of the sequence of XX bytes, interpreted as ASCII. + auto check_stylesheets_byte_stream = [&encoded_string]() -> Optional { + size_t scan_length = min(encoded_string.size(), 1024); + auto pattern_start = "@charset \""sv; + auto pattern_end = "\";"sv; + + if (scan_length < pattern_start.length()) + return {}; + + StringView buffer_view = encoded_string.bytes().slice(0, scan_length); + if (!buffer_view.starts_with(pattern_start)) + return {}; + + auto encoding_start = pattern_start.length(); + auto end_index = buffer_view.find(pattern_end, encoding_start); + if (!end_index.has_value()) + return {}; + + size_t encoding_length = end_index.value() - encoding_start; + auto encoding_view = buffer_view.substring_view(encoding_start, encoding_length); + + for (char c : encoding_view) { + u8 byte = static_cast(c); + if ((byte < 0x01 || byte > 0x21) && (byte < 0x23 || byte > 0x7F)) { + return {}; + } + } + + return TextCodec::get_standardized_encoding(encoding_view); + }; + // If the return value was utf-16be or utf-16le, return utf-8; if it was anything else except failure, return it. + auto byte_stream_value = check_stylesheets_byte_stream(); + if (byte_stream_value.has_value() && (byte_stream_value == "UTF-16BE"sv || byte_stream_value == "UTF-16LE")) + return "utf-8"sv; + if (byte_stream_value.has_value()) + return byte_stream_value.value(); + + // 3. Otherwise, if an environment encoding is provided by the referring document, return it. + if (environment_encoding.has_value()) + return environment_encoding.value(); + + // 4. Otherwise, return utf-8. + return "utf-8"sv; + }; + + // 1. Determine the fallback encoding of stylesheet, and let fallback be the result. + auto fallback = determine_the_fallback_encoding(); + auto decoder = TextCodec::decoder_for(fallback); + if (!decoder.has_value()) { + // If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not. + dbgln("FIXME: Style sheet encoding '{}' is not supported yet", fallback); + return Error::from_string_literal("No Decoder found"); + } + // 2. Decode stylesheet’s stream of bytes with fallback encoding fallback, and return the result. + return TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string); +} + } diff --git a/Libraries/LibWeb/CSS/Parser/Parser.h b/Libraries/LibWeb/CSS/Parser/Parser.h index 89ee489a775..29c3ad96337 100644 --- a/Libraries/LibWeb/CSS/Parser/Parser.h +++ b/Libraries/LibWeb/CSS/Parser/Parser.h @@ -602,5 +602,6 @@ Vector> parse_media_query_list(CSS::Parser::Parsi RefPtr parse_css_supports(CSS::Parser::ParsingParams const&, StringView); Vector parse_component_values_list(CSS::Parser::ParsingParams const&, StringView); GC::Ref internal_css_realm(); +ErrorOr css_decode_bytes(Optional const& environment_encoding, Optional mime_type_charset, ByteBuffer const& encoded_string); } diff --git a/Libraries/LibWeb/HTML/HTMLLinkElement.cpp b/Libraries/LibWeb/HTML/HTMLLinkElement.cpp index f170a16dd4c..0905ef6cfdd 100644 --- a/Libraries/LibWeb/HTML/HTMLLinkElement.cpp +++ b/Libraries/LibWeb/HTML/HTMLLinkElement.cpp @@ -422,10 +422,13 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru { // 1. If the resource's Content-Type metadata is not text/css, then set success to false. auto mime_type_string = m_mime_type; - if (!mime_type_string.has_value()) { - auto extracted_mime_type = response.header_list()->extract_mime_type(); - if (extracted_mime_type.has_value()) + Optional mime_type_charset; + auto extracted_mime_type = response.header_list()->extract_mime_type(); + if (extracted_mime_type.has_value()) { + if (!mime_type_string.has_value()) mime_type_string = extracted_mime_type->essence(); + if (auto charset = extracted_mime_type->parameters().get("charset"sv); charset.has_value()) + mime_type_charset = charset.value(); } if (mime_type_string.has_value() && mime_type_string != "text/css"sv) { @@ -469,43 +472,34 @@ void HTMLLinkElement::process_stylesheet_resource(bool success, Fetch::Infrastru // The CSS environment encoding is the result of running the following steps: [CSSSYNTAX] // 1. If the element has a charset attribute, get an encoding from that attribute's value. If that succeeds, return the resulting encoding. [ENCODING] // 2. Otherwise, return the document's character encoding. [DOM] + Optional environment_encoding; + if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value()) { + if (auto environment_encoding = TextCodec::get_standardized_encoding(charset.release_value()); environment_encoding.has_value()) + environment_encoding = environment_encoding.value(); + } + if (!environment_encoding.has_value() && document().encoding().has_value()) + environment_encoding = document().encoding().value(); - Optional encoding; - if (auto charset = attribute(HTML::AttributeNames::charset); charset.has_value()) - encoding = charset.release_value(); - - if (!encoding.has_value()) - encoding = document().encoding_or_default(); - - auto decoder = TextCodec::decoder_for(*encoding); - - if (!decoder.has_value()) { - // If we don't support the encoding yet, let's error out instead of trying to decode it as something it's most likely not. - dbgln("FIXME: Style sheet encoding '{}' is not supported yet", encoding); + auto maybe_decoded_string = css_decode_bytes(environment_encoding, mime_type_charset, body_bytes.get()); + if (maybe_decoded_string.is_error()) { + dbgln("Failed to decode CSS file: {}", response.url().value_or(URL::URL())); dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error)); } else { - auto const& encoded_string = body_bytes.get(); - auto maybe_decoded_string = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, encoded_string); - if (maybe_decoded_string.is_error()) { - dbgln("Style sheet {} claimed to be '{}' but decoding failed", response.url().value_or(URL::URL()), encoding); - dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::error)); - } else { - VERIFY(!response.url_list().is_empty()); - m_loaded_style_sheet = document_or_shadow_root_style_sheets().create_a_css_style_sheet( - maybe_decoded_string.release_value(), - "text/css"_string, - this, - attribute(HTML::AttributeNames::media).value_or({}), - in_a_document_tree() ? attribute(HTML::AttributeNames::title).value_or({}) : String {}, - (m_relationship & Relationship::Alternate && !m_explicitly_enabled) ? CSS::StyleSheetList::Alternate::Yes : CSS::StyleSheetList::Alternate::No, - CSS::StyleSheetList::OriginClean::Yes, - response.url_list().first(), - nullptr, - nullptr); + VERIFY(!response.url_list().is_empty()); + m_loaded_style_sheet = document_or_shadow_root_style_sheets().create_a_css_style_sheet( + maybe_decoded_string.release_value(), + "text/css"_string, + this, + attribute(HTML::AttributeNames::media).value_or({}), + in_a_document_tree() ? attribute(HTML::AttributeNames::title).value_or({}) : String {}, + (m_relationship & Relationship::Alternate && !m_explicitly_enabled) ? CSS::StyleSheetList::Alternate::Yes : CSS::StyleSheetList::Alternate::No, + CSS::StyleSheetList::OriginClean::Yes, + response.url_list().first(), + nullptr, + nullptr); - // 2. Fire an event named load at el. - dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load)); - } + // 2. Fire an event named load at el. + dispatch_event(*DOM::Event::create(realm(), HTML::EventNames::load)); } } // 5. Otherwise, fire an event named error at el. diff --git a/Libraries/LibWeb/HTML/HTMLScriptElement.cpp b/Libraries/LibWeb/HTML/HTMLScriptElement.cpp index d4221b712e8..fad1e73e6f4 100644 --- a/Libraries/LibWeb/HTML/HTMLScriptElement.cpp +++ b/Libraries/LibWeb/HTML/HTMLScriptElement.cpp @@ -242,51 +242,54 @@ void HTMLScriptElement::prepare_script() // then set el's type to "importmap". m_script_type = ScriptType::ImportMap; } - // 12. Otherwise, return. (No script is executed, and el's type is left as null.) + // FIXME: 12. Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules". + // 13. Otherwise, return. (No script is executed, and el's type is left as null.) else { VERIFY(m_script_type == ScriptType::Null); return; } - // 13. If parser document is non-null, then set el's parser document back to parser document and set el's force async to false. + // 14. If parser document is non-null, then set el's parser document back to parser document and set el's force async to false. if (parser_document) { m_parser_document = parser_document; m_force_async = false; } - // 14. Set el's already started to true. + // 15. Set el's already started to true. m_already_started = true; - // 15. Set el's preparation-time document to its node document. + // 16. Set el's preparation-time document to its node document. m_preparation_time_document = &document(); - // 16. If parser document is non-null, and parser document is not equal to el's preparation-time document, then return. + // 17. If parser document is non-null, and parser document is not equal to el's preparation-time document, then return. if (parser_document != nullptr && parser_document != m_preparation_time_document) { dbgln("HTMLScriptElement: Refusing to run script because the parser document is not the same as the preparation time document."); return; } - // 17. If scripting is disabled for el, then return. + // 18. If scripting is disabled for el, then return. if (is_scripting_disabled()) { dbgln("HTMLScriptElement: Refusing to run script because scripting is disabled."); return; } - // 18. If el has a nomodule content attribute and its type is "classic", then return. + // 19. If el has a nomodule content attribute and its type is "classic", then return. if (m_script_type == ScriptType::Classic && has_attribute(HTML::AttributeNames::nomodule)) { dbgln("HTMLScriptElement: Refusing to run classic script because it has the nomodule attribute."); return; } - // 19. If el does not have a src content attribute, and the Should element's inline behavior be blocked by Content Security Policy? - // algorithm returns "Blocked" when given el, "script", and source text, then return. [CSP] + // FIXME: 20. Let cspType be "script speculationrules" if el's type is "speculationrules"; otherwise, "script". + + // 21. If el does not have a src content attribute, and the Should element's inline behavior be blocked by Content + // Security Policy? algorithm returns "Blocked" when given el, cspType, and source text, then return [CSP] if (!has_attribute(AttributeNames::src) && ContentSecurityPolicy::should_elements_inline_type_behavior_be_blocked_by_content_security_policy(realm(), *this, ContentSecurityPolicy::Directives::Directive::InlineType::Script, source_text_utf8) == ContentSecurityPolicy::Directives::Directive::Result::Blocked) { dbgln("HTMLScriptElement: Refusing to run inline script because it violates the Content Security Policy."); return; } - // 20. If el has an event attribute and a for attribute, and el's type is "classic", then: + // 22. If el has an event attribute and a for attribute, and el's type is "classic", then: if (m_script_type == ScriptType::Classic && has_attribute(HTML::AttributeNames::event) && has_attribute(HTML::AttributeNames::for_)) { // 1. Let for be the value of el's' for attribute. auto for_ = get_attribute_value(HTML::AttributeNames::for_); @@ -312,7 +315,7 @@ void HTMLScriptElement::prepare_script() } } - // 21. If el has a charset attribute, then let encoding be the result of getting an encoding from the value of the charset attribute. + // 23. If el has a charset attribute, then let encoding be the result of getting an encoding from the value of the charset attribute. // If el does not have a charset attribute, or if getting an encoding failed, then let encoding be el's node document's the encoding. Optional encoding; @@ -328,34 +331,34 @@ void HTMLScriptElement::prepare_script() VERIFY(encoding.has_value()); - // 22. Let classic script CORS setting be the current state of el's crossorigin content attribute. + // 24. Let classic script CORS setting be the current state of el's crossorigin content attribute. auto classic_script_cors_setting = m_crossorigin; - // 23. Let module script credentials mode be the CORS settings attribute credentials mode for el's crossorigin content attribute. + // 25. Let module script credentials mode be the CORS settings attribute credentials mode for el's crossorigin content attribute. auto module_script_credential_mode = cors_settings_attribute_credentials_mode(m_crossorigin); - // 24. Let cryptographic nonce be el's [[CryptographicNonce]] internal slot's value. + // 26. Let cryptographic nonce be el's [[CryptographicNonce]] internal slot's value. auto cryptographic_nonce = m_cryptographic_nonce; - // 25. If el has an integrity attribute, then let integrity metadata be that attribute's value. + // 27. If el has an integrity attribute, then let integrity metadata be that attribute's value. // Otherwise, let integrity metadata be the empty string. String integrity_metadata; if (auto maybe_integrity = attribute(HTML::AttributeNames::integrity); maybe_integrity.has_value()) { integrity_metadata = *maybe_integrity; } - // 26. Let referrer policy be the current state of el's referrerpolicy content attribute. + // 28. Let referrer policy be the current state of el's referrerpolicy content attribute. auto referrer_policy = m_referrer_policy; - // 27. Let fetch priority be the current state of el's fetchpriority content attribute. + // 29. Let fetch priority be the current state of el's fetchpriority content attribute. auto fetch_priority = Fetch::Infrastructure::request_priority_from_string(get_attribute_value(HTML::AttributeNames::fetchpriority)).value_or(Fetch::Infrastructure::Request::Priority::Auto); - // 28. Let parser metadata be "parser-inserted" if el is parser-inserted, and "not-parser-inserted" otherwise. + // 30. Let parser metadata be "parser-inserted" if el is parser-inserted, and "not-parser-inserted" otherwise. auto parser_metadata = is_parser_inserted() ? Fetch::Infrastructure::Request::ParserMetadata::ParserInserted : Fetch::Infrastructure::Request::ParserMetadata::NotParserInserted; - // 29. Let options be a script fetch options whose cryptographic nonce is cryptographic nonce, + // 31. Let options be a script fetch options whose cryptographic nonce is cryptographic nonce, // integrity metadata is integrity metadata, parser metadata is parser metadata, // credentials mode is module script credentials mode, referrer policy is referrer policy, // and fetch priority is fetch priority. @@ -368,12 +371,13 @@ void HTMLScriptElement::prepare_script() .fetch_priority = move(fetch_priority), }; - // 30. Let settings object be el's node document's relevant settings object. + // 32. Let settings object be el's node document's relevant settings object. auto& settings_object = document().relevant_settings_object(); - // 31. If el has a src content attribute, then: + // 33. If el has a src content attribute, then: if (has_attribute(HTML::AttributeNames::src)) { - // 1. If el's type is "importmap", + // 1. If el's type is "importmap" or "speculationrules", then: + // FIXME: Add "speculationrules" support. if (m_script_type == ScriptType::ImportMap) { // then queue an element task on the DOM manipulation task source given el to fire an event named error at el, and return. queue_an_element_task(HTML::Task::Source::DOMManipulation, [this] { @@ -445,9 +449,9 @@ void HTMLScriptElement::prepare_script() } } - // 32. If el does not have a src content attribute: + // 34. If el does not have a src content attribute: if (!has_attribute(HTML::AttributeNames::src)) { - // Let base URL be el's node document's document base URL. + // 1. Let base URL be el's node document's document base URL. auto base_url = document().base_url(); // 2. Switch on el's type: @@ -466,11 +470,14 @@ void HTMLScriptElement::prepare_script() begin_delaying_document_load_event(*m_preparation_time_document); auto steps = create_on_fetch_script_complete(heap(), [this](auto result) { - // 1. Mark as ready el given result. - if (!result) - mark_as_ready(ResultState::Null {}); - else - mark_as_ready(Result(*result)); + // 1. Queue an element task on the networking task source given el to perform the following steps: + queue_an_element_task(Task::Source::Networking, [this, result = move(result)] { + // 1. Mark as ready el given result. + if (!result) + mark_as_ready(ResultState::Null {}); + else + mark_as_ready(Result(*result)); + }); }); // 2. Fetch an inline module script graph, given source text, base URL, settings object, options, and with the following steps given result: @@ -485,9 +492,10 @@ void HTMLScriptElement::prepare_script() // 2. Mark as ready el given result. mark_as_ready(Result(move(result))); } + // FIXME: -> "speculationrules" } - // 33. If el's type is "classic" and el has a src attribute, or el's type is "module": + // 35. If el's type is "classic" and el has a src attribute, or el's type is "module": if ((m_script_type == ScriptType::Classic && has_attribute(HTML::AttributeNames::src)) || m_script_type == ScriptType::Module) { // 1. Assert: el's result is "uninitialized". // FIXME: I believe this step to be a spec bug, and it should be removed: https://github.com/whatwg/html/issues/8534 @@ -561,7 +569,7 @@ void HTMLScriptElement::prepare_script() } } - // 34. Otherwise: + // 36. Otherwise: else { // 1. Assert: el's result is not "uninitialized". VERIFY(!m_result.has()); diff --git a/Tests/LibRegex/TestRegex.cpp b/Tests/LibRegex/TestRegex.cpp index 50bd974f349..0bbe5f10861 100644 --- a/Tests/LibRegex/TestRegex.cpp +++ b/Tests/LibRegex/TestRegex.cpp @@ -1379,3 +1379,93 @@ TEST_CASE(account_for_opcode_size_calculating_incoming_jump_edges) EXPECT_EQ(result.matches.first().view.to_byte_string(), "aa"sv); } } + +TEST_CASE(backreference_to_undefined_capture_groups) +{ + { + // Test duplicate named groups in alternatives where backreference refers to participating group + Regex re("(?:(?a)|(?b))\\k"sv); + auto result = re.match("bb"sv); + + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.matches.size(), 1u); + EXPECT_EQ(result.matches.first().view.to_byte_string(), "bb"sv); + EXPECT_EQ(result.capture_group_matches.first().size(), 2u); + EXPECT(result.capture_group_matches.first()[0].view.is_null()); + EXPECT_EQ(result.capture_group_matches.first()[1].view.to_byte_string(), "b"sv); + } + + { + // Test duplicate named groups with quantifier + Regex re("(?:(?:(?a)|(?b))\\k){2}"sv); + auto result = re.match("aabb"sv); + + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.matches.size(), 1u); + EXPECT_EQ(result.matches.first().view.to_byte_string(), "aabb"sv); + EXPECT_EQ(result.capture_group_matches.first().size(), 2u); + EXPECT(result.capture_group_matches.first()[0].view.is_null()); + EXPECT_EQ(result.capture_group_matches.first()[1].view.to_byte_string(), "b"sv); + } + + { + // Test that first alternative works too + Regex re("(?:(?a)|(?b))\\k"sv); + auto result = re.match("aa"sv); + + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.matches.size(), 1u); + EXPECT_EQ(result.matches.first().view.to_byte_string(), "aa"sv); + EXPECT_EQ(result.capture_group_matches.first().size(), 2u); + EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "a"sv); + EXPECT(result.capture_group_matches.first()[1].view.is_null()); + } + + { + // Test numbered backreference to undefined group + Regex re("(.*?)a(?!(a+)b\\2c)\\2(.*)"sv); + auto result = re.match("baaabaac"sv); + + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.matches.size(), 1u); + EXPECT_EQ(result.matches.first().view.to_byte_string(), "baaabaac"sv); + EXPECT_EQ(result.capture_group_matches.first().size(), 3u); + EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "ba"sv); + EXPECT(result.capture_group_matches.first()[1].view.is_null()); + EXPECT_EQ(result.capture_group_matches.first()[2].view.to_byte_string(), "abaac"sv); + } + + { + Regex re("^(?:(?x)|(?y)|z)\\k$"sv); + + // Third alternative matches and backreference is undefined + auto result1 = re.match("z"sv); + EXPECT_EQ(result1.success, true); + EXPECT_EQ(result1.matches.size(), 1u); + EXPECT_EQ(result1.matches.first().view.to_byte_string(), "z"sv); + EXPECT_EQ(result1.capture_group_matches.first().size(), 2u); + EXPECT(result1.capture_group_matches.first()[0].view.is_null()); + EXPECT(result1.capture_group_matches.first()[1].view.is_null()); + } + + { + // Quantified version of the above pattern + Regex re("^(?:(?x)|(?y)|z){2}\\k$"sv); + + auto result1 = re.match("xz"sv); + EXPECT_EQ(result1.success, true); + EXPECT_EQ(result1.matches.size(), 1u); + EXPECT_EQ(result1.matches.first().view.to_byte_string(), "xz"sv); + EXPECT_EQ(result1.capture_group_matches.first().size(), 2u); + EXPECT(result1.capture_group_matches.first()[0].view.is_null()); + EXPECT(result1.capture_group_matches.first()[1].view.is_null()); + + auto result2 = re.match("yz"sv); + EXPECT_EQ(result2.success, true); + EXPECT_EQ(result2.matches.size(), 1u); + EXPECT_EQ(result2.matches.first().view.to_byte_string(), "yz"sv); + EXPECT_EQ(result2.capture_group_matches.first().size(), 2u); + EXPECT(result2.capture_group_matches.first()[0].view.is_null()); + EXPECT(result2.capture_group_matches.first()[1].view.is_null()); + } +} diff --git a/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/at-charset-077.xht b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/at-charset-077.xht new file mode 100644 index 00000000000..141b7b567f1 --- /dev/null +++ b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/at-charset-077.xht @@ -0,0 +1,19 @@ + + + + + CSS Test: Stylesheet encodings: KOI8-R + + + + + + + + + +

This should have a green background.

+ + diff --git a/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/support/at-charset-077.css b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/support/at-charset-077.css new file mode 100644 index 00000000000..9a978061caf --- /dev/null +++ b/Tests/LibWeb/Ref/input/wpt-import/css/CSS2/syntax/support/at-charset-077.css @@ -0,0 +1,2 @@ +@charset "koi8-r"; +.tést { color: white; background: green; } \ No newline at end of file diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/semantics/scripting-1/the-script-element/execution-timing/non-external-no-import.txt b/Tests/LibWeb/Text/expected/wpt-import/html/semantics/scripting-1/the-script-element/execution-timing/non-external-no-import.txt new file mode 100644 index 00000000000..5558893d7be --- /dev/null +++ b/Tests/LibWeb/Text/expected/wpt-import/html/semantics/scripting-1/the-script-element/execution-timing/non-external-no-import.txt @@ -0,0 +1,6 @@ +Harness status: OK + +Found 1 tests + +1 Pass +Pass Module scripts with no imports always execute asynchronously \ No newline at end of file diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.txt b/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.txt new file mode 100644 index 00000000000..5c1440a139a --- /dev/null +++ b/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.txt @@ -0,0 +1,6 @@ +Harness status: OK + +Found 1 tests + +1 Pass +Pass The character encoding of the page can be set by a meta element with charset attribute. \ No newline at end of file diff --git a/Tests/LibWeb/Text/input/wpt-import/html/semantics/scripting-1/the-script-element/execution-timing/non-external-no-import.html b/Tests/LibWeb/Text/input/wpt-import/html/semantics/scripting-1/the-script-element/execution-timing/non-external-no-import.html new file mode 100644 index 00000000000..5d801fb815f --- /dev/null +++ b/Tests/LibWeb/Text/input/wpt-import/html/semantics/scripting-1/the-script-element/execution-timing/non-external-no-import.html @@ -0,0 +1,27 @@ + + + + Module scripts with no imports always execute asynchronously + + + + + + + + diff --git a/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/support/encodingtests-15.css b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/support/encodingtests-15.css new file mode 100644 index 00000000000..ec907a1a94e --- /dev/null +++ b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/support/encodingtests-15.css @@ -0,0 +1,4 @@ +@charset "utf-8"; +.test div.ÜÀÚ { + width: 100px; +} diff --git a/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.html b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.html new file mode 100644 index 00000000000..93e83b7cf62 --- /dev/null +++ b/Tests/LibWeb/Text/input/wpt-import/html/syntax/parsing-html-fragments/the-input-byte-stream-009.html @@ -0,0 +1,37 @@ + + + + meta charset attribute + + + + + + + + + + + + +
 
+ + + + + +
+ + +