/* * Copyright (c) 2020, Emanuel Sprung * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once #include "RegexBytecodeStreamOptimizer.h" #include "RegexMatch.h" #include #include #include #include #include #include #include #include #include #include #include namespace regex { using ByteCodeValueType = u64; #define ENUMERATE_OPCODES \ __ENUMERATE_OPCODE(Compare) \ __ENUMERATE_OPCODE(Jump) \ __ENUMERATE_OPCODE(JumpNonEmpty) \ __ENUMERATE_OPCODE(ForkJump) \ __ENUMERATE_OPCODE(ForkStay) \ __ENUMERATE_OPCODE(ForkReplaceJump) \ __ENUMERATE_OPCODE(ForkReplaceStay) \ __ENUMERATE_OPCODE(ForkIf) \ __ENUMERATE_OPCODE(FailForks) \ __ENUMERATE_OPCODE(PopSaved) \ __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ __ENUMERATE_OPCODE(RSeekTo) \ __ENUMERATE_OPCODE(CheckBegin) \ __ENUMERATE_OPCODE(CheckEnd) \ __ENUMERATE_OPCODE(CheckBoundary) \ __ENUMERATE_OPCODE(Save) \ __ENUMERATE_OPCODE(Restore) \ __ENUMERATE_OPCODE(GoBack) \ __ENUMERATE_OPCODE(SetStepBack) \ __ENUMERATE_OPCODE(IncStepBack) \ __ENUMERATE_OPCODE(CheckStepBack) \ __ENUMERATE_OPCODE(CheckSavedPosition) \ __ENUMERATE_OPCODE(ClearCaptureGroup) \ __ENUMERATE_OPCODE(FailIfEmpty) \ __ENUMERATE_OPCODE(Repeat) \ __ENUMERATE_OPCODE(ResetRepeat) \ __ENUMERATE_OPCODE(Checkpoint) \ __ENUMERATE_OPCODE(CompareSimple) \ __ENUMERATE_OPCODE(SaveModifiers) \ __ENUMERATE_OPCODE(RestoreModifiers) \ __ENUMERATE_OPCODE(Exit) // clang-format off enum class OpCodeId : ByteCodeValueType { #define __ENUMERATE_OPCODE(x) x, ENUMERATE_OPCODES #undef __ENUMERATE_OPCODE First = Compare, Last = Exit, }; // clang-format on #define ENUMERATE_CHARACTER_COMPARE_TYPES \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \ 
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(And) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Subtract) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(StringSet) enum class CharacterCompareType : ByteCodeValueType { #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x, ENUMERATE_CHARACTER_COMPARE_TYPES #undef __ENUMERATE_CHARACTER_COMPARE_TYPE }; #define ENUMERATE_CHARACTER_CLASSES \ __ENUMERATE_CHARACTER_CLASS(Alnum) \ __ENUMERATE_CHARACTER_CLASS(Cntrl) \ __ENUMERATE_CHARACTER_CLASS(Lower) \ __ENUMERATE_CHARACTER_CLASS(Space) \ __ENUMERATE_CHARACTER_CLASS(Alpha) \ __ENUMERATE_CHARACTER_CLASS(Digit) \ __ENUMERATE_CHARACTER_CLASS(Print) \ __ENUMERATE_CHARACTER_CLASS(Upper) \ __ENUMERATE_CHARACTER_CLASS(Blank) \ __ENUMERATE_CHARACTER_CLASS(Graph) \ __ENUMERATE_CHARACTER_CLASS(Punct) \ __ENUMERATE_CHARACTER_CLASS(Word) \ __ENUMERATE_CHARACTER_CLASS(Xdigit) enum class CharClass : ByteCodeValueType { #define __ENUMERATE_CHARACTER_CLASS(x) x, ENUMERATE_CHARACTER_CLASSES #undef __ENUMERATE_CHARACTER_CLASS }; #define ENUMERATE_BOUNDARY_CHECK_TYPES \ __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \ __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord) enum class BoundaryCheckType : ByteCodeValueType { #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x, ENUMERATE_BOUNDARY_CHECK_TYPES #undef __ENUMERATE_BOUNDARY_CHECK_TYPE }; #define ENUMERATE_FORK_IF_CONDITIONS \ __ENUMERATE_FORK_IF_CONDITION(AtStartOfLine) \ __ENUMERATE_FORK_IF_CONDITION(Invalid) /* 
Must be last */ enum class ForkIfCondition : ByteCodeValueType { #define __ENUMERATE_FORK_IF_CONDITION(x) x, ENUMERATE_FORK_IF_CONDITIONS #undef __ENUMERATE_FORK_IF_CONDITION }; struct CharRange { u32 from; u32 to; CharRange(u64 value) : from(value >> 32) , to(value & 0xffffffff) { } CharRange(u32 from, u32 to) : from(from) , to(to) { } operator ByteCodeValueType() const { return ((u64)from << 32) | to; } }; struct CompareTypeAndValuePair { CharacterCompareType type; ByteCodeValueType value; }; REGEX_API extern u32 s_next_string_table_serial; template struct StringTable { StringTable() : m_serial(s_next_string_table_serial++) { } ~StringTable() { if (m_serial != 0) { if (m_serial == s_next_string_table_serial - 1 && m_table.is_empty()) --s_next_string_table_serial; // We didn't use this serial, put it back. } } StringTable(StringTable const& other) { // Pull a new serial for this copy m_serial = s_next_string_table_serial++; m_table = other.m_table; m_inverse_table = other.m_inverse_table; } StringTable(StringTable&& other) { m_serial = other.m_serial; m_table = move(other.m_table); m_inverse_table = move(other.m_inverse_table); // Clear other's data to avoid double-deletion of serial other.m_serial = 0; } StringTable& operator=(StringTable const& other) { if (this != &other) { m_serial = s_next_string_table_serial++; m_table = other.m_table; m_inverse_table = other.m_inverse_table; } return *this; } StringTable& operator=(StringTable&& other) { if (this != &other) { m_serial = other.m_serial; m_table = move(other.m_table); m_inverse_table = move(other.m_inverse_table); // Clear other's data to avoid double-deletion of serial other.m_serial = 0; } return *this; } ByteCodeValueType set(StringType string) { u32 local_index = m_table.size() + 0x4242; ByteCodeValueType global_index; if (auto maybe_local_index = m_table.get(string); maybe_local_index.has_value()) { local_index = maybe_local_index.value(); global_index = static_cast(m_serial) << 32 | 
static_cast(local_index); } else { global_index = static_cast(m_serial) << 32 | static_cast(local_index); m_table.set(string, global_index); m_inverse_table.set(global_index, string); } return global_index; } StringType get(ByteCodeValueType index) const { return m_inverse_table.get(index).value(); } u32 m_serial { 0 }; HashMap m_table; HashMap m_inverse_table; }; using StringSetTrie = Trie; struct REGEX_API StringSetTable { StringSetTable(); ~StringSetTable(); StringSetTable(StringSetTable const& other); StringSetTable(StringSetTable&&) = default; StringSetTable& operator=(StringSetTable const& other); StringSetTable& operator=(StringSetTable&&) = default; ByteCodeValueType set(Vector const& strings) { u32 local_index = m_u8_tries.size(); ByteCodeValueType global_index = static_cast(m_serial) << 32 | static_cast(local_index); StringSetTrie u8_trie { 0, false }; StringSetTrie u16_trie { 0, false }; for (auto const& str : strings) { Vector code_points; Utf8View utf8_view { str.bytes_as_string_view() }; for (auto code_point : utf8_view) code_points.append(code_point); (void)u8_trie.insert(code_points.begin(), code_points.end(), true, [](auto&, auto) { return false; }); auto utf16_string = Utf16String::from_utf32({ code_points.data(), code_points.size() }); Vector u16_code_units; auto utf16_view = utf16_string.utf16_view(); for (size_t i = 0; i < utf16_view.length_in_code_units(); i++) { auto code_unit = utf16_view.code_unit_at(i); u16_code_units.append(code_unit); } (void)u16_trie.insert(u16_code_units.begin(), u16_code_units.end(), true, [](auto&, auto) { return false; }); } m_u8_tries.set(global_index, move(u8_trie)); m_u16_tries.set(global_index, move(u16_trie)); return global_index; } StringSetTrie const& get_u8_trie(ByteCodeValueType index) const { return m_u8_tries.get(index).value(); } StringSetTrie const& get_u16_trie(ByteCodeValueType index) const { return m_u16_tries.get(index).value(); } u32 m_serial { 0 }; HashMap m_u8_tries; HashMap m_u16_tries; }; 
// State shared by ByteCode and FlatByteCode: the two string tables, the string-set table and
// the mapping from group index to the string-table index of the group's name.
struct ByteCodeBase {
    FlyString get_string(size_t index) const { return m_string_table.get(index); }
    auto const& string_table() const { return m_string_table; }

    auto get_u16_string(size_t index) const { return m_u16_string_table.get(index); }
    auto const& u16_string_table() const { return m_u16_string_table; }

    auto const& string_set_table() const { return m_string_set_table; }
    auto& string_set_table() { return m_string_set_table; }

    // Looks up `group_index` in m_group_name_mappings.
    Optional get_group_name_index(size_t group_index) const
    {
        return m_group_name_mappings.get(group_index);
    }

protected:
    StringTable m_string_table;
    StringTable m_u16_string_table;
    StringSetTable m_string_set_table;
    HashMap m_group_name_mappings;
};

// Bytecode under construction: a DisjointChunks container plus the tables of ByteCodeBase.
// The insert_bytecode_* / transform_bytecode_* helpers emit opcode and argument values into it.
class REGEX_API ByteCode : public ByteCodeBase
    , public DisjointChunks {
    using Base = DisjointChunks;
    friend class FlatByteCode;

public:
    using Base::append;

    ByteCode()
    {
        ensure_opcodes_initialized();
    }

    ByteCode(ByteCode const&) = default;
    ByteCode(ByteCode&&) = default;
    // Construction and assignment from a bare Base are disabled.
    ByteCode(Base&&) = delete;
    ByteCode(Base const&) = delete;

    ~ByteCode() = default;

    ByteCode& operator=(ByteCode const&) = default;
    ByteCode& operator=(ByteCode&&) = default;
    ByteCode& operator=(Base&& value) = delete;
    ByteCode& operator=(Base const& value) = delete;

    // Appends `other`'s chunks after first merging its tables into this object's tables.
    void extend(ByteCode&& other)
    {
        merge_string_tables_from({ &other, 1 });
        Base::extend(move(other));
    }

    void extend(ByteCode const& other)
    {
        merge_string_tables_from({ &other, 1 });
        Base::extend(other);
    }

    // Appends `other` as a chunk; no table merging takes place in this overload.
    template> T>
    void extend(T other)
    {
        Base::append(move(other));
    }

    // The helpers below operate on the last (or first) chunk and create an empty chunk first
    // when the container is empty.
    template
    void empend(Args&&... args)
    {
        if (is_empty())
            Base::append({});
        Base::last_chunk().empend(forward(args)...);
    }

    template
    void append(T&& value)
    {
        if (is_empty())
            Base::append({});
        Base::last_chunk().append(forward(value));
    }

    template
    void prepend(T&& value)
    {
        if (is_empty())
            return append(forward(value));
        Base::first_chunk().prepend(forward(value));
    }

    // Appends all values of `value` to the last chunk after reserving capacity for them.
    void append(Span value)
    {
        if (is_empty())
            Base::append({});
        auto& last = Base::last_chunk();
        last.ensure_capacity(value.size());
        for (auto v : value)
            last.unchecked_append(v);
    }

    void ensure_capacity(size_t capacity)
    {
        if (is_empty())
            Base::append({});
        Base::last_chunk().ensure_capacity(capacity);
    }

    // Deleted so that Base's chunk accessors are not reachable through ByteCode.
    void last_chunk() const = delete;
    void first_chunk() const = delete;

    // Merges the string tables, string-set tries and group-name mappings of every element of
    // `others` into this object's tables.
    void merge_string_tables_from(Span others)
    {
        for (auto const& other : others) {
            for (auto const& entry : other.m_string_table.m_table) {
                // NOTE(review): if HashMap::set() overwrites the value of an existing key, the
                // equality check below is always true and the clash branch can never run —
                // confirm set()'s behaviour for existing keys.
                auto const result = m_string_table.m_inverse_table.set(entry.value, entry.key);
                if (result != HashSetResult::InsertedNewEntry) {
                    if (m_string_table.m_inverse_table.get(entry.value) == entry.key) // Already in inverse table.
                        continue;
                    dbgln("StringTable: Detected ID clash in string tables! ID {} seems to be reused", entry.value);
                    dbgln("Old: {}, New: {}", m_string_table.m_inverse_table.get(entry.value), entry.key);
                    VERIFY_NOT_REACHED();
                }
                m_string_table.m_table.set(entry.key, entry.value);
            }
            m_string_table.m_inverse_table.update(other.m_string_table.m_inverse_table);

            // Same procedure for the UTF-16 string table.
            for (auto const& entry : other.m_u16_string_table.m_table) {
                auto const result = m_u16_string_table.m_inverse_table.set(entry.value, entry.key);
                if (result != HashSetResult::InsertedNewEntry) {
                    if (m_u16_string_table.m_inverse_table.get(entry.value) == entry.key) // Already in inverse table.
                        continue;
                    dbgln("StringTable: Detected ID clash in string tables! ID {} seems to be reused", entry.value);
                    dbgln("Old: {}, New: {}", m_u16_string_table.m_inverse_table.get(entry.value), entry.key);
                    VERIFY_NOT_REACHED();
                }
                m_u16_string_table.m_table.set(entry.key, entry.value);
            }
            m_u16_string_table.m_inverse_table.update(other.m_u16_string_table.m_inverse_table);

            // Tries are deep-copied into this object's string-set table under their original keys.
            for (auto const& entry : other.m_string_set_table.m_u8_tries) {
                m_string_set_table.m_u8_tries.set(entry.key, MUST(const_cast(entry.value).deep_copy()));
            }
            for (auto const& entry : other.m_string_set_table.m_u16_tries) {
                m_string_set_table.m_u16_tries.set(entry.key, MUST(const_cast(entry.value).deep_copy()));
            }

            for (auto const& mapping : other.m_group_name_mappings) {
                m_group_name_mappings.set(mapping.key, mapping.value);
            }
        }
    }

    // Delegates to Optimizer::append_character_class.
    void insert_bytecode_compare_values(Vector&& pairs)
    {
        Optimizer::append_character_class(*this, move(pairs));
    }

    // Emits: CheckBoundary <type> (built in a temporary ByteCode and appended via extend()).
    void insert_bytecode_check_boundary(BoundaryCheckType type)
    {
        ByteCode bytecode;
        bytecode.empend((ByteCodeValueType)OpCodeId::CheckBoundary);
        bytecode.empend((ByteCodeValueType)type);

        extend(move(bytecode));
    }

    // Emits: ClearCaptureGroup <index>
    void insert_bytecode_clear_capture_group(size_t index)
    {
        empend(static_cast(OpCodeId::ClearCaptureGroup));
        empend(index);
    }

    // Emits a Compare instruction with a single String argument; the string is registered in
    // m_u16_string_table and referenced by the returned index.
    void insert_bytecode_compare_string(Utf16FlyString string)
    {
        empend(static_cast(OpCodeId::Compare));
        empend(static_cast(1)); // number of arguments
        empend(static_cast(2)); // size of arguments
        empend(static_cast(CharacterCompareType::String));
        auto index = m_u16_string_table.set(move(string));
        empend(index);
    }

    // Emits: SaveLeftCaptureGroup <capture_groups_count>
    void insert_bytecode_group_capture_left(size_t capture_groups_count)
    {
        empend(static_cast(OpCodeId::SaveLeftCaptureGroup));
        empend(capture_groups_count);
    }

    // Emits: SaveRightCaptureGroup <capture_groups_count>
    void insert_bytecode_group_capture_right(size_t capture_groups_count)
    {
        empend(static_cast(OpCodeId::SaveRightCaptureGroup));
        empend(capture_groups_count);
    }

    // Emits: SaveRightNamedCaptureGroup <name index> <capture_groups_count>, and records the
    // name index for group (capture_groups_count - 1) in m_group_name_mappings.
    void insert_bytecode_group_capture_right(size_t capture_groups_count, FlyString name)
    {
        empend(static_cast(OpCodeId::SaveRightNamedCaptureGroup));
        auto name_string_index =
m_string_table.set(move(name));
        empend(name_string_index);
        empend(capture_groups_count);
        m_group_name_mappings.set(capture_groups_count - 1, name_string_index);
    }

    // Emits: SaveModifiers <new_modifiers>
    void insert_bytecode_save_modifiers(FlagsUnderlyingType new_modifiers)
    {
        empend(static_cast(OpCodeId::SaveModifiers));
        empend(static_cast(new_modifiers));
    }

    // Emits: RestoreModifiers
    void insert_bytecode_restore_modifiers()
    {
        empend(static_cast(OpCodeId::RestoreModifiers));
    }

    // The four lookaround flavours handled by insert_bytecode_lookaround().
    enum class LookAroundType {
        LookAhead,
        LookBehind,
        NegatedLookAhead,
        NegatedLookBehind,
    };

    // Emits the bytecode for a lookaround assertion around `lookaround_body`.
    //
    // lookaround_body:   bytecode of the assertion's body; consumed by this call.
    // type:              which of the four layouts (documented per case below) to emit.
    // match_length:      only read by the look-behind variants: SetStepBack receives
    //                    match_length - 1 and GoBack receives match_length.
    // greedy_lookaround: only read by LookBehind; when true an additional ForkJump is emitted
    //                    after the body.
    //
    // All jump offsets emitted here are hard-coded against the exact instruction sequence, so
    // the order and number of empend() calls must not change.
    void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0, bool greedy_lookaround = true)
    {
        // FIXME: The save stack will grow infinitely with repeated failures
        //        as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
        switch (type) {
        case LookAroundType::LookAhead: {
            // SAVE
            // FORKJUMP _BODY
            // POPSAVED
            // LABEL _BODY
            // REGEXP BODY
            // RESTORE
            empend((ByteCodeValueType)OpCodeId::Save);
            empend((ByteCodeValueType)OpCodeId::ForkJump);
            empend((ByteCodeValueType)1);
            empend((ByteCodeValueType)OpCodeId::PopSaved);
            extend(move(lookaround_body));
            empend((ByteCodeValueType)OpCodeId::Restore);
            return;
        }
        case LookAroundType::NegatedLookAhead: {
            // JUMP _A
            // LABEL _L
            // REGEXP BODY
            // FAIL
            // LABEL _A
            // SAVE
            // FORKJUMP _L
            // RESTORE
            auto body_length = lookaround_body.size();
            empend((ByteCodeValueType)OpCodeId::Jump);
            empend((ByteCodeValueType)body_length + 1); // JUMP to label _A
            extend(move(lookaround_body));
            empend((ByteCodeValueType)OpCodeId::FailForks);
            empend((ByteCodeValueType)OpCodeId::Save);
            empend((ByteCodeValueType)OpCodeId::ForkJump);
            empend((ByteCodeValueType) - (body_length + 4)); // JUMP to label _L
            empend((ByteCodeValueType)OpCodeId::Restore);
            return;
        }
        case LookAroundType::LookBehind: {
            // SAVE
            // SET_STEPBACK match_length(BODY)-1
            // LABEL _START
            // INC_STEPBACK
            // FORK_JUMP _BODY
            // CHECK_STEPBACK
            // JUMP _START
            // LABEL _BODY
            // REGEX BODY
            // CHECK_SAVED_POSITION
            // RESTORE
            auto body_length = lookaround_body.size();
            empend((ByteCodeValueType)OpCodeId::Save);
            empend((ByteCodeValueType)OpCodeId::SetStepBack);
            // NOTE(review): with match_length == 0 (the default) this wraps around to the largest
            // u64 value; presumably the consumer reads it back as a signed -1 — confirm.
            empend((ByteCodeValueType)match_length - 1);
            empend((ByteCodeValueType)OpCodeId::IncStepBack);
            empend((ByteCodeValueType)OpCodeId::ForkJump);
            empend((ByteCodeValueType)1 + 2); // JUMP to label _BODY
            empend((ByteCodeValueType)OpCodeId::CheckStepBack);
            empend((ByteCodeValueType)OpCodeId::Jump);
            empend((ByteCodeValueType)-6); // JUMP to label _START
            extend(move(lookaround_body));
            if (greedy_lookaround) {
                // Extra backwards fork that is not part of the layout sketched above.
                empend((ByteCodeValueType)OpCodeId::ForkJump);
                empend((ByteCodeValueType)(0 - 2 - body_length - 6));
            }
            empend((ByteCodeValueType)OpCodeId::CheckSavedPosition);
            empend((ByteCodeValueType)OpCodeId::Restore);
            return;
        }
        case LookAroundType::NegatedLookBehind: {
            // JUMP _A
            // LABEL _L
            // GOBACK match_length(BODY)
            // REGEXP BODY
            // FAIL
            // LABEL _A
            // SAVE
            // FORKJUMP _L
            // RESTORE
            auto body_length = lookaround_body.size();
            empend((ByteCodeValueType)OpCodeId::Jump);
            empend((ByteCodeValueType)body_length + 3); // JUMP to label _A
            empend((ByteCodeValueType)OpCodeId::GoBack);
            empend((ByteCodeValueType)match_length);
            extend(move(lookaround_body));
            empend((ByteCodeValueType)OpCodeId::FailForks);
            empend((ByteCodeValueType)OpCodeId::Save);
            empend((ByteCodeValueType)OpCodeId::ForkJump);
            empend((ByteCodeValueType) - (body_length + 6)); // JUMP to label _L
            empend((ByteCodeValueType)OpCodeId::Restore);
            return;
        }
        }

        // Every enumerator returns from inside the switch.
        VERIFY_NOT_REACHED();
    }

    // Emits an alternation of `left` and `right`; the actual emission is delegated to the Optimizer.
    void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
    {

        // FORKJUMP _ALT
        // REGEXP ALT2
        // JUMP  _END
        // LABEL _ALT
        // REGEXP ALT1
        // LABEL _END

        // Optimisation: Eliminate extra work by unifying common pre-and-postfix exprs.
Optimizer::append_alternation(*this, move(left), move(right));
    }

    // Rewrites `bytecode_to_repeat` in place so that it repeats between `minimum` and `maximum`
    // times (no upper bound when `maximum` is empty).
    //
    // The {0,}, {1,} and {0,1} shapes are delegated to transform_bytecode_repetition_any,
    // _min_one and _zero_or_one respectively. Everything else starts with the mandatory
    // `minimum` iterations (insert_bytecode_repetition_n) followed by the optional part.
    // `greedy` selects which fork opcode is used for the optional part.
    // min_repetition_mark_id / max_repetition_mark_id are passed through as the last argument
    // of the emitted Repeat instructions (and of ResetRepeat for the max one).
    template
    static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true)
    {
        if (!maximum.has_value()) {
            if (minimum == 0)
                return transform_bytecode_repetition_any(bytecode_to_repeat, greedy);
            if (minimum == 1)
                return transform_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
        }

        if (minimum == 0 && maximum.has_value() && maximum.value() == 1) {
            return transform_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy);
        }

        ByteCode new_bytecode;
        new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id);

        if (maximum.has_value()) {
            // (REPEAT REGEXP MIN)
            // LABEL _MAX_LOOP            |
            // FORK END                   |
            // CHECKPOINT (if min==0)     |
            // REGEXP                     |
            // FAILIFEMPTY (if min==0)    |
            // REPEAT _MAX_LOOP MAX-MIN   | if max > min
            // FORK END                   |
            // CHECKPOINT (if min==0)     |
            // REGEXP                     |
            // FAILIFEMPTY (if min==0)    |
            // LABEL END                  |
            // RESET _MAX_LOOP            |
            auto jump_kind = static_cast(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
            if (maximum.value() > minimum) {
                new_bytecode.empend(jump_kind);
                new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
                // Index just past the placeholder; the placeholder itself is at index - 1.
                auto pre_loop_fork_jump_index = new_bytecode.size();

                // A checkpoint id is only allocated when the body is guarded (minimum == 0).
                auto checkpoint1 = minimum == 0 ? s_next_checkpoint_serial_id++ : 0;
                if (minimum == 0) {
                    new_bytecode.empend(static_cast(OpCodeId::Checkpoint));
                    new_bytecode.empend(static_cast(checkpoint1));
                }

                new_bytecode.extend(bytecode_to_repeat);

                if (minimum == 0) {
                    new_bytecode.empend(static_cast(OpCodeId::FailIfEmpty));
                    new_bytecode.empend(checkpoint1);
                }

                auto repetitions = maximum.value() - minimum;
                auto fork_jump_address = new_bytecode.size();
                if (repetitions > 1) {
                    auto repeated_bytecode_size = bytecode_to_repeat.size();
                    if (minimum == 0)
                        repeated_bytecode_size += 4; // Checkpoint + FailIfEmpty

                    new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat);
                    new_bytecode.empend(repeated_bytecode_size + 2);
                    new_bytecode.empend(static_cast(repetitions - 1));
                    new_bytecode.empend(max_repetition_mark_id);
                    new_bytecode.empend(jump_kind);
                    new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
                    auto post_loop_fork_jump_index = new_bytecode.size();

                    auto checkpoint2 = minimum == 0 ? s_next_checkpoint_serial_id++ : 0;
                    if (minimum == 0) {
                        new_bytecode.empend(static_cast(OpCodeId::Checkpoint));
                        new_bytecode.empend(static_cast(checkpoint2));
                    }

                    new_bytecode.extend(bytecode_to_repeat);

                    if (minimum == 0) {
                        new_bytecode.empend(static_cast(OpCodeId::FailIfEmpty));
                        new_bytecode.empend(checkpoint2);
                    }

                    // Both forks target the position reached here (just before ResetRepeat).
                    fork_jump_address = new_bytecode.size();

                    // Back-patch the second fork's placeholder with its relative offset.
                    new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index);

                    new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat);
                    new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id);
                }
                // Back-patch the first fork's placeholder with its relative offset.
                new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index);
            }
        } else {
            // no maximum value set, repeat finding if possible:
            // (REPEAT REGEXP MIN)
            // LABEL _START
            // CHECKPOINT _C
            // REGEXP
            // JUMP_NONEMPTY _C _START FORK

            // Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n)
            auto checkpoint = s_next_checkpoint_serial_id++;
            // The two inserts place "Checkpoint, <checkpoint>" directly in front of the trailing
            // copy of the body: the first insert grows the bytecode by one, which moves the
            // second insert position to just after the opcode.
            new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint);
            new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)checkpoint);

            auto jump_kind = static_cast(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay);
            new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
            new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 2); // Jump to the last iteration
            new_bytecode.empend(checkpoint);                         // if _C is not empty.
            new_bytecode.empend(jump_kind);
        }

        bytecode_to_repeat = move(new_bytecode);
    }

    // Appends `n` mandatory repetitions of `bytecode_to_repeat` to this bytecode. Nothing is
    // emitted for n == 0; for n == 1 a single copy of the body is appended.
    template
    void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id)
    {
        // LABEL _LOOP
        // REGEXP
        // REPEAT _LOOP N-1
        // REGEXP
        if (n == 0)
            return;

        // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the
        // REPEAT instruction forcing another loop.
        extend(bytecode_to_repeat);

        if (n > 1) {
            empend(static_cast(OpCodeId::Repeat));
            empend(bytecode_to_repeat.size());
            empend(static_cast(n - 1));
            empend(repetition_mark_id);

            extend(bytecode_to_repeat);
        }
    }

    // Rewrites `bytecode_to_repeat` in place into a one-or-more loop; `greedy` selects the fork
    // kind stored as the last argument of the JumpNonEmpty instruction.
    static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
    {
        // LABEL _START = -bytecode_to_repeat.size()
        // CHECKPOINT _C
        // REGEXP
        // JUMP_NONEMPTY _C _START FORKSTAY (FORKJUMP -> Greedy)

        auto checkpoint = s_next_checkpoint_serial_id++;
        // Prepended in reverse so that the result starts with "Checkpoint, <checkpoint>".
        bytecode_to_repeat.prepend((ByteCodeValueType)checkpoint);
        bytecode_to_repeat.prepend((ByteCodeValueType)OpCodeId::Checkpoint);

        bytecode_to_repeat.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
        bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 3); // Jump to the _START label...
bytecode_to_repeat.empend(checkpoint); // ...if _C is not empty

        if (greedy)
            bytecode_to_repeat.empend(static_cast(OpCodeId::ForkJump));
        else
            bytecode_to_repeat.empend(static_cast(OpCodeId::ForkStay));
    }

    // Rewrites `bytecode_to_repeat` in place into a zero-or-more loop; `greedy` selects the
    // kind of the leading fork (ForkStay when greedy, ForkJump otherwise).
    static void transform_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
    {
        // LABEL _START
        // FORKJUMP _END  (FORKSTAY -> Greedy)
        // CHECKPOINT _C
        // REGEXP
        // FAILIFEMPTY _C
        // JUMP_NONEMPTY _C _START JUMP
        // LABEL _END

        // LABEL _START = m_bytes.size();
        ByteCode bytecode;

        if (greedy)
            bytecode.empend(static_cast(OpCodeId::ForkStay));
        else
            bytecode.empend(static_cast(OpCodeId::ForkJump));

        // Offset skips Checkpoint (2 values), the body, FailIfEmpty (2) and JumpNonEmpty (4).
        bytecode.empend(bytecode_to_repeat.size() + 2 + 4 + 2); // Jump to the _END label

        auto checkpoint = s_next_checkpoint_serial_id++;
        bytecode.empend(static_cast(OpCodeId::Checkpoint));
        bytecode.empend(static_cast(checkpoint));

        bytecode.extend(bytecode_to_repeat);

        bytecode.empend(static_cast(OpCodeId::FailIfEmpty));
        bytecode.empend(checkpoint);

        bytecode.empend(static_cast(OpCodeId::JumpNonEmpty));
        bytecode.empend(-bytecode.size() - 3); // Jump(...) to the _START label...
        bytecode.empend(checkpoint);           // ...only if _C passes.
        bytecode.empend((ByteCodeValueType)OpCodeId::Jump);
        // LABEL _END = bytecode.size()

        bytecode_to_repeat = move(bytecode);
    }

    // Rewrites `bytecode_to_repeat` in place into an optional (zero-or-one) match; `greedy`
    // selects the kind of the leading fork (ForkStay when greedy, ForkJump otherwise).
    static void transform_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
    {
        // FORKJUMP _END (FORKSTAY -> Greedy)
        // CHECKPOINT _C
        // REGEXP
        // FAILIFEMPTY _C
        // LABEL _END
        ByteCode bytecode;

        if (greedy)
            bytecode.empend(static_cast(OpCodeId::ForkStay));
        else
            bytecode.empend(static_cast(OpCodeId::ForkJump));

        // Offset skips the body plus Checkpoint (2 values) and FailIfEmpty (2 values).
        bytecode.empend(bytecode_to_repeat.size() + 4); // Jump to the _END label

        auto checkpoint = s_next_checkpoint_serial_id++;
        bytecode.empend(static_cast(OpCodeId::Checkpoint));
        bytecode.empend(static_cast(checkpoint));

        bytecode.extend(move(bytecode_to_repeat));

        bytecode.empend(static_cast(OpCodeId::FailIfEmpty));
        bytecode.empend(checkpoint);

        // LABEL _END = bytecode.size()

        bytecode_to_repeat = move(bytecode);
    }

    OpCode& get_opcode(MatchState& state) const;

    // Restarts checkpoint id allocation at 0.
    static void reset_checkpoint_serial_id() { s_next_checkpoint_serial_id = 0; }

private:
    void ensure_opcodes_initialized();
    ALWAYS_INLINE OpCode& get_opcode_by_id(OpCodeId id) const;
    // One slot per OpCodeId value (Last + 1 entries).
    static OwnPtr> s_opcodes[(size_t)OpCodeId::Last + 1];
    static bool s_opcodes_initialized;
    // Next id handed out to Checkpoint / FailIfEmpty / JumpNonEmpty instructions.
    static size_t s_next_checkpoint_serial_id;
};

// Bytecode stored in a single Vector, created from a ByteCode via from().
class REGEX_API FlatByteCode : public ByteCodeBase {
public:
    // Moves the first chunk of `bytecode` (if it has any) together with all of its tables into
    // a new FlatByteCode. Only first_chunk() is taken.
    // NOTE(review): presumably the caller has flattened `bytecode` to one chunk beforehand — confirm.
    static FlatByteCode from(ByteCode&& bytecode)
    {
        ensure_opcodes_initialized();
        FlatByteCode flat_bytecode;
        if (!bytecode.is_empty())
            flat_bytecode.m_data = move(static_cast&>(bytecode).first_chunk());
        flat_bytecode.m_string_table = move(bytecode.m_string_table);
        flat_bytecode.m_u16_string_table = move(bytecode.m_u16_string_table);
        flat_bytecode.m_string_set_table = move(bytecode.m_string_set_table);
        flat_bytecode.m_group_name_mappings = move(bytecode.m_group_name_mappings);
        return flat_bytecode;
    }

    Span flat_data() const { return m_data.span(); }

    OpCode& get_opcode(MatchState& state) const;

    // Element access goes through the raw data pointer of m_data.
    auto& at(size_t index) { return m_data.data()[index]; }
    auto const& at(size_t index) const { return
m_data.data()[index]; }
    auto& operator[](size_t index) { return m_data.data()[index]; }
    auto const& operator[](size_t index) const { return m_data.data()[index]; }

    auto size() const { return m_data.size(); }
    auto begin() const { return m_data.begin(); }
    auto end() const { return m_data.end(); }

private:
    static void ensure_opcodes_initialized();
    ALWAYS_INLINE OpCode& get_opcode_by_id(OpCodeId id) const;
    // One slot per OpCodeId value (Last + 1 entries).
    static OwnPtr> s_opcodes[(size_t)OpCodeId::Last + 1];
    static bool s_opcodes_initialized;

    Vector m_data;
};

// X-macro listing every execution result. Expand it by defining __ENUMERATE_EXECUTION_RESULT(x) first.
#define ENUMERATE_EXECUTION_RESULTS                                                    \
    __ENUMERATE_EXECUTION_RESULT(Continue)                                             \
    __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh)                                        \
    __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow)                                         \
    __ENUMERATE_EXECUTION_RESULT(Failed)                                               \
    __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks)                           \
    __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForksButNoFurtherPossibleMatches) \
    __ENUMERATE_EXECUTION_RESULT(Succeeded)

// Return type of OpCode::execute(); enumerators are generated from ENUMERATE_EXECUTION_RESULTS in list order.
enum class ExecutionResult : u8 {
#define __ENUMERATE_EXECUTION_RESULT(x) x,
    ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
};

// Name lookups for the enums of this header; defined elsewhere.
StringView execution_result_name(ExecutionResult result);
StringView opcode_id_name(OpCodeId opcode_id);
StringView boundary_check_type_name(BoundaryCheckType);
StringView character_compare_type_name(CharacterCompareType result);
StringView character_class_name(CharClass ch_class);
StringView fork_if_condition_name(ForkIfCondition condition);

// Abstract base of all opcode classes. An instance does not own its bytecode or match state:
// both are attached as raw pointers through set_bytecode() / set_state(), and must be set
// before argument(), state() or bytecode() is used.
template
class OpCode {
public:
    OpCode() = default;
    virtual ~OpCode() = default;

    virtual OpCodeId opcode_id() const = 0;
    virtual size_t size() const = 0;
    virtual ExecutionResult execute(MatchInput const& input, MatchState& state) const = 0;

    // Reads the value stored `1 + offset` positions after the attached state's
    // instruction_position, i.e. argument(0) is the value right behind the opcode itself.
    ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
    {
        return m_bytecode->at(state().instruction_position + 1 + offset);
    }

    ALWAYS_INLINE StringView name() const { return name(opcode_id()); }
    static StringView name(OpCodeId);

    ALWAYS_INLINE void set_state(MatchState const& state) { m_state = &state; }

    ALWAYS_INLINE void set_bytecode(ByteCode& bytecode) { m_bytecode = &bytecode; }

    ALWAYS_INLINE MatchState const& state() const { return *m_state; }

    // Formats the opcode id (as a hexadecimal number) followed by the opcode name.
    ByteString to_byte_string() const
    {
        return ByteString::formatted("[{:#02X}] {}", (int)opcode_id(), name(opcode_id()));
    }

    virtual ByteString arguments_string() const = 0;

    ALWAYS_INLINE ByteCode const& bytecode() const { return *m_bytecode; }

protected:
    ByteCode* m_bytecode { nullptr };
    MatchState const* m_state { nullptr };
};

// SaveModifiers: size() is 2; the single argument is exposed as new_modifiers().
// execute() is defined out of line.
template
class REGEX_API OpCode_SaveModifiers final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveModifiers; }
    ALWAYS_INLINE size_t size() const override { return 2; }
    ALWAYS_INLINE FlagsUnderlyingType new_modifiers() const { return argument(0); }
    ByteString arguments_string() const override { return ByteString::formatted("new_modifiers={:#x}", new_modifiers()); }
};

// RestoreModifiers: size() is 1, no arguments. execute() is defined out of line.
template
class REGEX_API OpCode_RestoreModifiers final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::RestoreModifiers; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::empty(); }
};

// Exit: size() is 1, no arguments. execute() is defined out of line.
template
class REGEX_API OpCode_Exit final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::empty(); }
};

template
class REGEX_API OpCode_FailForks
final : public OpCode {
public:
    // FailForks: size() is 1, no arguments. execute() is defined out of line.
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::empty(); }
};

// PopSaved: size() is 1, no arguments. execute() is defined out of line.
template
class REGEX_API OpCode_PopSaved final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::PopSaved; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::empty(); }
};

// Save: size() is 1, no arguments. execute() is defined out of line.
template
class REGEX_API OpCode_Save final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::empty(); }
};

// Restore: size() is 1, no arguments. execute() is defined out of line.
template
class REGEX_API OpCode_Restore final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::empty(); }
};

// GoBack: size() is 2; the single argument is exposed as count() (read as size_t).
// execute() is defined out of line.
template
class REGEX_API OpCode_GoBack final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
    ALWAYS_INLINE size_t size() const override { return 2; }
    ALWAYS_INLINE size_t count() const { return argument(0); }
    ByteString arguments_string() const override { return ByteString::formatted("count={}", count()); }
};

// SetStepBack: size() is 2; the single argument is exposed as step() and converted to a
// signed i64 on the way out. execute() is defined out of line.
template
class REGEX_API OpCode_SetStepBack final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SetStepBack; }
    ALWAYS_INLINE size_t size() const override { return 2; }
    ALWAYS_INLINE i64 step() const { return argument(0); }
    ByteString arguments_string() const override { return ByteString::formatted("step={}", step()); }
};

// IncStepBack: size() is 1, no arguments; arguments_string() is a fixed description.
// execute() is defined out of line.
template
class REGEX_API OpCode_IncStepBack final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::IncStepBack; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::formatted("inc step back"); }
};

// CheckStepBack: size() is 1, no arguments; arguments_string() is a fixed description.
// execute() is defined out of line.
template
class REGEX_API OpCode_CheckStepBack final : public OpCode {
public:
    using OpCode::argument;
    using OpCode::name;
    using OpCode::state;
    using OpCode::bytecode;

    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckStepBack; }
    ALWAYS_INLINE size_t size() const override { return 1; }
    ByteString arguments_string() const override { return ByteString::formatted("check step back"); }
};

// CheckSavedPosition: size() is 1, no arguments; arguments_string() is a fixed description.
// execute() is defined out of line.
template
class REGEX_API OpCode_CheckSavedPosition final : public OpCode {
public:
    using
OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckSavedPosition; } ALWAYS_INLINE size_t size() const override { return 1; } /* NOTE(review): the fixed label below reads check saved back although the opcode is CheckSavedPosition; possibly copied from a neighbouring class. Confirm before relying on it in logs. */ ByteString arguments_string() const override { return ByteString::formatted("check saved back"); } }; /* OpCode_Jump: opcode class reporting OpCodeId::Jump. size() is 2; the single argument is a signed offset read through offset(). */ template class REGEX_API OpCode_Jump final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; } ALWAYS_INLINE size_t size() const override { return 2; } /* offset(): argument 0 converted to ssize_t. */ ALWAYS_INLINE ssize_t offset() const { return argument(0); } /* Prints the raw offset followed by the bracketed value instruction_position + size() + offset(), i.e. the offset applied from the position just past this instruction. */ ByteString arguments_string() const override { return ByteString::formatted("offset={} [&{}]", offset(), state().instruction_position + size() + offset()); } }; /* OpCode_ForkJump: opcode class reporting OpCodeId::ForkJump, with the same size() and offset() layout as OpCode_Jump. It is not final because OpCode_ForkReplaceJump derives from it. The debug string additionally prints state().string_position. */ template class REGEX_API OpCode_ForkJump : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; } ALWAYS_INLINE size_t size() const override { return 2; } ALWAYS_INLINE ssize_t offset() const { return argument(0); } ByteString arguments_string() const override { return ByteString::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position); } }; /* OpCode_ForkReplaceJump: derives from OpCode_ForkJump and reuses its offset() and size() through using-declarations; it declares its own execute() override and reports OpCodeId::ForkReplaceJump. arguments_string() is inherited unchanged. */ template class REGEX_API OpCode_ForkReplaceJump final : public OpCode_ForkJump { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; using OpCode_ForkJump::offset; using OpCode_ForkJump::size; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId
opcode_id() const override { return OpCodeId::ForkReplaceJump; } }; /* OpCode_ForkStay: opcode class reporting OpCodeId::ForkStay. size() is 2 and argument 0 is a signed offset. It is not final because OpCode_ForkReplaceStay derives from it. */ template class REGEX_API OpCode_ForkStay : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; } ALWAYS_INLINE size_t size() const override { return 2; } /* offset(): argument 0 converted to ssize_t. */ ALWAYS_INLINE ssize_t offset() const { return argument(0); } /* Prints the raw offset, the value instruction_position + size() + offset() in brackets, and the current string_position. */ ByteString arguments_string() const override { return ByteString::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position); } }; /* OpCode_ForkReplaceStay: derives from OpCode_ForkStay, reusing its offset() and size(); only execute() and opcode_id() are overridden, the latter reporting OpCodeId::ForkReplaceStay. */ template class REGEX_API OpCode_ForkReplaceStay final : public OpCode_ForkStay { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; using OpCode_ForkStay::offset; using OpCode_ForkStay::size; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkReplaceStay; } }; /* OpCode_CheckBegin: argument-less opcode class (size() is 1, empty arguments_string()) reporting OpCodeId::CheckBegin. */ template class REGEX_API OpCode_CheckBegin final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; } ALWAYS_INLINE size_t size() const override { return 1; } ByteString arguments_string() const override { return ByteString::empty(); } }; /* OpCode_CheckEnd: argument-less opcode class (size() is 1) reporting OpCodeId::CheckEnd. */ template class REGEX_API OpCode_CheckEnd final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; } ALWAYS_INLINE size_t size() const override { return 1; } ByteString arguments_string()
const override { return ByteString::empty(); } }; /* OpCode_CheckBoundary: opcode class reporting OpCodeId::CheckBoundary. size() is 2 and arguments_count() is fixed at 1; argument 0 selects the boundary check kind. */ template class REGEX_API OpCode_CheckBoundary final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; } ALWAYS_INLINE size_t size() const override { return 2; } ALWAYS_INLINE size_t arguments_count() const { return 1; } /* type(): argument 0 cast to the enum type and returned as BoundaryCheckType. */ ALWAYS_INLINE BoundaryCheckType type() const { return static_cast(argument(0)); } /* Prints the numeric value of argument 0 followed by the name obtained from boundary_check_type_name(). */ ByteString arguments_string() const override { return ByteString::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); } }; /* OpCode_ClearCaptureGroup: opcode class reporting OpCodeId::ClearCaptureGroup. size() is 2; argument 0 is exposed as id(). */ template class REGEX_API OpCode_ClearCaptureGroup final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; } ALWAYS_INLINE size_t size() const override { return 2; } ALWAYS_INLINE size_t id() const { return argument(0); } ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } }; /* OpCode_FailIfEmpty: opcode class reporting OpCodeId::FailIfEmpty. size() is 2; argument 0 is exposed as checkpoint(). NOTE(review): presumably this refers to an id recorded by OpCode_Checkpoint; that link is not visible in this class. */ template class REGEX_API OpCode_FailIfEmpty final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailIfEmpty; } ALWAYS_INLINE size_t size() const override { return 2; } ALWAYS_INLINE size_t checkpoint() const { return argument(0); } ByteString arguments_string() const override { return ByteString::formatted("checkpoint={}", checkpoint()); } }; /* OpCode_SaveLeftCaptureGroup: opcode class reporting OpCodeId::SaveLeftCaptureGroup; size() is 2 and argument 0 is exposed as id(). */ template class REGEX_API OpCode_SaveLeftCaptureGroup final : public OpCode { public: using OpCode::argument; using OpCode::name;
using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; } ALWAYS_INLINE size_t size() const override { return 2; } /* id(): argument 0 returned as size_t. */ ALWAYS_INLINE size_t id() const { return argument(0); } ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } }; /* OpCode_SaveRightCaptureGroup: opcode class reporting OpCodeId::SaveRightCaptureGroup, with the same layout as the left variant: size() is 2 and argument 0 is id(). */ template class REGEX_API OpCode_SaveRightCaptureGroup final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; } ALWAYS_INLINE size_t size() const override { return 2; } ALWAYS_INLINE size_t id() const { return argument(0); } ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } }; /* OpCode_SaveRightNamedCaptureGroup: opcode class reporting OpCodeId::SaveRightNamedCaptureGroup. size() is 3: argument 0 is an index used with bytecode().get_string() to obtain the group name, argument 1 is the group id. */ template class REGEX_API OpCode_SaveRightNamedCaptureGroup final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; } ALWAYS_INLINE size_t size() const override { return 3; } /* name(): looks the name up via get_string() on every call rather than caching it. */ ALWAYS_INLINE FlyString name() const { return bytecode().get_string(name_string_table_index()); } ALWAYS_INLINE size_t name_string_table_index() const { return argument(0); } /* length(): length of the name as reported by its bytes_as_string_view(). */ ALWAYS_INLINE size_t length() const { return name().bytes_as_string_view().length(); } ALWAYS_INLINE size_t id() const { return argument(1); } /* Prints the raw string table index, not the resolved name. */ ByteString arguments_string() const override { return ByteString::formatted("name_id={}, id={}", argument(0), id()); } }; /* OpCode_RSeekTo: opcode class reporting OpCodeId::RSeekTo; size() is 2 and its single argument is only read inside arguments_string(). */ template class REGEX_API OpCode_RSeekTo final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using
OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::RSeekTo; } ALWAYS_INLINE size_t size() const override { return 2; } /* Debug text: values of argument 0 up to 0x7f use the quoted form, larger values are printed as u+ followed by at least four hex digits. NOTE(review): ch keeps whatever type argument() returns; if that is an integer type the quoted branch prints a number rather than the character. Confirm whether a character was intended. */ ByteString arguments_string() const override { auto ch = argument(0); if (ch <= 0x7f) return ByteString::formatted("before '{}'", ch); return ByteString::formatted("before u+{:04x}", argument(0)); } }; /* CompareInternals: common base of OpCode_Compare and OpCode_CompareSimple. It exposes matches_character_class() and flat_compares() publicly and keeps execute() plus the static compare_* helpers protected. Only declarations appear here. */ template class REGEX_API CompareInternals : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; static bool matches_character_class(CharClass, u32, bool insensitive, bool unicode_mode); Vector flat_compares() const; protected: ExecutionResult execute(MatchInput const& input, MatchState& state) const override; /* The helpers below share a pattern: they take the match input and mutable state, and most report through the inverse_matched out-parameter instead of a return value. compare_string is the exception: it returns bool and reports had_zero_length_match. */ ALWAYS_INLINE static void compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched); ALWAYS_INLINE static bool compare_string(MatchInput const& input, MatchState& state, RegexStringView str, bool& had_zero_length_match); ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); /* The property and general category helpers take an extra is_double_negation flag that the script helpers do not. */ ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool is_double_negation, bool& inverse_matched); ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool is_double_negation, bool& inverse_matched); ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_script_extension(MatchInput const&
input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched); }; /* OpCode_Compare: variable-length compare opcode built on CompareInternals, reporting OpCodeId::Compare. Argument 0 is the number of compare arguments and argument 1 their total size; size() is that total size plus 3. execute() is inherited from CompareInternals. */ template class REGEX_API OpCode_Compare : public CompareInternals { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; using CompareInternals::flat_compares; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; } ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; } ALWAYS_INLINE size_t arguments_count() const { return argument(0); } ALWAYS_INLINE size_t arguments_size() const { return argument(1); } /* Both string helpers are only declared here. */ ByteString arguments_string() const override; Vector variable_arguments_to_byte_string(Optional input = {}) const; }; /* OpCode_CompareSimple: compare opcode reporting OpCodeId::CompareSimple with a fixed arguments_count() of 1. Argument 0 holds the payload size, so size() is 2 plus arguments_size(). */ template class REGEX_API OpCode_CompareSimple final : public CompareInternals { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; using CompareInternals::flat_compares; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CompareSimple; } ALWAYS_INLINE size_t size() const override { return 2 + arguments_size(); } // CompareSimple * ALWAYS_INLINE size_t arguments_count() const { return 1; } ALWAYS_INLINE size_t arguments_size() const { return argument(0); } ByteString arguments_string() const override; }; /* OpCode_Repeat: opcode class reporting OpCodeId::Repeat. size() is 4: argument 0 is offset(), argument 1 is count() and argument 2 is id(). */ template class REGEX_API OpCode_Repeat : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Repeat; } ALWAYS_INLINE size_t size() const override { return 4; } ALWAYS_INLINE size_t offset() const { return argument(0); } ALWAYS_INLINE u64 count() const { return argument(1); } ALWAYS_INLINE size_t id() const { return argument(2); } /* reps falls back to 0 when id() is outside state().repetition_marks, so the lookup never indexes past the end. */ ByteString arguments_string() const override { auto reps = id() < state().repetition_marks.size() ?
state().repetition_marks.at(id()) : 0; /* The bracketed value is instruction_position minus offset(), so the offset is displayed as pointing backwards; count and rep are printed with 1 added. */ return ByteString::formatted("offset={} [&{}] count={} id={} rep={}, sp: {}", static_cast(offset()), state().instruction_position - offset(), count() + 1, id(), reps + 1, state().string_position); } }; /* OpCode_ResetRepeat: opcode class reporting OpCodeId::ResetRepeat. size() is 2; argument 0 is id(), which the debug string uses to index state().repetition_marks. */ template class REGEX_API OpCode_ResetRepeat : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ResetRepeat; } ALWAYS_INLINE size_t size() const override { return 2; } ALWAYS_INLINE size_t id() const { return argument(0); } /* Same bounds-checked lookup as in OpCode_Repeat: an id outside repetition_marks is shown as rep=1 because 0 is used and then 1 is added. */ ByteString arguments_string() const override { auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0; return ByteString::formatted("id={} rep={}", id(), reps + 1); } }; /* OpCode_Checkpoint: opcode class reporting OpCodeId::Checkpoint; size() is 2 and argument 0 is id(). */ template class REGEX_API OpCode_Checkpoint final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Checkpoint; } ALWAYS_INLINE size_t size() const override { return 2; } ALWAYS_INLINE size_t id() const { return argument(0); } ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); } }; /* OpCode_JumpNonEmpty: opcode class reporting OpCodeId::JumpNonEmpty. size() is 4: argument 0 is a signed offset, argument 1 is checkpoint() and argument 2 is an OpCodeId returned by form(). */ template class REGEX_API OpCode_JumpNonEmpty final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::JumpNonEmpty; } ALWAYS_INLINE size_t size() const override { return 4; } ALWAYS_INLINE ssize_t offset() const { return argument(0); } /* NOTE(review): checkpoint() is ssize_t here but size_t in OpCode_FailIfEmpty; confirm whether the signed type is intentional. */ ALWAYS_INLINE ssize_t checkpoint() const { return argument(1); } ALWAYS_INLINE OpCodeId form() const { return
(OpCodeId)argument(2); } /* Prints the name of form(), the raw offset, the value instruction_position + size() + offset() in brackets, and the checkpoint. */ ByteString arguments_string() const override { return ByteString::formatted("{} offset={} [&{}], cp={}", opcode_id_name(form()), offset(), state().instruction_position + size() + offset(), checkpoint()); } }; /* OpCode_ForkIf: opcode class reporting OpCodeId::ForkIf. size() is 4: argument 0 is a signed offset, argument 1 is an OpCodeId returned by form() and argument 2 is a ForkIfCondition returned by condition(). */ template class REGEX_API OpCode_ForkIf final : public OpCode { public: using OpCode::argument; using OpCode::name; using OpCode::state; using OpCode::bytecode; ExecutionResult execute(MatchInput const& input, MatchState& state) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkIf; } ALWAYS_INLINE size_t size() const override { return 4; } ALWAYS_INLINE ssize_t offset() const { return argument(0); } ALWAYS_INLINE OpCodeId form() const { return (OpCodeId)argument(1); } ALWAYS_INLINE ForkIfCondition condition() const { return (ForkIfCondition)argument(2); } ByteString arguments_string() const override { return ByteString::formatted("{} {} offset={} [&{}]", opcode_id_name(form()), fork_if_condition_name(condition()), offset(), state().instruction_position + size() + offset()); } }; /* FlatByteCode::get_opcode: returns the opcode object for the value stored at state.instruction_position in m_data. A position at or past the end of m_data is treated as OpCodeId::Exit. An id outside First..Last is logged and then trips VERIFY_NOT_REACHED. The returned object has the given state attached through set_state(). */ ALWAYS_INLINE OpCode& FlatByteCode::get_opcode(regex::MatchState& state) const { OpCodeId opcode_id; if (m_data.size() <= state.instruction_position) opcode_id = OpCodeId::Exit; else opcode_id = static_cast(m_data.data()[state.instruction_position]); /* The valid case is the empty branch; only the else branch does work. */ if (opcode_id >= OpCodeId::First && opcode_id <= OpCodeId::Last) { } else { dbgln("Invalid OpCodeId requested: {} at {}", (u32)opcode_id, state.instruction_position); VERIFY_NOT_REACHED(); } auto& opcode = get_opcode_by_id(opcode_id); opcode.set_state(state); return opcode; } /* FlatByteCode::get_opcode_by_id: range-checks id, fetches the entry of s_opcodes at that index and points it at this bytecode object through set_bytecode() before returning it. The entry itself is modified, so the const on this function is worked around with const_cast. */ ALWAYS_INLINE OpCode& FlatByteCode::get_opcode_by_id(OpCodeId id) const { if (id >= OpCodeId::First && id <= OpCodeId::Last) { } else { dbgln("Invalid OpCodeId requested: {}", (u32)id); VERIFY_NOT_REACHED(); } auto& opcode = s_opcodes[(u32)id]; opcode->set_bytecode(*const_cast(this)); return *opcode; } /* ByteCode::get_opcode: counterpart of FlatByteCode::get_opcode that locates the opcode value with find() on the base container instead of indexing m_data. */ ALWAYS_INLINE OpCode& ByteCode::get_opcode(regex::MatchState& state) const { OpCodeId opcode_id; if (auto
opcode_ptr = static_cast const&>(*this).find(state.instruction_position)) opcode_id = (OpCodeId)*opcode_ptr; else /* find() produced nothing for this position: treat it as Exit. */ opcode_id = OpCodeId::Exit; auto& opcode = get_opcode_by_id(opcode_id); opcode.set_state(state); return opcode; } /* ByteCode::get_opcode_by_id: asserts with VERIFY that id lies in First..Last, then returns the s_opcodes entry for that id after attaching this bytecode object to it via set_bytecode(). Unlike the FlatByteCode version it does not log before failing. */ ALWAYS_INLINE OpCode& ByteCode::get_opcode_by_id(OpCodeId id) const { VERIFY(id >= OpCodeId::First && id <= OpCodeId::Last); auto& opcode = s_opcodes[(u32)id]; opcode->set_bytecode(*const_cast(this)); return *opcode; } /* Detail::Is: implementation helper behind the is() function template below. The primary template defers to the global ::is; each specialization instead compares opcode_id() against one OpCodeId. NOTE(review): the specialization argument lists are not visible in this text, so the opcode class each one targets is inferred from the id it compares against (FailForks, Exit, Compare, SetStepBack, IncStepBack, CheckStepBack, CheckSavedPosition). Confirm against the full header. */ namespace Detail { template class T, typename ByteCode> struct Is { static bool is(OpCode const& opcode) { return ::is>(opcode); } }; template struct Is { static bool is(OpCode const& opcode) { return opcode.opcode_id() == OpCodeId::FailForks; } }; template struct Is { static bool is(OpCode const& opcode) { return opcode.opcode_id() == OpCodeId::Exit; } }; template struct Is { static bool is(OpCode const& opcode) { return opcode.opcode_id() == OpCodeId::Compare; } }; template struct Is { static bool is(OpCode const& opcode) { return opcode.opcode_id() == OpCodeId::SetStepBack; } }; template struct Is { static bool is(OpCode const& opcode) { return opcode.opcode_id() == OpCodeId::IncStepBack; } }; template struct Is { static bool is(OpCode const& opcode) { return opcode.opcode_id() == OpCodeId::CheckStepBack; } }; template struct Is { static bool is(OpCode const& opcode) { return opcode.opcode_id() == OpCodeId::CheckSavedPosition; } }; } /* is(): type test on an opcode object; forwards to Detail::Is. */ template class T, typename ByteCode> bool is(OpCode const& opcode) { return Detail::Is::is(opcode); } /* to(): four overloads covering const reference, pointer, const pointer and reference; each forwards its argument to as(). */ template class T, typename ByteCode> ALWAYS_INLINE T const& to(OpCode const& opcode) { return as>(opcode); } template class T, typename ByteCode> ALWAYS_INLINE T* to(OpCode* opcode) { return as>(opcode); } template class T, typename ByteCode> ALWAYS_INLINE T const* to(OpCode const* opcode) { return as>(opcode); } template class T, typename ByteCode> ALWAYS_INLINE T& to(OpCode& opcode) { return as>(opcode); } /* OpCode::name(OpCodeId): maps an opcode id to its enumerator name as a StringView. The switch cases are generated from ENUMERATE_OPCODES. */ template StringView OpCode::name(OpCodeId opcode_id) { switch (opcode_id) { #define
__ENUMERATE_OPCODE(x) \ case OpCodeId::x: \ return #x##sv; ENUMERATE_OPCODES #undef __ENUMERATE_OPCODE /* Each generated case returns the stringified enumerator with the sv suffix pasted on. Any value not listed in ENUMERATE_OPCODES reaches the default label and trips VERIFY_NOT_REACHED; the return after it is never expected to run. */ default: VERIFY_NOT_REACHED(); return ""sv; } } }