/* * Copyright (c) 2021, Ali Mohammad Pur * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if REGEX_DEBUG # include # include #endif namespace regex { using Detail::Block; struct BytecodeRewriter { struct Instruction { size_t old_ip; size_t size; OpCodeId id; bool skip; }; Vector instructions; HashMap new_ip_mapping; StringView target_pattern; BytecodeRewriter(ByteCode const& bytecode, StringView pattern = {}) : target_pattern(pattern) { auto flat = bytecode.flat_data(); auto state = MatchState::only_for_enumeration(); for (size_t old_ip = 0; old_ip < flat.size();) { state.instruction_position = old_ip; auto& op = bytecode.get_opcode(state); auto sz = op.size(); instructions.append({ old_ip, sz, op.opcode_id(), false }); old_ip += sz; } } void mark_range_for_skip(size_t start, size_t end) { for (auto& instr : instructions) { if (instr.old_ip >= start && instr.old_ip < end) instr.skip = true; } } void build_ip_mapping(ByteCode const& bytecode, Span replacements) { new_ip_mapping.ensure_capacity(instructions.size() + 1); size_t current_new_ip = 0; auto replacements_it = replacements.begin(); for (auto& instr : instructions) { new_ip_mapping.set(instr.old_ip, current_new_ip); auto& replacement = *replacements_it; ++replacements_it; if (!instr.skip) current_new_ip += instr.size; else current_new_ip += replacement.size(); } new_ip_mapping.set(bytecode.size(), current_new_ip); } template requires(requires(Range r) { r.start_ip; r.end_ip; }) void build_ip_mapping(ByteCode const& bytecode, Span replacement_ranges, Span replacements) { new_ip_mapping.ensure_capacity(instructions.size() + 1); size_t current_new_ip = 0; auto instruction_it = instructions.begin(); for (auto i = 0uz; i < replacements.size(); ++i) { auto& range = replacement_ranges[i]; auto& replacement = replacements[i]; while (instruction_it != instructions.end()) { auto& instr = 
*instruction_it; if (instr.old_ip >= range.start_ip) { ASSERT(instr.old_ip < range.end_ip); new_ip_mapping.set(instr.old_ip, current_new_ip); break; } new_ip_mapping.set(instr.old_ip, current_new_ip); current_new_ip += instr.size; ++instruction_it; } current_new_ip += replacement.size(); // Skip instructions in the replacement range while (instruction_it != instructions.end()) { auto& instr = *instruction_it; if (instr.old_ip >= range.end_ip) break; ++instruction_it; } } // Map any remaining instructions for (; instruction_it != instructions.end(); ++instruction_it) { auto& instr = *instruction_it; new_ip_mapping.set(instr.old_ip, current_new_ip); current_new_ip += instr.size; } new_ip_mapping.set(bytecode.size(), current_new_ip); } template requires(requires(Range r) { r.start_ip; r.end_ip; }) ByteCode rebuild(ByteCode const& bytecode, Span replacement_ranges, Span replacements) { // Assumes that replacement_ranges and replacements are the same size // As well as in order VERIFY(replacement_ranges.size() == replacements.size()); auto flat = bytecode.flat_data(); ByteCode result; result.merge_string_tables_from({ &bytecode, 1 }); size_t total_new_size = bytecode.size(); // FIXME: Get a zip(it...) helper for (auto i = 0uz; i < replacement_ranges.size(); ++i) { mark_range_for_skip(replacement_ranges[i].start_ip, replacement_ranges[i].end_ip); total_new_size -= (replacement_ranges[i].end_ip - replacement_ranges[i].start_ip); total_new_size += replacements[i].size(); } build_ip_mapping(bytecode, replacement_ranges, replacements); result.ensure_capacity(total_new_size); // FIXME: Use a zip(...) 
helper auto instructions_it = instructions.begin(); for (auto i = 0uz; i < replacement_ranges.size(); ++i) { auto& range = replacement_ranges[i]; auto& replacement = replacements[i]; // Append and adjust all instructions before the replacement range for (; instructions_it != instructions.end(); ++instructions_it) { auto& instr = *instructions_it; if (instr.old_ip >= range.start_ip) { ASSERT(instr.old_ip < range.end_ip); ++instructions_it; break; } VERIFY(instr.skip == false); auto slice = Vector { flat.slice(instr.old_ip, instr.size) }; adjust_jump_in_slice(bytecode, slice, instr); result.append(move(slice)); } // Finally insert the replacement result.extend(replacement); // Skip instructions in the replacement range while (instructions_it != instructions.end()) { auto& instr = *instructions_it; if (instr.old_ip >= range.end_ip) break; ++instructions_it; } } // Append any remaining instructions for (; instructions_it != instructions.end(); ++instructions_it) { auto& instr = *instructions_it; auto slice = Vector { flat.slice(instr.old_ip, instr.size) }; adjust_jump_in_slice(bytecode, slice, instr); result.append(move(slice)); VERIFY(instr.skip == false); } result.flatten(); return result; } ByteCode rebuild(ByteCode const& bytecode, Function insert_replacement = nullptr) { auto flat = bytecode.flat_data(); ByteCode result; result.merge_string_tables_from({ &bytecode, 1 }); Vector replacements; replacements.resize_with_default_value(instructions.size(), ByteCode {}); size_t total_new_size = 0; for (auto const& [i, instr] : enumerate(instructions)) { if (!instr.skip) { total_new_size += instr.size; } else if (insert_replacement) { ByteCode temp; insert_replacement(instr, temp); total_new_size += temp.size(); replacements[i] = move(temp); } } build_ip_mapping(bytecode, replacements); result.ensure_capacity(total_new_size); auto replacements_it = replacements.begin(); for (auto& instr : instructions) { auto& replacement = *replacements_it; ++replacements_it; if 
(instr.skip) { result.extend(move(replacement)); continue; } auto slice = Vector { flat.slice(instr.old_ip, instr.size) }; adjust_jump_in_slice(bytecode, slice, instr); result.append(move(slice)); } result.flatten(); return result; } private: void adjust_jump_in_slice(ByteCode const& bytecode, Vector& slice, Instruction const& instr) { auto adjust = [&](size_t idx, bool is_repeat) { auto old_offset = slice[idx]; auto target_old = is_repeat ? instr.old_ip - old_offset : instr.old_ip + instr.size + old_offset; if (!new_ip_mapping.contains(target_old)) { dbgln("In pattern /{}/", target_pattern); dbgln("Target {} not found in new_ip mapping (in {})", target_old, instr.old_ip); RegexDebug dbg(stderr); dbg.print_bytecode(bytecode); VERIFY_NOT_REACHED(); } size_t target_new = *new_ip_mapping.get(target_old); size_t source_new = *new_ip_mapping.get(instr.old_ip); auto new_offset = is_repeat ? source_new - target_new : target_new - source_new - instr.size; slice[idx] = static_cast(new_offset); }; switch (instr.id) { case OpCodeId::Jump: case OpCodeId::ForkJump: case OpCodeId::ForkStay: case OpCodeId::ForkReplaceJump: case OpCodeId::ForkReplaceStay: case OpCodeId::JumpNonEmpty: case OpCodeId::ForkIf: adjust(1, false); break; case OpCodeId::Repeat: adjust(1, true); break; default: break; } } }; template static typename Regex::BasicBlockList split_basic_blocks_for_atomic_groups(ByteCode const& bytecode) { typename Regex::BasicBlockList block_boundaries; size_t end_of_last_block = 0; auto bytecode_size = bytecode.size(); auto state = MatchState::only_for_enumeration(); state.instruction_position = 0; auto check_jump = [&] class T>(auto const& opcode) { auto& op = static_cast const&>(opcode); ssize_t jump_offset = op.size() + op.offset(); if (jump_offset >= 0) { block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump ahead"sv }); end_of_last_block = state.instruction_position + opcode.size(); } else { // This op jumps back, see if that's within this 
"block". if (jump_offset + state.instruction_position > end_of_last_block) { // Split the block! block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position, "Jump back 1"sv }); block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position, "Jump back 2"sv }); end_of_last_block = state.instruction_position + opcode.size(); } else { // Nope, it's just a jump to another block block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump"sv }); end_of_last_block = state.instruction_position + opcode.size(); } } }; for (;;) { auto& opcode = bytecode.get_opcode(state); switch (opcode.opcode_id()) { case OpCodeId::Jump: check_jump.template operator()(opcode); break; case OpCodeId::JumpNonEmpty: check_jump.template operator()(opcode); break; case OpCodeId::ForkJump: check_jump.template operator()(opcode); break; case OpCodeId::ForkStay: check_jump.template operator()(opcode); break; case OpCodeId::ForkIf: check_jump.template operator()(opcode); break; case OpCodeId::FailForks: block_boundaries.append({ end_of_last_block, state.instruction_position, "FailForks"sv }); end_of_last_block = state.instruction_position + opcode.size(); break; case OpCodeId::Repeat: { // Repeat produces two blocks, one containing its repeated expr, and one after that. 
auto& repeat = static_cast const&>(opcode); auto repeat_start = state.instruction_position - repeat.offset(); if (repeat_start > end_of_last_block) block_boundaries.append({ end_of_last_block, repeat_start, "Repeat"sv }); block_boundaries.append({ repeat_start, state.instruction_position, "Repeat after"sv }); end_of_last_block = state.instruction_position + opcode.size(); break; } default: break; } auto next_ip = state.instruction_position + opcode.size(); if (next_ip < bytecode_size) state.instruction_position = next_ip; else break; } if (end_of_last_block < bytecode_size) block_boundaries.append({ end_of_last_block, bytecode_size, "End"sv }); quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; }); return block_boundaries; } template void Regex::run_optimization_passes() { ScopeGuard switch_to_flat = [&] { parser_result.bytecode = FlatByteCode::from(move(parser_result.bytecode.template get())); }; rewrite_with_useless_jumps_removed(); auto blocks = split_basic_blocks(parser_result.bytecode.get()); if (attempt_rewrite_entire_match_as_substring_search(blocks)) { return; } // Rewrite fork loops as atomic groups // e.g. a*b -> (ATOMIC a*)b blocks = split_basic_blocks_for_atomic_groups(parser_result.bytecode.get()); attempt_rewrite_loops_as_atomic_groups(blocks); // Join adjacent compares that only match single characters into a single compare that matches a string. 
blocks = split_basic_blocks(parser_result.bytecode.get()); attempt_rewrite_adjacent_compares_as_string_compare(blocks); // Rewrite /.*x/ as a seek to x blocks = split_basic_blocks(parser_result.bytecode.get()); attempt_rewrite_dot_star_sequences_as_seek(blocks); // Simplify compares where possible blocks = split_basic_blocks(parser_result.bytecode.get()); rewrite_simple_compares(blocks); fill_optimization_data(split_basic_blocks(parser_result.bytecode.template get())); } struct StaticallyInterpretedCompares { RedBlackTree ranges; RedBlackTree negated_ranges; HashTable char_classes; HashTable negated_char_classes; bool has_any_unicode_property = false; HashTable unicode_general_categories; HashTable unicode_properties; HashTable unicode_scripts; HashTable unicode_script_extensions; HashTable negated_unicode_general_categories; HashTable negated_unicode_properties; HashTable negated_unicode_scripts; HashTable negated_unicode_script_extensions; }; static bool interpret_compares(Vector const& lhs, StaticallyInterpretedCompares& compares, ByteCodeBase const* bytecode = nullptr, bool as_follow = false) { bool inverse { false }; bool temporary_inverse { false }; bool reset_temporary_inverse { false }; auto current_lhs_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; }; auto& lhs_ranges = compares.ranges; auto& lhs_negated_ranges = compares.negated_ranges; auto& lhs_char_classes = compares.char_classes; auto& lhs_negated_char_classes = compares.negated_char_classes; auto& has_any_unicode_property = compares.has_any_unicode_property; auto& lhs_unicode_general_categories = compares.unicode_general_categories; auto& lhs_unicode_properties = compares.unicode_properties; auto& lhs_unicode_scripts = compares.unicode_scripts; auto& lhs_unicode_script_extensions = compares.unicode_script_extensions; auto& lhs_negated_unicode_general_categories = compares.negated_unicode_general_categories; auto& lhs_negated_unicode_properties = 
compares.negated_unicode_properties; auto& lhs_negated_unicode_scripts = compares.negated_unicode_scripts; auto& lhs_negated_unicode_script_extensions = compares.negated_unicode_script_extensions; for (auto const& pair : lhs) { if (reset_temporary_inverse) { reset_temporary_inverse = false; temporary_inverse = false; } else { reset_temporary_inverse = true; } switch (pair.type) { case CharacterCompareType::Inverse: inverse = !inverse; break; case CharacterCompareType::TemporaryInverse: temporary_inverse = true; reset_temporary_inverse = false; break; case CharacterCompareType::AnyChar: // Special case: if not inverted, AnyChar is always in the range. if (!current_lhs_inversion_state()) return false; break; case CharacterCompareType::Char: if (!current_lhs_inversion_state()) lhs_ranges.insert(pair.value, pair.value); else lhs_negated_ranges.insert(pair.value, pair.value); break; case CharacterCompareType::String: { if (!as_follow) return false; auto string = bytecode->get_u16_string(pair.value); u32 ch = string.code_point_at(0); if (!current_lhs_inversion_state()) lhs_ranges.insert(ch, ch); else lhs_negated_ranges.insert(ch, ch); break; } case CharacterCompareType::StringSet: return false; case CharacterCompareType::CharClass: if (!current_lhs_inversion_state()) lhs_char_classes.set(static_cast(pair.value)); else lhs_negated_char_classes.set(static_cast(pair.value)); break; case CharacterCompareType::CharRange: { auto range = CharRange(pair.value); if (!current_lhs_inversion_state()) lhs_ranges.insert(range.from, range.to); else lhs_negated_ranges.insert(range.from, range.to); break; } case CharacterCompareType::LookupTable: // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it. return false; case CharacterCompareType::Reference: case CharacterCompareType::NamedReference: // We've handled this before coming here. 
break; case CharacterCompareType::Property: has_any_unicode_property = true; if (!current_lhs_inversion_state()) lhs_unicode_properties.set(static_cast(pair.value)); else lhs_negated_unicode_properties.set(static_cast(pair.value)); break; case CharacterCompareType::GeneralCategory: has_any_unicode_property = true; if (!current_lhs_inversion_state()) lhs_unicode_general_categories.set(static_cast(pair.value)); else lhs_negated_unicode_general_categories.set(static_cast(pair.value)); break; case CharacterCompareType::Script: has_any_unicode_property = true; if (!current_lhs_inversion_state()) lhs_unicode_scripts.set(static_cast(pair.value)); else lhs_negated_unicode_scripts.set(static_cast(pair.value)); break; case CharacterCompareType::ScriptExtension: has_any_unicode_property = true; if (!current_lhs_inversion_state()) lhs_unicode_script_extensions.set(static_cast(pair.value)); else lhs_negated_unicode_script_extensions.set(static_cast(pair.value)); break; case CharacterCompareType::Or: case CharacterCompareType::EndAndOr: // These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below). break; case CharacterCompareType::And: case CharacterCompareType::Subtract: // FIXME: These are too difficult to handle, so bail out. return false; case CharacterCompareType::Undefined: case CharacterCompareType::RangeExpressionDummy: // These do not occur in valid bytecode. 
VERIFY_NOT_REACHED(); } } return true; } template void Regex::fill_optimization_data(BasicBlockList const& blocks) { if (blocks.is_empty()) return; if constexpr (REGEX_DEBUG) { dbgln("Pulling out optimization data from bytecode:"); RegexDebug dbg; dbg.print_bytecode(*this); for (auto const& block : blocks) dbgln("block from {} to {} (comment: {})", block.start, block.end, block.comment); } ScopeGuard print = [&] { if constexpr (REGEX_DEBUG) { dbgln("Optimization data:"); if (parser_result.optimization_data.starting_ranges.is_empty()) dbgln("; - no starting ranges"); for (auto const& range : parser_result.optimization_data.starting_ranges) dbgln(" - starting range: {}-{}", range.from, range.to); dbgln("; - only start of line: {}", parser_result.optimization_data.only_start_of_line); } }; auto& bytecode = parser_result.bytecode.get(); auto state = MatchState::only_for_enumeration(); auto block = blocks.first(); for (state.instruction_position = block.start; state.instruction_position < block.end;) { auto& opcode = bytecode.get_opcode(state); switch (opcode.opcode_id()) { case OpCodeId::Compare: { auto& compare = to(opcode); if (compare.arguments_count() == 0) return; // This matches 'nothing', so there are no starting ranges that can satisfy it. auto flat_compares = compare.flat_compares(); StaticallyInterpretedCompares compares; if (!interpret_compares(flat_compares, compares)) return; // No idea, the bytecode is too complex. if (compares.has_any_unicode_property) return; // Faster to just run the bytecode. // FIXME: We should be able to handle these cases (jump ahead while...) 
if (!compares.char_classes.is_empty() || !compares.negated_char_classes.is_empty() || !compares.negated_ranges.is_empty()) return; for (auto it = compares.ranges.begin(); it != compares.ranges.end(); ++it) { parser_result.optimization_data.starting_ranges.append({ it.key(), *it }); for (auto range : Unicode::expand_range_case_insensitive(it.key(), *it)) parser_result.optimization_data.starting_ranges_insensitive.append({ range.from, range.to }); } quick_sort(parser_result.optimization_data.starting_ranges_insensitive, [](CharRange a, CharRange b) { return a.from < b.from; }); return; } case OpCodeId::CompareSimple: { auto& compare = to(opcode); auto flat_compares = compare.flat_compares(); StaticallyInterpretedCompares compares; if (!interpret_compares(flat_compares, compares)) return; // No idea, the bytecode is too complex. if (compares.has_any_unicode_property) return; // Faster to just run the bytecode. // FIXME: We should be able to handle these cases (jump ahead while...) if (!compares.char_classes.is_empty() || !compares.negated_char_classes.is_empty() || !compares.negated_ranges.is_empty()) return; for (auto it = compares.ranges.begin(); it != compares.ranges.end(); ++it) { parser_result.optimization_data.starting_ranges.append({ it.key(), *it }); for (auto range : Unicode::expand_range_case_insensitive(it.key(), *it)) parser_result.optimization_data.starting_ranges_insensitive.append({ range.from, range.to }); } quick_sort(parser_result.optimization_data.starting_ranges_insensitive, [](CharRange a, CharRange b) { return a.from < b.from; }); return; } case OpCodeId::CheckBegin: parser_result.optimization_data.only_start_of_line = true; return; case OpCodeId::Checkpoint: case OpCodeId::Save: case OpCodeId::ClearCaptureGroup: case OpCodeId::SaveLeftCaptureGroup: // These do not 'match' anything, so look through them. 
state.instruction_position += opcode.size(); continue; default: return; } } } template typename Regex::BasicBlockList Regex::split_basic_blocks(ByteCode const& bytecode) { BasicBlockList block_boundaries; HashTable block_starts; auto bytecode_size = bytecode.size(); block_starts.set(0); auto state = MatchState::only_for_enumeration(); state.instruction_position = 0; auto check_jump = [&] class T>(auto const& opcode) { auto& op = static_cast const&>(opcode); ssize_t jump_offset = op.size() + op.offset(); ssize_t target = state.instruction_position + jump_offset; block_starts.set(target); block_starts.set(state.instruction_position + opcode.size()); }; for (;;) { auto& opcode = bytecode.get_opcode(state); switch (opcode.opcode_id()) { case OpCodeId::Jump: check_jump.template operator()(opcode); break; case OpCodeId::JumpNonEmpty: check_jump.template operator()(opcode); break; case OpCodeId::ForkJump: check_jump.template operator()(opcode); break; case OpCodeId::ForkStay: check_jump.template operator()(opcode); break; case OpCodeId::ForkIf: check_jump.template operator()(opcode); break; case OpCodeId::FailForks: block_starts.set(state.instruction_position + opcode.size()); break; case OpCodeId::Repeat: { auto& repeat = to(opcode); auto repeat_start = state.instruction_position - repeat.offset(); block_starts.set(repeat_start); block_starts.set(state.instruction_position + opcode.size()); break; } default: break; } auto next_ip = state.instruction_position + opcode.size(); if (next_ip < bytecode_size) state.instruction_position = next_ip; else break; } Vector sorted_starts; for (auto start : block_starts) sorted_starts.append(start); quick_sort(sorted_starts); for (size_t i = 0; i < sorted_starts.size(); ++i) { size_t start = sorted_starts[i]; size_t end; if (i + 1 < sorted_starts.size()) { size_t next_block_start = sorted_starts[i + 1]; state.instruction_position = start; size_t last_ip = start; while (state.instruction_position < next_block_start) { last_ip = 
state.instruction_position; auto& opcode = bytecode.get_opcode(state); state.instruction_position += opcode.size(); } end = last_ip; } else { state.instruction_position = start; size_t last_ip = start; while (state.instruction_position < bytecode_size) { last_ip = state.instruction_position; auto& opcode = bytecode.get_opcode(state); auto next_ip = state.instruction_position + opcode.size(); if (next_ip >= bytecode_size) break; state.instruction_position = next_ip; } end = last_ip; } block_boundaries.append({ start, end, "Block"sv }); } return block_boundaries; } static bool has_overlap(Vector const& lhs, Vector const& rhs) { // We have to fully interpret the two sequences to determine if they overlap (that is, keep track of inversion state and what ranges they cover). bool inverse { false }; bool temporary_inverse { false }; bool reset_temporary_inverse { false }; auto current_lhs_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; }; StaticallyInterpretedCompares compares; auto& lhs_ranges = compares.ranges; auto& lhs_negated_ranges = compares.negated_ranges; auto& lhs_char_classes = compares.char_classes; auto& lhs_negated_char_classes = compares.negated_char_classes; auto& has_any_unicode_property = compares.has_any_unicode_property; auto& lhs_unicode_general_categories = compares.unicode_general_categories; auto& lhs_unicode_properties = compares.unicode_properties; auto& lhs_unicode_scripts = compares.unicode_scripts; auto& lhs_unicode_script_extensions = compares.unicode_script_extensions; auto& lhs_negated_unicode_general_categories = compares.negated_unicode_general_categories; auto& lhs_negated_unicode_properties = compares.negated_unicode_properties; auto& lhs_negated_unicode_scripts = compares.negated_unicode_scripts; auto& lhs_negated_unicode_script_extensions = compares.negated_unicode_script_extensions; auto any_unicode_property_matches = [&](u32 code_point) { if (any_of(lhs_negated_unicode_general_categories, [code_point](auto 
category) { return Unicode::code_point_has_general_category(code_point, category); })) return false; if (any_of(lhs_negated_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); })) return false; if (any_of(lhs_negated_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); })) return false; if (any_of(lhs_negated_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); })) return false; if (any_of(lhs_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); })) return true; if (any_of(lhs_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); })) return true; if (any_of(lhs_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); })) return true; if (any_of(lhs_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); })) return true; return false; }; auto range_contains = [&](T& value) -> bool { u32 start; u32 end; if constexpr (IsSame) { start = value.from; end = value.to; } else { start = value; end = value; } if (has_any_unicode_property) { // We have some properties, and a range is present // Instead of checking every single code point in the range, assume it's a match. 
return start != end || any_unicode_property_matches(start); } auto* max = lhs_ranges.find_smallest_not_below(start); return max && *max <= end; }; auto char_class_contains = [&](CharClass const& value) -> bool { if (lhs_char_classes.contains(value)) return true; if (lhs_negated_char_classes.contains(value)) return false; if (lhs_ranges.is_empty()) return false; for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it) { auto start = it.key(); auto end = *it; for (u32 ch = start; ch <= end; ++ch) { if (OpCode_Compare::matches_character_class(value, ch, false, false)) return true; } } return false; }; if (!interpret_compares(lhs, compares)) return true; // We can't interpret this, so we can't optimize it. if constexpr (REGEX_DEBUG) { dbgln("lhs ranges:"); for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it) dbgln(" {}..{}", it.key(), *it); dbgln("lhs negated ranges:"); for (auto it = lhs_negated_ranges.begin(); it != lhs_negated_ranges.end(); ++it) dbgln(" {}..{}", it.key(), *it); } temporary_inverse = false; reset_temporary_inverse = false; inverse = false; struct DisjunctionState { bool in_or = false; // We're in an OR block, so we should wait for the EndAndOr to decide if we would match. bool matched_in_or = false; bool inverse_matched_in_or = false; }; Vector disjunction_stack; disjunction_stack.empend(); auto in_or = [&] -> bool& { return disjunction_stack.last().in_or; }; auto matched_in_or = [&] -> bool& { return disjunction_stack.last().matched_in_or; }; auto inverse_matched_in_or = [&] -> bool& { return disjunction_stack.last().inverse_matched_in_or; }; for (auto const& pair : rhs) { if (reset_temporary_inverse) { reset_temporary_inverse = false; temporary_inverse = false; } else { reset_temporary_inverse = true; } if constexpr (REGEX_DEBUG) { dbgln("check {} ({}) [inverted? 
{}] against {{", character_compare_type_name(pair.type), pair.value, current_lhs_inversion_state()); for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it) dbgln(" {}..{}", it.key(), *it); for (auto it = lhs_negated_ranges.begin(); it != lhs_negated_ranges.end(); ++it) dbgln(" ^[{}..{}]", it.key(), *it); for (auto& char_class : lhs_char_classes) dbgln(" {}", character_class_name(char_class)); for (auto& char_class : lhs_negated_char_classes) dbgln(" ^{}", character_class_name(char_class)); dbgln("}}, in or: {}, matched in or: {}, inverse matched in or: {}", in_or(), matched_in_or(), inverse_matched_in_or()); } switch (pair.type) { case CharacterCompareType::Inverse: inverse = !inverse; break; case CharacterCompareType::TemporaryInverse: temporary_inverse = true; reset_temporary_inverse = false; break; case CharacterCompareType::AnyChar: // Special case: if not inverted, AnyChar is always in the range. if (!in_or() && !current_lhs_inversion_state()) return true; if (in_or()) { matched_in_or() = true; inverse_matched_in_or() = false; } break; case CharacterCompareType::Char: { auto matched = range_contains(pair.value); if (!in_or() && (current_lhs_inversion_state() ^ matched)) return true; if (in_or()) { matched_in_or() |= matched; inverse_matched_in_or() |= !matched; } break; } case CharacterCompareType::String: // FIXME: We just need to look at the last character of this string, but we only have the first character here. // Just bail out to avoid false positives. 
return true; case CharacterCompareType::StringSet: return true; case CharacterCompareType::CharClass: { auto contains = char_class_contains(static_cast(pair.value)); if (!in_or() && (current_lhs_inversion_state() ^ contains)) return true; if (in_or()) { matched_in_or() |= contains; inverse_matched_in_or() |= !contains; } break; } case CharacterCompareType::CharRange: { auto range = CharRange(pair.value); auto contains = range_contains(range); if (!in_or() && (contains ^ current_lhs_inversion_state())) return true; if (in_or()) { matched_in_or() |= contains; inverse_matched_in_or() |= !contains; } break; } case CharacterCompareType::LookupTable: // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it. return true; case CharacterCompareType::Reference: case CharacterCompareType::NamedReference: // We've handled this before coming here. break; case CharacterCompareType::Property: // The only reasonable scenario where we can check these properties without spending too much time is if: // - the ranges are empty // - the char classes are empty // - the unicode properties are empty or contain only this property if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty()) return true; if (has_any_unicode_property && !lhs_unicode_properties.is_empty() && !lhs_negated_unicode_properties.is_empty()) { auto contains = lhs_unicode_properties.contains(static_cast(pair.value)); if (!in_or() && (current_lhs_inversion_state() ^ contains)) return true; auto inverse_contains = lhs_negated_unicode_properties.contains(static_cast(pair.value)); if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains)) return true; if (in_or()) { matched_in_or() |= contains; inverse_matched_in_or() |= inverse_contains; } } break; case CharacterCompareType::GeneralCategory: if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || 
!lhs_negated_char_classes.is_empty()) return true; if (has_any_unicode_property && !lhs_unicode_general_categories.is_empty() && !lhs_negated_unicode_general_categories.is_empty()) { auto contains = lhs_unicode_general_categories.contains(static_cast(pair.value)); if (!in_or() && (current_lhs_inversion_state() ^ contains)) return true; auto inverse_contains = lhs_negated_unicode_general_categories.contains(static_cast(pair.value)); if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains)) return true; if (in_or()) { matched_in_or() |= contains; inverse_matched_in_or() |= inverse_contains; } } break; case CharacterCompareType::Script: if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty()) return true; if (has_any_unicode_property && !lhs_unicode_scripts.is_empty() && !lhs_negated_unicode_scripts.is_empty()) { auto contains = lhs_unicode_scripts.contains(static_cast(pair.value)); if (!in_or() && (current_lhs_inversion_state() ^ contains)) return true; auto inverse_contains = lhs_negated_unicode_scripts.contains(static_cast(pair.value)); if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains)) return true; if (in_or()) { matched_in_or() |= contains; inverse_matched_in_or() |= inverse_contains; } } break; case CharacterCompareType::ScriptExtension: if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty()) return true; if (has_any_unicode_property && !lhs_unicode_script_extensions.is_empty() && !lhs_negated_unicode_script_extensions.is_empty()) { auto contains = lhs_unicode_script_extensions.contains(static_cast(pair.value)); if (!in_or() && (current_lhs_inversion_state() ^ contains)) return true; auto inverse_contains = lhs_negated_unicode_script_extensions.contains(static_cast(pair.value)); if (!in_or() && !(current_lhs_inversion_state() ^ inverse_contains)) return true; if (in_or()) { 
matched_in_or() |= contains; inverse_matched_in_or() |= inverse_contains; } } break; case CharacterCompareType::Or: disjunction_stack.empend(true); break; case CharacterCompareType::EndAndOr: { // FIXME: Handle And when we support it below. VERIFY(in_or()); auto state = disjunction_stack.take_last(); if (current_lhs_inversion_state()) { if (!state.inverse_matched_in_or) return true; } else { if (state.matched_in_or) return true; } break; } case CharacterCompareType::And: case CharacterCompareType::Subtract: // FIXME: These are too difficult to handle, so bail out. return true; case CharacterCompareType::Undefined: case CharacterCompareType::RangeExpressionDummy: // These do not occur in valid bytecode. VERIFY_NOT_REACHED(); } } // We got to the end, just double-check that the inverse flag was not left on (which would match everything). return current_lhs_inversion_state(); } static bool has_overlap(StaticallyInterpretedCompares const& lhs, StaticallyInterpretedCompares const& rhs) { if (lhs.has_any_unicode_property || rhs.has_any_unicode_property || !lhs.negated_ranges.is_empty() || !rhs.negated_ranges.is_empty() || !lhs.negated_char_classes.is_empty() || !rhs.negated_char_classes.is_empty()) return true; for (auto it_lhs = lhs.ranges.begin(); it_lhs != lhs.ranges.end(); ++it_lhs) { auto lhs_start = it_lhs.key(); auto lhs_end = *it_lhs; for (auto it_rhs = rhs.ranges.begin(); it_rhs != rhs.ranges.end(); ++it_rhs) { auto rhs_start = it_rhs.key(); auto rhs_end = *it_rhs; // Check if ranges overlap if (lhs_start <= rhs_end && rhs_start <= lhs_end) { return true; } } } for (auto& lhs_class : lhs.char_classes) { for (auto& rhs_class : rhs.char_classes) { if (lhs_class == rhs_class) return true; } } return false; } enum class AtomicRewritePreconditionResult { SatisfiedWithProperHeader, SatisfiedWithEmptyHeader, NotSatisfied, }; static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block repeated_block, Block 
following_block, auto const& all_blocks)
{
    // Collect the compare sets matched by the (possibly jump-linked) loop body.
    Vector> repeated_values;
    auto state = MatchState::only_for_enumeration();
    auto has_seen_actionable_opcode = false;
    for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            has_seen_actionable_opcode = true;
            auto compares = to(opcode).flat_compares();
            if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
                return AtomicRewritePreconditionResult::NotSatisfied;
            repeated_values.append(move(compares));
            break;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            has_seen_actionable_opcode = true;
            if (repeated_values.is_empty())
                return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
            break;
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? for now, let's fail.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::Restore:
        case OpCodeId::GoBack:
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::ForkJump:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::ForkIf:
        case OpCodeId::JumpNonEmpty:
            // We could attempt to recursively resolve the follow set, but pretending that this just goes nowhere is faster.
            if (!has_seen_actionable_opcode)
                return AtomicRewritePreconditionResult::NotSatisfied;
            break;
        case OpCodeId::Jump: {
            // Just follow the jump, it's unconditional.
            auto& jump = to(opcode);
            auto jump_target = state.instruction_position + jump.offset() + jump.size();
            // Find the block that this jump leads to.
            auto next_block_it = find_if(all_blocks.begin(), all_blocks.end(), [jump_target](auto& block) { return block.start == jump_target; });
            if (next_block_it == all_blocks.end())
                return AtomicRewritePreconditionResult::NotSatisfied;
            repeated_block = *next_block_it;
            state.instruction_position = repeated_block.start;
            continue;
        }
        default:
            break;
        }
        state.instruction_position += opcode.size();
    }
    dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());

    // Chase unconditional jumps through empty follow blocks until we land on a block
    // with actual content (or decide the shape is not rewritable).
    auto accept_empty_follow = false;
    while (following_block.start == following_block.end && !accept_empty_follow) {
        dbgln_if(REGEX_DEBUG, "Following empty block {}", following_block.start);
        // If the following block has a single instruction, it must be some kind of jump.
        // Unless it's an unconditional jump, we can't rewrite it - so bail out.
        state.instruction_position = following_block.start;
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Jump: {
            // Just follow the jump, it's unconditional.
            auto& jump = to(opcode);
            auto jump_target = state.instruction_position + jump.offset() + jump.size();
            if (jump_target < state.instruction_position) {
                dbgln_if(REGEX_DEBUG, "Jump to {} is backwards, I'm scared of loops", jump_target);
                return AtomicRewritePreconditionResult::NotSatisfied;
            }
            dbgln_if(REGEX_DEBUG, "Following jump to {}", jump_target);
            // Find the block that this jump leads to.
            auto next_block_it = find_if(all_blocks.begin(), all_blocks.end(), [jump_target](auto& block) { return block.start == jump_target; });
            if (next_block_it == all_blocks.end())
                return AtomicRewritePreconditionResult::NotSatisfied;
            following_block = *next_block_it;
            state.instruction_position = repeated_block.start;
            continue;
        }
        case OpCodeId::ForkJump:
        case OpCodeId::ForkIf:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::JumpNonEmpty:
            return AtomicRewritePreconditionResult::NotSatisfied;
        default:
            // No interesting effect here.
            dbgln_if(REGEX_DEBUG, "Empty follow had instruction {}", opcode.to_byte_string());
            accept_empty_follow = true;
            break;
        }
    }

    bool following_block_has_at_least_one_compare = false;
    // Find the first compare in the following block, it must NOT match any of the values in `repeated_values'.
    auto final_instruction = following_block.start;
    for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
        final_instruction = state.instruction_position;
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            following_block_has_at_least_one_compare = true;
            // We found a compare, let's see what it has.
            auto compares = to(opcode).flat_compares();
            if (compares.is_empty())
                break;

            if (any_of(compares, [&](auto& compare) {
                    return compare.type == CharacterCompareType::AnyChar || compare.type == CharacterCompareType::Reference || compare.type == CharacterCompareType::NamedReference;
                }))
                return AtomicRewritePreconditionResult::NotSatisfied;

            if (any_of(repeated_values, [&](auto& repeated_value) { return has_overlap(compares, repeated_value); }))
                return AtomicRewritePreconditionResult::NotSatisfied;

            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader; // Nothing can match the end!
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? For now, consider them a failure.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::ForkJump:
        case OpCodeId::ForkIf:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::JumpNonEmpty:
            // See note in the previous switch, same cases.
            if (!following_block_has_at_least_one_compare)
                return AtomicRewritePreconditionResult::NotSatisfied;
            break;
        default:
            break;
        }
        state.instruction_position += opcode.size();
    }

    // If the following block falls through, we can't rewrite it.
    state.instruction_position = final_instruction;
    switch (bytecode.get_opcode(state).opcode_id()) {
    case OpCodeId::Jump:
    case OpCodeId::JumpNonEmpty:
    case OpCodeId::ForkJump:
    case OpCodeId::ForkReplaceJump:
    case OpCodeId::ForkIf:
        break;
    default:
        return AtomicRewritePreconditionResult::NotSatisfied;
    }

    if (following_block_has_at_least_one_compare)
        return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
    return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
}

// Rewrites the entire pattern as a plain substring search when the bytecode is a
// single basic block consisting solely of single-character compares.
// NOTE(review): template/class parameter lists appear to have been stripped from this
// file ("template bool Regex::..."); left untouched here.
template bool Regex::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks)
{
    // If there's no jumps, we can probably rewrite this as a substring search (Compare { string = str }).
    if (basic_blocks.size() > 1)
        return false;

    if (basic_blocks.is_empty()) {
        parser_result.optimization_data.pure_substring_search.emplace();
        return true; // Empty regex, sure.
    }

    auto& bytecode = parser_result.bytecode.get();

    // We have a single basic block, let's see if it's a series of character or string compares.
    Vector u16_units;
    auto state = MatchState::only_for_enumeration();
    while (state.instruction_position < bytecode.size()) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            auto& compare = to(opcode);
            if (compare.arguments_count() == 0)
                return false; // This matches 'nothing', so we can't do a substring search.
// (continuing attempt_rewrite_entire_match_as_substring_search)
// Every compare must be a plain 'Char'; collect its UTF-16 code units.
for (auto& flat_compare : compare.flat_compares()) {
    if (flat_compare.type != CharacterCompareType::Char)
        return false;

    (void)AK::UnicodeUtils::code_point_to_utf16(flat_compare.value, [&](auto code_unit) { u16_units.append(code_unit); });
}
break;
}
default:
    return false;
}

state.instruction_position += opcode.size();
}

parser_result.optimization_data.pure_substring_search.emplace(move(u16_units));
return true;
}

// Removes jumps and forks whose offset is 0: they transfer control to the very next
// instruction anyway, so skipping them preserves behaviour.
// NOTE(review): template/class parameter lists appear stripped ("template void Regex::...").
template void Regex::rewrite_with_useless_jumps_removed()
{
    auto& bytecode = parser_result.bytecode.get();

    if constexpr (REGEX_DEBUG) {
        RegexDebug dbg;
        dbg.print_bytecode(*this);
    }

    BytecodeRewriter rewriter(bytecode, pattern_value);

    auto state = MatchState::only_for_enumeration();
    for (auto& instr : rewriter.instructions) {
        state.instruction_position = instr.old_ip;
        auto& op = bytecode.get_opcode(state);
        bool is_useless = false;
        // A zero offset means "jump to the following instruction", i.e. a no-op.
        if (op.opcode_id() == OpCodeId::Jump) {
            is_useless = to(op).offset() == 0;
        } else if (op.opcode_id() == OpCodeId::JumpNonEmpty) {
            is_useless = to(op).offset() == 0;
        } else if (op.opcode_id() == OpCodeId::ForkJump || op.opcode_id() == OpCodeId::ForkReplaceJump) {
            is_useless = to(op).offset() == 0;
        } else if (op.opcode_id() == OpCodeId::ForkStay || op.opcode_id() == OpCodeId::ForkReplaceStay) {
            is_useless = to(op).offset() == 0;
        } else if (op.opcode_id() == OpCodeId::ForkIf) {
            is_useless = to(op).offset() == 0;
        }
        instr.skip = is_useless;
    }

    parser_result.bytecode = rewriter.rebuild(bytecode);
}

// Rewrites eligible loops as atomic groups (ForkX -> ForkReplaceX) when the loop body
// and its follow set provably cannot match the same text.
template void Regex::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode.get();

    if constexpr (REGEX_DEBUG) {
        RegexDebug dbg;
        dbg.print_bytecode(*this);
        for (auto const& block : basic_blocks)
            dbgln("block from {} to {} (comment: {})", block.start, block.end, block.comment);
    }

    // A pattern such as:
    //   bb0 | RE0
    //       | ForkX bb0
    //   -------------------------
    //   bb1 | RE1
    // can be rewritten as:
    //   -------------------------
    //   bb0 | RE0
    //       | ForkReplaceX bb0
    //   -------------------------
//   bb1 | RE1
    // provided that first(RE1) not-in end(RE0), which is to say
    // that RE1 cannot start with whatever RE0 has matched (ever).
    //
    // Alternatively, a second form of this pattern can also occur:
    //   bb0 | *
    //       | ForkX bb2
    //   ------------------------
    //   bb1 | RE0
    //       | Jump bb0
    //   ------------------------
    //   bb2 | RE1
    // which can be transformed (with the same preconditions) to:
    //   bb0 | *
    //       | ForkReplaceX bb2
    //   ------------------------
    //   bb1 | RE0
    //       | Jump bb0
    //   ------------------------
    //   bb2 | RE1
    enum class AlternateForm {
        DirectLoopWithoutHeader,               // loop without proper header, a block forking to itself. i.e. the first form.
        DirectLoopWithoutHeaderAndEmptyFollow, // loop without proper header, a block forking to itself. i.e. the first form but with RE1 being empty.
        DirectLoopWithHeader,                  // loop with proper header, i.e. the second form.
    };
    struct CandidateBlock {
        Block forking_block;
        Optional new_target_block;
        AlternateForm form;
    };
    Vector candidate_blocks;

    auto state = MatchState::only_for_enumeration();

    // Classifies whether the instruction at `ip` is a back-edge to `block_start`
    // compatible with the given loop form.
    auto is_an_eligible_jump = [&state](auto& opcode, size_t ip, size_t block_start, AlternateForm alternate_form) {
        opcode.set_state(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::JumpNonEmpty: {
            auto const& op = to(opcode);
            auto form = op.form();
            if (form != OpCodeId::Jump && alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            if (form != OpCodeId::ForkJump && form != OpCodeId::ForkStay && alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            return op.offset() + ip + opcode.size() == block_start;
        }
        case OpCodeId::ForkJump:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return to(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::ForkStay:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return to(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::Jump:
            // Infinite loop does *not* produce forks.
            if (alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return to(opcode).offset() + ip + opcode.size() == block_start;
            VERIFY_NOT_REACHED();
        default:
            return false;
        }
    };
    for (size_t i = 0; i < basic_blocks.size(); ++i) {
        auto forking_block = basic_blocks[i];
        Optional fork_fallback_block;
        if (i + 1 < basic_blocks.size())
            fork_fallback_block = basic_blocks[i + 1];
        // Check if the last instruction in this block is a jump to the block itself:
        {
            state.instruction_position = forking_block.end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithoutHeader)) {
                // We've found RE0 (and RE1 is just the following block, if any), let's see if the precondition applies.
                // if RE1 is empty, there's no first(RE1), so this is an automatic pass.
                if (!fork_fallback_block.has_value()
                    || (fork_fallback_block->end == fork_fallback_block->start && block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block, basic_blocks) != AtomicRewritePreconditionResult::NotSatisfied)) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }

                auto precondition = block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block, basic_blocks);
                if (precondition == AtomicRewritePreconditionResult::SatisfiedWithProperHeader) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }
                if (precondition == AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeaderAndEmptyFollow });
                    break;
                }
            }
        }
        // Check if the last instruction in the last block is a direct jump to this block
        if (fork_fallback_block.has_value()) {
            state.instruction_position = fork_fallback_block->end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithHeader)) {
                // We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
                state.instruction_position = forking_block.end;
                auto& opcode = bytecode.get_opcode(state);
                if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
                    Optional block_following_fork_fallback;
                    if (i + 2 < basic_blocks.size())
                        block_following_fork_fallback = basic_blocks[i + 2];
                    if (!block_following_fork_fallback.has_value()
                        || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback, basic_blocks) != AtomicRewritePreconditionResult::NotSatisfied) {
                        candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader });
                        break;
                    }
                }
            }
            // We've found a slightly degenerate case, where the next block jumps back to the _jump_ instruction in the forking block.
            // This is a direct loop without a proper header that is posing as a loop with a header.
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.end, AlternateForm::DirectLoopWithHeader)) {
                // We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
                state.instruction_position = forking_block.end;
                auto& opcode = bytecode.get_opcode(state);
                if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
                    Optional block_following_fork_fallback;
                    if (i + 2 < basic_blocks.size())
                        block_following_fork_fallback = basic_blocks[i + 2];
                    if (!block_following_fork_fallback.has_value()
                        || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback, basic_blocks) != AtomicRewritePreconditionResult::NotSatisfied) {
                        candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithoutHeader });
                        break;
                    }
                }
            }
        }
    }
    dbgln_if(REGEX_DEBUG, "Found {} candidate blocks", candidate_blocks.size());

    if constexpr (REGEX_DEBUG) {
        for (auto const& candidate : candidate_blocks) {
            dbgln("Candidate block from {} to {} (comment: {})", candidate.forking_block.start, candidate.forking_block.end, candidate.forking_block.comment);
            if (candidate.new_target_block.has_value())
                dbgln(" with target block from {} to {} (comment: {})", candidate.new_target_block->start, candidate.new_target_block->end, candidate.new_target_block->comment);
            switch (candidate.form) {
            case AlternateForm::DirectLoopWithoutHeader:
                dbgln(" form: DirectLoopWithoutHeader");
                break;
            case AlternateForm::DirectLoopWithoutHeaderAndEmptyFollow:
                dbgln(" form: DirectLoopWithoutHeaderAndEmptyFollow");
                break;
            case AlternateForm::DirectLoopWithHeader:
                dbgln(" form: DirectLoopWithHeader");
                break;
            default:
                dbgln(" form: Unknown");
                break;
            }
        }
    }

    if (candidate_blocks.is_empty()) {
        dbgln_if(REGEX_DEBUG, "Failed to find anything for {}", pattern_value);
        return;
    }

    RedBlackTree needed_patches;

    // Reverse the blocks, so we can patch the bytecode without messing with the latter patches.
    quick_sort(candidate_blocks, [](auto& a, auto& b) { return b.forking_block.start > a.forking_block.start; });
    for (auto& candidate : candidate_blocks) {
        // Note that both forms share a ForkReplace patch in forking_block.
        // Patch the ForkX in forking_block to be a ForkReplaceX instead.
        auto& opcode_id = bytecode[candidate.forking_block.end];
        if (opcode_id == (ByteCodeValueType)OpCodeId::ForkStay) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::ForkJump) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::JumpNonEmpty) {
            // NOTE(review): +3 appears to address the fork-form slot inside JumpNonEmpty
            // (the `form` consulted by is_an_eligible_jump above) -- confirm against the opcode layout.
            auto& jump_opcode_id = bytecode[candidate.forking_block.end + 3];
            if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkStay)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
            else if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkJump)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
            else
                VERIFY_NOT_REACHED();
        } else {
            VERIFY_NOT_REACHED();
        }
    }

    if (!needed_patches.is_empty()) {
        // Walk the bytecode and adjust any stored jump offsets that cross a patched region.
        auto state = MatchState::only_for_enumeration();
        auto bytecode_size = bytecode.size();
        state.instruction_position = 0;
        struct Patch {
            ssize_t value;
            size_t offset;
            bool should_negate { false };
        };
        for (;;) {
            if (state.instruction_position >= bytecode_size)
                break;

            auto& opcode = bytecode.get_opcode(state);
            Stack patch_points;

            switch (opcode.opcode_id()) {
            case OpCodeId::Jump:
                patch_points.push({ to(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::JumpNonEmpty:
                patch_points.push({ to(opcode).offset(), state.instruction_position + 1 });
                patch_points.push({ to(opcode).checkpoint(), state.instruction_position + 2 });
                break;
            case OpCodeId::ForkJump:
                patch_points.push({ to(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::ForkStay:
                patch_points.push({ to(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::ForkIf:
                patch_points.push({ to(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::Repeat:
                patch_points.push({ -(ssize_t)to(opcode).offset(), state.instruction_position + 1, true });
                break;
            default:
                break;
            }

            while (!patch_points.is_empty()) {
auto& patch_point = patch_points.top();
auto target_offset = patch_point.value + state.instruction_position + opcode.size();

// Applies the accumulated patch delta to a stored offset, depending on whether the
// jump crosses the patched position in either direction.
constexpr auto do_patch = [](auto& patch_it, auto& patch_point, auto& target_offset, auto& bytecode, auto ip) {
    if (patch_it.key() == ip)
        return;

    if (patch_point.value < 0 && target_offset <= patch_it.key() && ip > patch_it.key())
        bytecode[patch_point.offset] += (patch_point.should_negate ? 1 : -1) * (*patch_it);
    else if (patch_point.value > 0 && target_offset >= patch_it.key() && ip < patch_it.key())
        bytecode[patch_point.offset] += (patch_point.should_negate ? -1 : 1) * (*patch_it);
};

if (auto patch_it = needed_patches.find_largest_not_above_iterator(target_offset); !patch_it.is_end())
    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
else if (auto patch_it = needed_patches.find_largest_not_above_iterator(state.instruction_position); !patch_it.is_end())
    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);

patch_points.pop();
}

state.instruction_position += opcode.size();
}
}

if constexpr (REGEX_DEBUG) {
    warnln("Transformed to:");
    RegexDebug dbg;
    dbg.print_bytecode(*this);
}
}

// Collapses runs of two or more adjacent single-character Compares into a single
// string-compare instruction.
template void Regex::attempt_rewrite_adjacent_compares_as_string_compare(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode.get();
    if (basic_blocks.is_empty())
        return;

    // Find sequences of single-character compares
    struct StringSequence {
        size_t start_ip;
        size_t end_ip;
        Vector characters;
    };
    Vector sequences;

    for (auto const& block : basic_blocks) {
        auto state = MatchState::only_for_enumeration();
        Vector current_chars;
        size_t sequence_start = 0;
        bool in_sequence = false;

        for (state.instruction_position = block.start; state.instruction_position <= block.end;) {
            auto current_ip = state.instruction_position;
            auto& opcode = bytecode.get_opcode(state);
            bool is_single_char = false;
            u32 character = 0;

            if (opcode.opcode_id() == OpCodeId::Compare) {
                auto& compare = to(opcode);
                auto flat_compares = compare.flat_compares();
                if (flat_compares.size() == 1 && flat_compares[0].type == CharacterCompareType::Char) {
                    is_single_char = true;
                    character = flat_compares[0].value;
                }
            }

            if (is_single_char) {
                if (!in_sequence) {
                    sequence_start = current_ip;
                    current_chars.clear();
                    in_sequence = true;
                }
                current_chars.append(character);
            } else {
                // A non-char instruction ends the current run; keep it only if it had >= 2 chars.
                if (in_sequence && current_chars.size() >= 2) {
                    sequences.append({ sequence_start, current_ip, move(current_chars) });
                    current_chars.clear();
                }
                in_sequence = false;
            }

            state.instruction_position += opcode.size();
        }

        if (in_sequence && current_chars.size() >= 2) {
            sequences.append({ sequence_start, state.instruction_position, move(current_chars) });
        }
    }

    if (sequences.is_empty())
        return;

    BytecodeRewriter rewriter(bytecode, pattern_value);

    // Build one string-compare replacement per collected character run.
    Vector replacements;
    replacements.ensure_capacity(sequences.size());
    for (auto const& seq : sequences) {
        StringBuilder string_builder(StringBuilder::Mode::UTF16);
        for (auto ch : seq.characters)
            string_builder.append_code_point(ch);
        ByteCode replacement;
        replacement.insert_bytecode_compare_string(string_builder.to_utf16_string());
        replacements.append(move(replacement));
    }

    parser_result.bytecode = rewriter.rebuild(bytecode, sequences.span(), replacements.span());
}

// Rewrites /.*X/-style loops into a direct "seek to X" scan.
template void Regex::attempt_rewrite_dot_star_sequences_as_seek(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode.get();
    if (basic_blocks.is_empty()) {
        dbgln_if(REGEX_DEBUG, "No basic blocks, skipping /.*/ rewrite");
        return;
    }

    // If a /.*/ sequence is followed by a compare C (with some non-matching ops {O} in between), we can rewrite:
    //   bbN: {O0} (optional non-matching ops before the pattern)
    //        ForkStay bbM
    //        Checkpoint p
    //        Compare AnyChar
    //        FailIfEmpty (optional, noop for .*)
    //        JumpNonEmpty (back to ForkStay) p
    //   bbM: {O1} (optional non-matching ops)
    //        Compare C
    // as
    //   bbN: {O0}
    //   bbR: RSeekTo C
    //        ForkStay bbR
    //   bbM: {O1}
    //        Compare C
    //
    // Note: bbM is determined by the ForkStay's target,
// not necessarily the next sequential block
    // Note: The pattern may span across multiple basic blocks
    struct DotStarCandidate {
        size_t fork_ip;
        size_t checkpoint_ip;
        size_t compare_ip;
        size_t jump_ip;
        size_t following_block_start;
        u64 checkpoint_id;
        u32 seek_code_point;
    };
    OrderedHashMap candidates;
    auto state = MatchState::only_for_enumeration();
    for (size_t i = 0; i < basic_blocks.size(); ++i) {
        auto const& block = basic_blocks[i];
        state.instruction_position = block.start;
        if (state.instruction_position > block.end)
            continue;

        // Skip non-matching ops at the start of the block
        while (state.instruction_position <= block.end) {
            auto& op = bytecode.get_opcode(state);
            switch (op.opcode_id()) {
            case OpCodeId::Checkpoint:
            case OpCodeId::Save:
            case OpCodeId::SaveLeftCaptureGroup:
            case OpCodeId::SaveRightCaptureGroup:
            case OpCodeId::SaveRightNamedCaptureGroup:
            case OpCodeId::ClearCaptureGroup:
                state.instruction_position += op.size();
                continue;
            default:
                goto found_potential_fork;
            }
        }

        {
            if (state.instruction_position >= bytecode.size())
                continue;
            auto& op_at_boundary = bytecode.get_opcode(state);
            if (op_at_boundary.opcode_id() != OpCodeId::ForkStay)
                continue;
        }

    found_potential_fork:
        // (1) ForkStay bbM
        dbgln_if(REGEX_DEBUG, "Examining block {} from {} to {}", i, block.start, block.end);
        auto& first_op = bytecode.get_opcode(state);
        if (first_op.opcode_id() != OpCodeId::ForkStay) {
            dbgln_if(REGEX_DEBUG, " did not find ForkStay at {}", state.instruction_position);
            continue;
        }
        auto fork_ip = state.instruction_position;
        auto& fork_op = to(first_op);
        // Find the actual following block by the fork target
        auto fork_target = fork_ip + fork_op.size() + fork_op.offset();
        size_t following_block_idx = 0;
        bool found_following_block = false;
        for (size_t j = 0; j < basic_blocks.size(); ++j) {
            if (basic_blocks[j].start == fork_target) {
                if (basic_blocks[j].start <= basic_blocks[j].end) {
                    following_block_idx = j;
                    found_following_block = true;
                    break;
                }
                continue;
            }
        }
        if (!found_following_block) {
            dbgln_if(REGEX_DEBUG, " did not find non-empty following block for fork target {}", fork_target);
            continue;
        }
        auto const& following_block = basic_blocks[following_block_idx];
        dbgln_if(REGEX_DEBUG, " Fork target {} is in block {} (from {} to {})", fork_target, following_block_idx, following_block.start, following_block.end);
        state.instruction_position += first_op.size();

        // (2) Checkpoint p
        auto& second_op = bytecode.get_opcode(state);
        if (second_op.opcode_id() != OpCodeId::Checkpoint) {
            dbgln_if(REGEX_DEBUG, " did not find Checkpoint at {} (found opcode {})", state.instruction_position, (int)second_op.opcode_id());
            continue;
        }
        auto checkpoint_ip = state.instruction_position;
        auto checkpoint_id = to(second_op).id();
        state.instruction_position += second_op.size();

        // (3) Compare AnyChar
        auto& third_op = bytecode.get_opcode(state);
        if (third_op.opcode_id() != OpCodeId::Compare) {
            dbgln_if(REGEX_DEBUG, " did not find Compare at {} (found opcode {})", state.instruction_position, (int)third_op.opcode_id());
            continue;
        }
        auto compare_ip = state.instruction_position;
        auto& compare_op = to(third_op);
        auto flat_compares = compare_op.flat_compares();
        if (flat_compares.size() != 1 || flat_compares[0].type != CharacterCompareType::AnyChar) {
            dbgln_if(REGEX_DEBUG, " Compare at {} is not AnyChar", state.instruction_position);
            continue;
        }
        state.instruction_position += third_op.size();

        // (3.5) Skip FailIfEmpty if present
        {
            auto& maybe_fail_op = bytecode.get_opcode(state);
            if (maybe_fail_op.opcode_id() == OpCodeId::FailIfEmpty)
                state.instruction_position += maybe_fail_op.size();
        }

        // (4) JumpNonEmpty back to ForkStay
        auto& fourth_op = bytecode.get_opcode(state);
        if (fourth_op.opcode_id() != OpCodeId::JumpNonEmpty) {
            dbgln_if(REGEX_DEBUG, " did not find JumpNonEmpty at {} (found opcode {})", state.instruction_position, (int)fourth_op.opcode_id());
            continue;
        }
        auto jump_ip = state.instruction_position;
        auto& jump_op = to(fourth_op);
        if (jump_ip + jump_op.size() + jump_op.offset() != fork_ip) {
            dbgln_if(REGEX_DEBUG, " JumpNonEmpty at {} does not jump back to ForkStay at {} (instead jumps to {})", state.instruction_position, fork_ip, jump_ip + jump_op.size() + jump_op.offset());
            continue;
        }
        if ((size_t)jump_op.checkpoint() != checkpoint_id) {
            dbgln_if(REGEX_DEBUG, " JumpNonEmpty at {} does not reference Checkpoint id {} (instead references {})", state.instruction_position, checkpoint_id, jump_op.checkpoint());
            continue;
        }

        dbgln_if(REGEX_DEBUG, " Found .* pattern from IP {} to {}", fork_ip, jump_ip + jump_op.size());

        // The following block must contain a Compare C, with only non-matching ops in between
        state.instruction_position = following_block.start;
        while (state.instruction_position <= following_block.end) {
            auto& op = bytecode.get_opcode(state);
            switch (op.opcode_id()) {
            case OpCodeId::Checkpoint:
            case OpCodeId::Save:
            case OpCodeId::SaveLeftCaptureGroup:
            case OpCodeId::SaveRightCaptureGroup:
            case OpCodeId::SaveRightNamedCaptureGroup:
            case OpCodeId::ClearCaptureGroup:
                state.instruction_position += op.size();
                continue;
            case OpCodeId::Compare: {
                auto& following_compare_op = to(op);
                auto following_compares = following_compare_op.flat_compares();
                StaticallyInterpretedCompares compares;
                if (!interpret_compares(following_compares, compares, &bytecode, true)) {
                    dbgln_if(REGEX_DEBUG, " could not statically interpret compares at {} in following block", state.instruction_position);
                    goto next_block;
                }
                // Must be able to pull a single code point from this compare (i.e. a single 1-long range, no negations, no char classes, and no unicode properties)
                if (compares.ranges.size() != 1
                    || !compares.negated_ranges.is_empty()
                    || !compares.char_classes.is_empty()
                    || !compares.negated_char_classes.is_empty()
                    || compares.has_any_unicode_property
                    || !compares.unicode_general_categories.is_empty()
                    || !compares.unicode_properties.is_empty()
                    || !compares.unicode_scripts.is_empty()
                    || !compares.unicode_script_extensions.is_empty()
                    || !compares.negated_unicode_general_categories.is_empty()
                    || !compares.negated_unicode_properties.is_empty()
                    || !compares.negated_unicode_scripts.is_empty()
                    || !compares.negated_unicode_script_extensions.is_empty()) {
                    dbgln_if(REGEX_DEBUG, " compares at {} in following block are too complex to rewrite as SeekTo", state.instruction_position);
                    goto next_block;
                }
                auto it = compares.ranges.begin();
                if (it.key() != *it) {
                    // Not a single code point
                    dbgln_if(REGEX_DEBUG, " compares at {} in following block are a range, not a single code point ({}..{})", state.instruction_position, it.key(), *it);
                    goto next_block;
                }
                auto seeked_code_point = it.key();
                candidates.set(fork_ip, { fork_ip, checkpoint_ip, compare_ip, jump_ip, following_block.start, checkpoint_id, seeked_code_point });
                dbgln_if(REGEX_DEBUG, " Found sequence from {} to {} followed by Compare '{}' at {}, can rewrite as SeekTo", fork_ip, jump_ip + 4, (char)seeked_code_point, state.instruction_position);
                goto next_block;
            }
            default:
                dbgln_if(REGEX_DEBUG, " Hit non-matching, non-skippable opcode {} at {} in following block", (int)op.opcode_id(), state.instruction_position);
                goto next_block;
            }
        }
    next_block:
        continue;
    }

    dbgln_if(REGEX_DEBUG, "Found {} dot-star sequences to rewrite as SeekTo", candidates.size());
    if (candidates.is_empty())
        return;

    BytecodeRewriter rewriter(bytecode, pattern_value);

    struct Range {
        size_t start_ip;
        size_t end_ip;
    };
    Vector ranges_to_skip;
    Vector replacements;
    ranges_to_skip.ensure_capacity(candidates.size());
    replacements.ensure_capacity(candidates.size());
    for (auto& [_, candidate] : candidates) {
        ranges_to_skip.empend(candidate.fork_ip, candidate.jump_ip + 4); // JumpNonEmpty = 4
        ByteCode replacement;
        replacement.empend(static_cast(OpCodeId::RSeekTo));
        replacement.empend(candidate.seek_code_point);
        replacement.empend(static_cast(OpCodeId::ForkStay));
        replacement.empend(static_cast(-4)); // Offset back to RSeekTo
        replacements.append(move(replacement));
    }

    parser_result.bytecode = rewriter.rebuild(bytecode, ranges_to_skip.span(), replacements.span());

    if constexpr (REGEX_DEBUG) {
        dbgln("After dot-star rewrite as SeekTo:");
        RegexDebug dbg;
        dbg.print_bytecode(*this);
    }
}

// Replaces single-argument Compare opcodes with the cheaper CompareSimple form.
template void Regex::rewrite_simple_compares(BasicBlockList const& basic_blocks)
{
    // If a Compare opcode only has a single compare and that's a match opcode
    // we can rewrite it as a CompareSimple to avoid the overhead of handling multiple compares:
    //   Compare argc=1 args=S
    //     Char 'a'
    // -->
    //   CompareSimple args=S
    //     Char 'a'
    auto& bytecode = parser_result.bytecode.get();
    if (basic_blocks.is_empty())
        return;

    struct SimpleCompareCandidate {
        size_t compare_ip;
        Vector compare_data;
    };

    Vector candidates;
    auto state = MatchState::only_for_enumeration();
    for (auto const& block : basic_blocks) {
        for (state.instruction_position = block.start; state.instruction_position <= block.end;) {
            auto current_ip = state.instruction_position;
            auto& opcode = bytecode.get_opcode(state);
            if (opcode.opcode_id() == OpCodeId::Compare) {
                auto& compare = to(opcode);
                auto flat_compares = compare.flat_compares();
                if (flat_compares.size() == 1
                    && !first_is_one_of(flat_compares[0].type, CharacterCompareType::And, CharacterCompareType::Or, CharacterCompareType::Inverse, CharacterCompareType::TemporaryInverse, CharacterCompareType::Subtract, CharacterCompareType::Undefined)) {
                    auto slice = bytecode.spans().slice(current_ip + 2, opcode.size() - 2);
                    Vector data;
                    data.ensure_capacity(slice.size());
                    for (auto value : slice)
data.append(value);
                    candidates.append({ current_ip, move(data) }); // +2 to skip opcode id and argc
                }
            }
            state.instruction_position += opcode.size();
        }
    }

    if (candidates.is_empty())
        return;

    dbgln_if(REGEX_DEBUG, "Found {} simple compare candidates to rewrite", candidates.size());

    BytecodeRewriter rewriter(bytecode, pattern_value);

    // Mark each candidate Compare for removal; its replacement is emitted via insert_replacement below.
    for (auto& candidate : candidates) {
        auto& instr = *rewriter.instructions.find_if([&](auto& i) { return i.old_ip == candidate.compare_ip; });
        instr.skip = true;
    }

    size_t candidate_index = 0;
    auto insert_replacement = [&](auto const& instr, ByteCode& result) {
        while (candidate_index < candidates.size()) {
            auto& candidate = candidates[candidate_index];
            if (instr.old_ip == candidate.compare_ip) {
                result.empend(static_cast(OpCodeId::CompareSimple));
                result.extend(move(candidate.compare_data));
                candidate_index++;
                return;
            }
            if (instr.old_ip < candidate.compare_ip)
                return;
            candidate_index++;
        }
    };

    parser_result.bytecode = rewriter.rebuild(bytecode, move(insert_replacement));

    if constexpr (REGEX_DEBUG) {
        dbgln("After simple compare rewrite:");
        RegexDebug dbg;
        dbg.print_bytecode(*this);
    }
}

// Two-alternative convenience overload; forwards to the span-based implementation.
void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
{
    Array alternatives;
    alternatives[0] = move(left);
    alternatives[1] = move(right);
    append_alternation(target, alternatives);
}

template using OrderedHashMapForTrie = OrderedHashMap;

// Emits an alternation over `alternatives` into `target`, either laid out
// sequentially or via a prefix-tree (trie) layout.
// NOTE(review): this function continues past the end of the visible chunk; the
// documentation below covers only what is visible here.
void Optimizer::append_alternation(ByteCode& target, Span alternatives)
{
    // Assume we have N alternatives A0..AN, each with M basic blocks bb0..bbM, each with I instructions 0..I (denoted Ai.bbj[k])
    // We can create the alternation is two ways:
    // - Lay them out sequentially, such that A0 is tried, then A1, then A2, etc.
    // - Generate a prefix tree for A*.bb*[*], and walk the tree at runtime.
    // For the first case, assuming we have two A0.bb0[0..2] and A1.bb0[0..2]:
    //   out.bb0:
    //     ForkStay out.bb1
    //     A0.bb0[*]
    //     Jump out.bb2
    //   out.bb1:
    //     A1.bb0[*]
    //   out.bb2:
    //
    // For the second case, assuming the following alternatives:
    //   A0.bb0:
    //     Compare 'a'
    //     Compare 'b'
    //     Compare 'd'
    //   A1.bb0:
    //     Compare 'a'
    //     Compare 'c'
    //     Compare 'd'
    // We can first generate a prefix tree (trie here), with each node denoted by [insn, insn*]:
    //   (root)
    //   |- [A0.bb0[0], A1.bb0[0]]
    //   |  |- [A0.bb0[1]]
    //   |  |  |- [A0.bb0[2]]
    //   |  |- [A1.bb0[1]]
    //   |  |  |- [A1.bb0[2]]
    // i.e. the first instruction of A0 and A1 are the same, so we can merge them into one node;
    // everything following that is different (A1.bb0[2] is not considered equivalent to A0.bb0[2] as they are jumped-to by different instructions,
    // in this case their previous instruction)
    // Then, each trie node N { insn, children } can be represented as:
    //   out for N:
    //     N.insn[*]
    //     ForkJump out for N.children[0]
    //     ForkJump out for N.children[1]
    //     ...
    // or if there's a single child, we can directly jump to it:
    //   out for N:
    //     if N.children.size() == 1
    //       N.insn[*]
    //       Jump out for N.children[0]
    // For our example, this would yield:
    //   out for root:
    //     Jump out for [A0.bb0[0], A1.bb0[0]]
    //   out for [A0.bb0[0], A1.bb0[0]]:
    //     Compare 'a'
    //     ForkJump out for A0.bb0[1]
    //     ForkJump out for A1.bb0[1]
    //   out for A0.bb0[1]:
    //     Compare 'b'
    //     Jump out for A0.bb0[2]
    //   out for A1.bb0[1]:
    //     Compare 'c'
    //     Jump out for A1.bb0[2]
    //   out for A0.bb0[2]:
    //     Compare 'd'
    //   out for A1.bb0[2]:
    //     Compare 'd'

    if (alternatives.size() == 0)
        return;

    if (alternatives.size() == 1)
        return target.extend(move(alternatives[0]));

    target.merge_string_tables_from(alternatives);

    if (all_of(alternatives, [](auto& x) { return x.is_empty(); }))
        return;

    for (auto& entry : alternatives)
        entry.flatten();

#if REGEX_DEBUG
    ScopeLogger log;
    warnln("Alternations:");
    RegexDebug dbg;
    for (auto& entry : alternatives) {
        warnln("----------");
        dbg.print_bytecode(entry);
    }
    ScopeGuard print_at_end {
        [&] {
            warnln("======================");
            RegexDebug dbg;
            dbg.print_bytecode(target);
        }
    };
#endif

    // First, find incoming jump edges.
    // We need them for two reasons:
    // - We need to distinguish between insn-A-jumped-to-by-insn-B and insn-A-jumped-to-by-insn-C (as otherwise we'd break trie invariants)
    // - We need to know which jumps to patch when we're done
    struct JumpEdge {
        Span jump_insn;
    };
    Vector>> incoming_jump_edges_for_each_alternative;
    incoming_jump_edges_for_each_alternative.resize(alternatives.size());
    auto has_any_backwards_jump = false;

    auto state = MatchState::only_for_enumeration();

    for (size_t i = 0; i < alternatives.size(); ++i) {
        auto& alternative = alternatives[i];

        // Add a jump to the "end" of the block; this is implicit in the bytecode, but we need it to be explicit in the trie.
        // Jump{offset=0}
        alternative.append(static_cast(OpCodeId::Jump));
        alternative.append(0);

        auto& incoming_jump_edges = incoming_jump_edges_for_each_alternative[i];

        auto alternative_bytes = alternative.spans<1>().singular_span();
        for (state.instruction_position = 0; state.instruction_position < alternative.size();) {
            auto& opcode = alternative.get_opcode(state);
            auto opcode_bytes = alternative_bytes.slice(state.instruction_position, opcode.size());

            switch (opcode.opcode_id()) {
            case OpCodeId::Jump: {
                auto const& cast_opcode = to(opcode);
                incoming_jump_edges.ensure(cast_opcode.offset() + cast_opcode.size() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= cast_opcode.offset() < 0;
                break;
            }
            case OpCodeId::JumpNonEmpty: {
                auto const& cast_opcode = to(opcode);
                incoming_jump_edges.ensure(cast_opcode.offset() + cast_opcode.size() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= cast_opcode.offset() < 0;
                break;
            }
            case OpCodeId::ForkJump: {
                auto const& cast_opcode = to(opcode);
                incoming_jump_edges.ensure(cast_opcode.offset() + cast_opcode.size() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= cast_opcode.offset() < 0;
                break;
            }
            case OpCodeId::ForkStay: {
                auto const& cast_opcode = to(opcode);
                incoming_jump_edges.ensure(cast_opcode.offset() + cast_opcode.size() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= cast_opcode.offset() < 0;
                break;
            }
            case OpCodeId::ForkReplaceJump: {
                auto const& cast_opcode = to(opcode);
                incoming_jump_edges.ensure(cast_opcode.offset() + cast_opcode.size() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= cast_opcode.offset() < 0;
                break;
            }
            case OpCodeId::ForkReplaceStay: {
                auto const& cast_opcode = to(opcode);
                incoming_jump_edges.ensure(cast_opcode.offset() + cast_opcode.size() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= cast_opcode.offset() < 0;
                break;
} case OpCodeId::ForkIf: { auto const& cast_opcode = to(opcode); incoming_jump_edges.ensure(cast_opcode.offset() + cast_opcode.size() + state.instruction_position).append({ opcode_bytes }); has_any_backwards_jump |= cast_opcode.offset() < 0; break; } case OpCodeId::Repeat: { auto const& cast_opcode = to(opcode); incoming_jump_edges.ensure(state.instruction_position - cast_opcode.offset()).append({ opcode_bytes }); has_any_backwards_jump = true; break; } default: break; } state.instruction_position += opcode.size(); } } struct QualifiedIP { size_t alternative_index; size_t instruction_position; bool operator==(QualifiedIP const& other) const = default; }; struct NodeMetadataEntry { QualifiedIP ip; NonnullOwnPtr first_compare_from_here; }; using Tree = Trie, Vector, Traits>, void, OrderedHashMapForTrie>; Tree trie { {} }; // Root node is empty, key{ instruction_bytes, dependent_instruction_bytes... } -> IP size_t common_hits = 0; size_t total_nodes = 0; size_t total_bytecode_entries_in_tree = 0; for (size_t i = 0; i < alternatives.size(); ++i) { auto& alternative = alternatives[i]; auto& incoming_jump_edges = incoming_jump_edges_for_each_alternative[i]; auto* active_node = ≜ auto alternative_span = alternative.spans<1>().singular_span(); for (state.instruction_position = 0; state.instruction_position < alternative_span.size();) { total_nodes += 1; auto& opcode = alternative.get_opcode(state); auto opcode_bytes = alternative_span.slice(state.instruction_position, opcode.size()); Vector> node_key_bytes; node_key_bytes.append(opcode_bytes); if (auto edges = incoming_jump_edges.get(state.instruction_position); edges.has_value()) { for (auto& edge : *edges) node_key_bytes.append(edge.jump_insn); } active_node = static_cast(MUST(active_node->ensure_child(DisjointSpans { move(node_key_bytes) }, [] -> Vector { return {}; }))); auto next_compare = [&alternative, &state](StaticallyInterpretedCompares& compares) { TemporaryChange state_change { state.instruction_position, 
state.instruction_position }; auto* opcode = &alternative.get_opcode(state); auto opcode_id = opcode->opcode_id(); while (opcode_id == OpCodeId::Checkpoint || opcode_id == OpCodeId::SaveLeftCaptureGroup || opcode_id == OpCodeId::SaveRightCaptureGroup || opcode_id == OpCodeId::SaveRightNamedCaptureGroup || opcode_id == OpCodeId::Save) { state.instruction_position += opcode->size(); opcode = &alternative.get_opcode(state); opcode_id = opcode->opcode_id(); } // We found something functional, if it's a compare, we need to care. if (opcode_id != OpCodeId::Compare) return; auto flat_compares = to(*opcode).flat_compares(); interpret_compares(flat_compares, compares); }; auto node_metadata = NodeMetadataEntry { { i, state.instruction_position }, make() }; auto& metadata = active_node->metadata_value(); if (metadata.is_empty()) total_bytecode_entries_in_tree += opcode.size(); else common_hits++; metadata.append(move(node_metadata)); next_compare(*active_node->metadata_value().last().first_compare_from_here); state.instruction_position += opcode.size(); } } if constexpr (REGEX_DEBUG) { Function print_tree = [&](decltype(trie)& node, size_t indent = 0) mutable { ByteString name = "(no ip)"; ByteString insn; if (node.has_metadata()) { name = ByteString::formatted( "{}@{} ({} node{})", node.metadata_value().first().ip.instruction_position, node.metadata_value().first().ip.alternative_index, node.metadata_value().size(), node.metadata_value().size() == 1 ? 
"" : "s"); auto state = MatchState::only_for_enumeration(); state.instruction_position = node.metadata_value().first().ip.instruction_position; auto& opcode = alternatives[node.metadata_value().first().ip.alternative_index].get_opcode(state); insn = ByteString::formatted("{} {}", opcode.to_byte_string(), opcode.arguments_string()); } dbgln("{:->{}}| {} -- {}", "", indent * 2, name, insn); for (auto& child : node.children()) print_tree(static_cast(*child.value), indent + 1); }; print_tree(trie, 0); } // This is really only worth it if we don't blow up the size by the 2-extra-instruction-per-node scheme, similarly, if no nodes are shared, we're better off not using a tree. auto tree_cost = (total_nodes - common_hits) * 2; auto chain_cost = total_bytecode_entries_in_tree + alternatives.size() * 2; dbgln_if(REGEX_DEBUG, "Total nodes: {}, common hits: {} (tree cost = {}, chain cost = {})", total_nodes, common_hits, tree_cost, chain_cost); // Make sure we're not breaking the order requirements (a should be tried before b in a|b) Queue nodes_to_visit; nodes_to_visit.enqueue(&trie); while (!nodes_to_visit.is_empty()) { auto& node = *nodes_to_visit.dequeue(); auto& children = node.children(); for (auto& entry : children) nodes_to_visit.enqueue(entry.value.ptr()); // If the children are not sorted right, we've got a problem. if (children.size() <= 1) continue; size_t max_index = 0; NodeMetadataEntry const* child_with_max_index = nullptr; for (auto& entry : children) { auto& child = *entry.value; if (child.has_metadata()) { for (auto& child_entry : child.metadata_value()) { if (max_index > child_entry.ip.alternative_index) { // We have a problem, an alternative later in the list is being tried before an earlier one. // we can't use this trie...unless the first compare in this child is not the same as the one in the entry with max-index // then there's no overlap and the order doesn't matter anyhow. 
if (!has_overlap(*child_with_max_index->first_compare_from_here, *child_entry.first_compare_from_here)) { // We can use this trie after all. continue; } tree_cost = NumericLimits::max(); goto exit_useless_loop; } max_index = child_entry.ip.alternative_index; child_with_max_index = &child_entry; } } } continue; exit_useless_loop: break; } if (common_hits == 0 || tree_cost > chain_cost) { dbgln_if(REGEX_DEBUG, "Choosing sequential alternation layout over trie-based layout"); // It's better to lay these out as a normal sequence of instructions. // We can avoid trying alternatives that we know cannot match in certain cases: // - If the alternative starts with an assertion, we can lift the assertion to the fork op itself (currently only ^). Vector fork_conditions; fork_conditions.resize_with_default_value(alternatives.size(), ForkIfCondition::Invalid); for (size_t i = 0; i < alternatives.size(); ++i) { auto& alternative = alternatives[i]; auto state = MatchState::only_for_enumeration(); state.instruction_position = 0; auto& first_opcode = alternative.get_opcode(state); if (first_opcode.opcode_id() == OpCodeId::CheckBegin) fork_conditions[i] = ForkIfCondition::AtStartOfLine; } Vector jump_op_positions; Vector jump_sizes; jump_op_positions.resize(alternatives.size() - 1); jump_sizes.resize(alternatives.size() - 1); for (size_t i = 1; i < alternatives.size(); ++i) { jump_op_positions[i - 1] = target.size(); if (fork_conditions[i - 1] != ForkIfCondition::Invalid) { jump_sizes[i - 1] = 4; target.empend(static_cast(OpCodeId::ForkIf)); target.empend(0u); // To be filled later. target.empend(static_cast(OpCodeId::ForkJump)); target.empend(static_cast(fork_conditions[i - 1])); } else { jump_sizes[i - 1] = 2; target.empend(static_cast(OpCodeId::ForkJump)); target.empend(0u); // To be filled later. 
} } bool seen_one_empty = false; Vector jump_to_end_patch_positions; jump_to_end_patch_positions.resize_with_default_value(alternatives.size(), NumericLimits::max()); for (size_t i = alternatives.size(); i > 0; --i) { auto& chunk = alternatives[i - 1]; if (chunk.is_empty()) { if (seen_one_empty) continue; seen_one_empty = true; } if (i < alternatives.size()) { auto this_block_start = target.size(); auto position = jump_op_positions[i - 1]; auto jump_size = jump_sizes[i - 1]; target[position + 1] = static_cast(this_block_start - position - jump_size); } target.extend(move(chunk)); target.empend(static_cast(OpCodeId::Jump)); target.empend(0u); // Jump to the _END label jump_to_end_patch_positions[i - 1] = target.size() - 1; } auto end_position = target.size(); for (size_t i = 0; i < alternatives.size(); ++i) { if (auto& position = jump_to_end_patch_positions[i]; position != NumericLimits::max()) target[position] = static_cast(end_position - (position + 1)); } } else { dbgln_if(REGEX_DEBUG, "Choosing trie-based alternation layout"); target.ensure_capacity(total_bytecode_entries_in_tree + common_hits * 6); auto node_is = [](Tree const* node, QualifiedIP ip) { return node->metadata_value().span().first_matching([&](auto& entry) { return entry.ip == ip; }).has_value(); }; struct Patch { QualifiedIP source_ip; size_t target_ip; size_t size_delta { 0 }; bool done { false }; }; Vector patch_locations; patch_locations.ensure_capacity(total_nodes); HashMap>> instruction_positions; if (has_any_backwards_jump) MUST(instruction_positions.try_ensure_capacity(alternatives.size())); auto ip_mapping_for_alternative = [&](size_t i) -> RedBlackTree& { return *instruction_positions.ensure(i, [] { return make>(); }); }; auto add_patch_point = [&](Tree const* node, size_t target_ip) { if (!node->has_metadata()) return; patch_locations.append({ node->metadata_value().first().ip, target_ip }); }; Vector nodes_to_visit; nodes_to_visit.append(&trie); // each node: // node.re // forkjump 
child1 // forkjump child2 // ... while (!nodes_to_visit.is_empty()) { auto const* node = nodes_to_visit.take_last(); for (auto& patch : patch_locations) { if (!patch.done && node_is(node, patch.source_ip)) { auto value = static_cast(target.size() - patch.target_ip - 1 - patch.size_delta); if (value == 0) target[patch.target_ip - 1] = static_cast(OpCodeId::Jump); target[patch.target_ip] = value; patch.done = true; } } if (!node->value().individual_spans().is_empty()) { auto insn_bytes = node->value().individual_spans().first(); target.ensure_capacity(target.size() + insn_bytes.size()); state.instruction_position = target.size(); target.append(insn_bytes); if (has_any_backwards_jump) { for (auto& entry : node->metadata_value()) ip_mapping_for_alternative(entry.ip.alternative_index).insert(entry.ip.instruction_position, state.instruction_position); } auto& opcode = target.get_opcode(state); ssize_t jump_offset; auto is_jump = true; auto patch_location = state.instruction_position + 1; bool should_negate = false; size_t size_delta = opcode.size() - 2; switch (opcode.opcode_id()) { case OpCodeId::Jump: jump_offset = to(opcode).offset(); break; case OpCodeId::JumpNonEmpty: jump_offset = to(opcode).offset(); break; case OpCodeId::ForkJump: jump_offset = to(opcode).offset(); break; case OpCodeId::ForkStay: jump_offset = to(opcode).offset(); break; case OpCodeId::ForkReplaceJump: jump_offset = to(opcode).offset(); break; case OpCodeId::ForkReplaceStay: jump_offset = to(opcode).offset(); break; case OpCodeId::ForkIf: jump_offset = to(opcode).offset(); break; case OpCodeId::Repeat: jump_offset = static_cast(0) - static_cast(to(opcode).offset()) - static_cast(opcode.size()); should_negate = true; break; default: is_jump = false; break; } if (is_jump) { VERIFY(node->has_metadata()); if (node->metadata_value().size() > 1) target[patch_location] = static_cast(0); // Fall through instead. 
auto only_one = node->metadata_value().size() == 1; auto patch_size = opcode.size() - 1; for (auto& entry : node->metadata_value()) { auto& [alternative_index, instruction_position] = entry.ip; if (!only_one) { target.append(static_cast(OpCodeId::ForkJump)); patch_location = target.size(); should_negate = false; patch_size = 1; target.append(static_cast(0)); } auto intended_jump_ip = instruction_position + jump_offset + opcode.size(); if (jump_offset < 0) { VERIFY(has_any_backwards_jump); // We should've already seen this instruction, so we can just patch it in. auto& ip_mapping = ip_mapping_for_alternative(alternative_index); auto target_ip = ip_mapping.find(intended_jump_ip); if (!target_ip) { RegexDebug dbg; size_t x = 0; for (auto& entry : alternatives) { warnln("----------- {} ----------", x++); dbg.print_bytecode(entry); } dbgln("Regex Tree / Unknown backwards jump: {}@{} -> {}", instruction_position, alternative_index, intended_jump_ip); VERIFY_NOT_REACHED(); } ssize_t target_value = *target_ip - patch_location - patch_size; if (should_negate) target_value = -target_value - opcode.size(); target[patch_location] = static_cast(target_value); } else { patch_locations.append({ QualifiedIP { alternative_index, intended_jump_ip }, patch_location, size_delta }); } } } } for (auto const& child : node->children()) { auto* child_node = static_cast(child.value.ptr()); target.append(static_cast(OpCodeId::ForkJump)); add_patch_point(child_node, target.size()); target.append(static_cast(0)); nodes_to_visit.append(child_node); } } for (auto& patch : patch_locations) { if (patch.done) continue; auto& alternative = alternatives[patch.source_ip.alternative_index]; if (patch.source_ip.instruction_position >= alternative.size()) { // This just wants to jump to the end of the alternative, which is fine. // Patch it to jump to the end of the target instead. 
target[patch.target_ip] = static_cast(target.size() - patch.target_ip - 1); continue; } dbgln("Regex Tree / Unpatched jump: {}@{} -> {}@{}", patch.source_ip.instruction_position, patch.source_ip.alternative_index, patch.target_ip, target[patch.target_ip]); VERIFY_NOT_REACHED(); } } } enum class LookupTableInsertionOutcome { Successful, ReplaceWithAnyChar, TemporaryInversionNeeded, PermanentInversionNeeded, FlushOnInsertion, FinishFlushOnInsertion, CannotPlaceInTable, }; static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree& table, CompareTypeAndValuePair pair) { switch (pair.type) { case CharacterCompareType::Inverse: return LookupTableInsertionOutcome::PermanentInversionNeeded; case CharacterCompareType::TemporaryInverse: return LookupTableInsertionOutcome::TemporaryInversionNeeded; case CharacterCompareType::AnyChar: return LookupTableInsertionOutcome::ReplaceWithAnyChar; case CharacterCompareType::CharClass: return LookupTableInsertionOutcome::CannotPlaceInTable; case CharacterCompareType::Char: table.insert(pair.value, { (u32)pair.value, (u32)pair.value }); break; case CharacterCompareType::CharRange: { CharRange range { pair.value }; table.insert(range.from, range); break; } case CharacterCompareType::EndAndOr: return LookupTableInsertionOutcome::FinishFlushOnInsertion; case CharacterCompareType::And: case CharacterCompareType::Subtract: return LookupTableInsertionOutcome::FlushOnInsertion; case CharacterCompareType::Reference: case CharacterCompareType::NamedReference: case CharacterCompareType::Property: case CharacterCompareType::GeneralCategory: case CharacterCompareType::Script: case CharacterCompareType::ScriptExtension: case CharacterCompareType::StringSet: case CharacterCompareType::Or: return LookupTableInsertionOutcome::CannotPlaceInTable; case CharacterCompareType::Undefined: case CharacterCompareType::RangeExpressionDummy: case CharacterCompareType::String: case CharacterCompareType::LookupTable: VERIFY_NOT_REACHED(); } return 
LookupTableInsertionOutcome::Successful; } void Optimizer::append_character_class(ByteCode& target, Vector&& pairs) { ByteCode arguments; size_t argument_count = 0; if (pairs.size() <= 1) { for (auto& pair : pairs) { arguments.append(to_underlying(pair.type)); if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse && pair.type != CharacterCompareType::And && pair.type != CharacterCompareType::Or && pair.type != CharacterCompareType::Subtract && pair.type != CharacterCompareType::EndAndOr) arguments.append(pair.value); ++argument_count; } } else { RedBlackTree table; RedBlackTree inverted_table; auto* current_table = &table; auto* current_inverted_table = &inverted_table; bool invert_for_next_iteration = false; bool is_currently_inverted = false; auto flush_tables = [&] { auto merge_overlapping_ranges = [](auto& source) { Optional active_range; Vector result; for (auto& range : source) { if (!active_range.has_value()) { active_range = CharRange(range); continue; } CharRange char_range(range); if (char_range.from <= active_range->to + 1 && char_range.to + 1 >= active_range->from) { active_range = CharRange { min(char_range.from, active_range->from), max(char_range.to, active_range->to) }; } else { result.append(active_range.release_value()); active_range = char_range; } } if (active_range.has_value()) result.append(active_range.release_value()); return result; }; auto append_table = [&](auto& table) { ++argument_count; arguments.append(to_underlying(CharacterCompareType::LookupTable)); auto sensitive_size_index = arguments.size(); auto insensitive_size_index = sensitive_size_index + 1; arguments.append(0); arguments.append(0); auto range_data = merge_overlapping_ranges(table); arguments.extend(range_data); arguments[sensitive_size_index] = range_data.size(); Vector insensitive_data; for (CharRange range : range_data) { for (auto expanded : 
Unicode::expand_range_case_insensitive(range.from, range.to)) insensitive_data.append(CharRange { expanded.from, expanded.to }); } quick_sort(insensitive_data, [](CharRange a, CharRange b) { return a.from < b.from; }); auto merged_data = merge_overlapping_ranges(insensitive_data); arguments.extend(merged_data); arguments[insensitive_size_index] = merged_data.size(); }; auto contains_regular_table = !table.is_empty(); auto contains_inverted_table = !inverted_table.is_empty(); if (contains_regular_table) append_table(table); if (contains_inverted_table) { ++argument_count; arguments.append(to_underlying(CharacterCompareType::TemporaryInverse)); append_table(inverted_table); } table.clear(); inverted_table.clear(); }; auto flush_on_every_insertion = false; for (auto& value : pairs) { auto should_invert_after_this_iteration = invert_for_next_iteration; invert_for_next_iteration = false; auto insertion_result = insert_into_lookup_table(*current_table, value); switch (insertion_result) { case LookupTableInsertionOutcome::Successful: if (flush_on_every_insertion) flush_tables(); break; case LookupTableInsertionOutcome::ReplaceWithAnyChar: { table.clear(); inverted_table.clear(); arguments.append(to_underlying(CharacterCompareType::AnyChar)); ++argument_count; break; } case LookupTableInsertionOutcome::TemporaryInversionNeeded: swap(current_table, current_inverted_table); invert_for_next_iteration = true; is_currently_inverted = !is_currently_inverted; break; case LookupTableInsertionOutcome::PermanentInversionNeeded: flush_tables(); arguments.append(to_underlying(CharacterCompareType::Inverse)); ++argument_count; break; case LookupTableInsertionOutcome::FlushOnInsertion: case LookupTableInsertionOutcome::FinishFlushOnInsertion: flush_tables(); flush_on_every_insertion = insertion_result == LookupTableInsertionOutcome::FlushOnInsertion; [[fallthrough]]; case LookupTableInsertionOutcome::CannotPlaceInTable: if (is_currently_inverted) { 
arguments.append(to_underlying(CharacterCompareType::TemporaryInverse)); ++argument_count; } arguments.append(to_underlying(value.type)); if (value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse && value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::And && value.type != CharacterCompareType::Or && value.type != CharacterCompareType::Subtract && value.type != CharacterCompareType::EndAndOr) arguments.append(value.value); ++argument_count; break; } if (should_invert_after_this_iteration) { swap(current_table, current_inverted_table); is_currently_inverted = !is_currently_inverted; } } flush_tables(); } target.empend(static_cast(OpCodeId::Compare)); target.empend(argument_count); // number of arguments target.empend(arguments.size()); // size of arguments target.extend(move(arguments)); } template void Regex::run_optimization_passes(); template void Regex::run_optimization_passes(); template void Regex::run_optimization_passes(); }