2021-09-12 17:30:27 +04:30
/*
 * Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
2023-01-02 21:07:18 +01:00
# include <AK/Debug.h>
2026-01-05 16:14:03 +01:00
# include <AK/Enumerate.h>
2023-07-28 21:02:34 +03:30
# include <AK/Function.h>
2025-12-22 14:00:19 +01:00
# include <AK/GenericShorthands.h>
2025-05-17 21:50:37 +02:00
# include <AK/Queue.h>
2021-09-12 17:30:27 +04:30
# include <AK/QuickSort.h>
# include <AK/RedBlackTree.h>
# include <AK/Stack.h>
2025-05-17 21:50:37 +02:00
# include <AK/TemporaryChange.h>
2023-07-28 21:02:34 +03:30
# include <AK/Trie.h>
2025-02-24 08:41:04 +01:00
# include <AK/Vector.h>
2021-09-12 17:30:27 +04:30
# include <LibRegex/Regex.h>
# include <LibRegex/RegexBytecodeStreamOptimizer.h>
2026-02-21 17:04:24 +01:00
# include <LibRegex/RegexDebug.h>
2023-07-14 08:13:59 +03:30
# include <LibUnicode/CharacterTypes.h>
2022-02-20 03:25:21 +03:30
#if REGEX_DEBUG
#    include <AK/ScopeGuard.h>
#    include <AK/ScopeLogger.h>
#endif
2021-09-12 17:30:27 +04:30
namespace regex {
using Detail : : Block ;
2025-12-16 00:27:18 +01:00
struct BytecodeRewriter {
struct Instruction {
size_t old_ip ;
size_t size ;
OpCodeId id ;
bool skip ;
} ;
Vector < Instruction > instructions ;
HashMap < size_t , size_t > new_ip_mapping ;
2026-01-06 18:10:48 +01:00
StringView target_pattern ;
2025-12-16 00:27:18 +01:00
2026-01-06 18:10:48 +01:00
BytecodeRewriter ( ByteCode const & bytecode , StringView pattern = { } )
: target_pattern ( pattern )
2025-12-16 00:27:18 +01:00
{
auto flat = bytecode . flat_data ( ) ;
auto state = MatchState : : only_for_enumeration ( ) ;
for ( size_t old_ip = 0 ; old_ip < flat . size ( ) ; ) {
state . instruction_position = old_ip ;
auto & op = bytecode . get_opcode ( state ) ;
auto sz = op . size ( ) ;
instructions . append ( { old_ip , sz , op . opcode_id ( ) , false } ) ;
old_ip + = sz ;
}
}
void mark_range_for_skip ( size_t start , size_t end )
{
for ( auto & instr : instructions ) {
if ( instr . old_ip > = start & & instr . old_ip < end )
instr . skip = true ;
}
}
void build_ip_mapping ( ByteCode const & bytecode , Span < ByteCode const > replacements )
{
new_ip_mapping . ensure_capacity ( instructions . size ( ) + 1 ) ;
size_t current_new_ip = 0 ;
auto replacements_it = replacements . begin ( ) ;
for ( auto & instr : instructions ) {
new_ip_mapping . set ( instr . old_ip , current_new_ip ) ;
auto & replacement = * replacements_it ;
+ + replacements_it ;
if ( ! instr . skip )
current_new_ip + = instr . size ;
else
current_new_ip + = replacement . size ( ) ;
}
new_ip_mapping . set ( bytecode . size ( ) , current_new_ip ) ;
}
2025-12-25 15:02:57 +01:00
template < typename Range >
requires ( requires ( Range r ) { r . start_ip ; r . end_ip ; } )
void build_ip_mapping ( ByteCode const & bytecode , Span < Range const > replacement_ranges , Span < ByteCode const > replacements )
{
new_ip_mapping . ensure_capacity ( instructions . size ( ) + 1 ) ;
size_t current_new_ip = 0 ;
auto instruction_it = instructions . begin ( ) ;
for ( auto i = 0u z ; i < replacements . size ( ) ; + + i ) {
auto & range = replacement_ranges [ i ] ;
auto & replacement = replacements [ i ] ;
while ( instruction_it ! = instructions . end ( ) ) {
auto & instr = * instruction_it ;
if ( instr . old_ip > = range . start_ip ) {
ASSERT ( instr . old_ip < range . end_ip ) ;
new_ip_mapping . set ( instr . old_ip , current_new_ip ) ;
break ;
}
new_ip_mapping . set ( instr . old_ip , current_new_ip ) ;
current_new_ip + = instr . size ;
+ + instruction_it ;
}
current_new_ip + = replacement . size ( ) ;
// Skip instructions in the replacement range
while ( instruction_it ! = instructions . end ( ) ) {
auto & instr = * instruction_it ;
if ( instr . old_ip > = range . end_ip )
break ;
+ + instruction_it ;
}
}
// Map any remaining instructions
for ( ; instruction_it ! = instructions . end ( ) ; + + instruction_it ) {
auto & instr = * instruction_it ;
new_ip_mapping . set ( instr . old_ip , current_new_ip ) ;
current_new_ip + = instr . size ;
}
new_ip_mapping . set ( bytecode . size ( ) , current_new_ip ) ;
}
template < typename Range >
requires ( requires ( Range r ) { r . start_ip ; r . end_ip ; } )
ByteCode rebuild ( ByteCode const & bytecode , Span < Range > replacement_ranges , Span < ByteCode const > replacements )
{
// Assumes that replacement_ranges and replacements are the same size
// As well as in order
VERIFY ( replacement_ranges . size ( ) = = replacements . size ( ) ) ;
auto flat = bytecode . flat_data ( ) ;
ByteCode result ;
result . merge_string_tables_from ( { & bytecode , 1 } ) ;
size_t total_new_size = bytecode . size ( ) ;
// FIXME: Get a zip(it...) helper
for ( auto i = 0u z ; i < replacement_ranges . size ( ) ; + + i ) {
mark_range_for_skip ( replacement_ranges [ i ] . start_ip , replacement_ranges [ i ] . end_ip ) ;
total_new_size - = ( replacement_ranges [ i ] . end_ip - replacement_ranges [ i ] . start_ip ) ;
total_new_size + = replacements [ i ] . size ( ) ;
}
build_ip_mapping < Range > ( bytecode , replacement_ranges , replacements ) ;
result . ensure_capacity ( total_new_size ) ;
// FIXME: Use a zip(...) helper
auto instructions_it = instructions . begin ( ) ;
for ( auto i = 0u z ; i < replacement_ranges . size ( ) ; + + i ) {
auto & range = replacement_ranges [ i ] ;
auto & replacement = replacements [ i ] ;
// Append and adjust all instructions before the replacement range
for ( ; instructions_it ! = instructions . end ( ) ; + + instructions_it ) {
auto & instr = * instructions_it ;
if ( instr . old_ip > = range . start_ip ) {
ASSERT ( instr . old_ip < range . end_ip ) ;
+ + instructions_it ;
break ;
}
VERIFY ( instr . skip = = false ) ;
auto slice = Vector < ByteCodeValueType > { flat . slice ( instr . old_ip , instr . size ) } ;
adjust_jump_in_slice ( bytecode , slice , instr ) ;
result . append ( move ( slice ) ) ;
}
// Finally insert the replacement
result . extend ( replacement ) ;
// Skip instructions in the replacement range
while ( instructions_it ! = instructions . end ( ) ) {
auto & instr = * instructions_it ;
if ( instr . old_ip > = range . end_ip )
break ;
+ + instructions_it ;
}
}
// Append any remaining instructions
for ( ; instructions_it ! = instructions . end ( ) ; + + instructions_it ) {
auto & instr = * instructions_it ;
auto slice = Vector < ByteCodeValueType > { flat . slice ( instr . old_ip , instr . size ) } ;
adjust_jump_in_slice ( bytecode , slice , instr ) ;
result . append ( move ( slice ) ) ;
VERIFY ( instr . skip = = false ) ;
}
result . flatten ( ) ;
return result ;
}
2025-12-16 00:27:18 +01:00
ByteCode rebuild ( ByteCode const & bytecode , Function < void ( Instruction const & , ByteCode & ) > insert_replacement = nullptr )
{
auto flat = bytecode . flat_data ( ) ;
ByteCode result ;
result . merge_string_tables_from ( { & bytecode , 1 } ) ;
Vector < ByteCode > replacements ;
replacements . resize_with_default_value ( instructions . size ( ) , ByteCode { } ) ;
size_t total_new_size = 0 ;
for ( auto const & [ i , instr ] : enumerate ( instructions ) ) {
if ( ! instr . skip ) {
total_new_size + = instr . size ;
} else if ( insert_replacement ) {
ByteCode temp ;
insert_replacement ( instr , temp ) ;
total_new_size + = temp . size ( ) ;
replacements [ i ] = move ( temp ) ;
}
}
build_ip_mapping ( bytecode , replacements ) ;
result . ensure_capacity ( total_new_size ) ;
auto replacements_it = replacements . begin ( ) ;
for ( auto & instr : instructions ) {
auto & replacement = * replacements_it ;
+ + replacements_it ;
if ( instr . skip ) {
result . extend ( move ( replacement ) ) ;
continue ;
}
auto slice = Vector < ByteCodeValueType > { flat . slice ( instr . old_ip , instr . size ) } ;
adjust_jump_in_slice ( bytecode , slice , instr ) ;
result . append ( move ( slice ) ) ;
}
result . flatten ( ) ;
return result ;
}
private :
void adjust_jump_in_slice ( ByteCode const & bytecode , Vector < ByteCodeValueType > & slice , Instruction const & instr )
{
auto adjust = [ & ] ( size_t idx , bool is_repeat ) {
auto old_offset = slice [ idx ] ;
auto target_old = is_repeat
? instr . old_ip - old_offset
: instr . old_ip + instr . size + old_offset ;
if ( ! new_ip_mapping . contains ( target_old ) ) {
2026-01-06 18:10:48 +01:00
dbgln ( " In pattern /{}/ " , target_pattern ) ;
2025-12-16 00:27:18 +01:00
dbgln ( " Target {} not found in new_ip mapping (in {}) " , target_old , instr . old_ip ) ;
RegexDebug < ByteCode > dbg ( stderr ) ;
dbg . print_bytecode ( bytecode ) ;
VERIFY_NOT_REACHED ( ) ;
}
size_t target_new = * new_ip_mapping . get ( target_old ) ;
size_t source_new = * new_ip_mapping . get ( instr . old_ip ) ;
auto new_offset = is_repeat
? source_new - target_new
: target_new - source_new - instr . size ;
slice [ idx ] = static_cast < ByteCodeValueType > ( new_offset ) ;
} ;
switch ( instr . id ) {
case OpCodeId : : Jump :
case OpCodeId : : ForkJump :
case OpCodeId : : ForkStay :
case OpCodeId : : ForkReplaceJump :
case OpCodeId : : ForkReplaceStay :
case OpCodeId : : JumpNonEmpty :
case OpCodeId : : ForkIf :
adjust ( 1 , false ) ;
break ;
case OpCodeId : : Repeat :
adjust ( 1 , true ) ;
break ;
default :
break ;
}
}
} ;
2026-01-05 16:14:03 +01:00
// Computes basic-block boundaries tailored for the atomic-group rewrite.
// Unlike split_basic_blocks(), a backwards jump that lands *inside* the
// current block splits that block in two, so loop bodies become their own
// blocks. Boundaries are {start, end-ip-of-boundary-instruction, comment}.
template<typename Parser>
static typename Regex<Parser>::BasicBlockList split_basic_blocks_for_atomic_groups(ByteCode const& bytecode)
{
    typename Regex<Parser>::BasicBlockList block_boundaries;
    size_t end_of_last_block = 0;

    auto bytecode_size = bytecode.size();

    auto state = MatchState::only_for_enumeration();
    state.instruction_position = 0;
    // Every jump-like op terminates the current block; backwards jumps may
    // additionally split the block they jump back into.
    auto check_jump = [&]<template<typename> class T>(auto const& opcode) {
        auto& op = static_cast<T<ByteCode> const&>(opcode);
        ssize_t jump_offset = op.size() + op.offset();
        if (jump_offset >= 0) {
            block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump ahead"sv });
            end_of_last_block = state.instruction_position + opcode.size();
        } else {
            // This op jumps back, see if that's within this "block".
            if (jump_offset + state.instruction_position > end_of_last_block) {
                // Split the block!
                block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position, "Jump back 1"sv });
                block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position, "Jump back 2"sv });
                end_of_last_block = state.instruction_position + opcode.size();
            } else {
                // Nope, it's just a jump to another block
                block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump"sv });
                end_of_last_block = state.instruction_position + opcode.size();
            }
        }
    };
    for (;;) {
        auto& opcode = bytecode.get_opcode(state);

        switch (opcode.opcode_id()) {
        case OpCodeId::Jump:
            check_jump.template operator()<OpCode_Jump>(opcode);
            break;
        case OpCodeId::JumpNonEmpty:
            check_jump.template operator()<OpCode_JumpNonEmpty>(opcode);
            break;
        case OpCodeId::ForkJump:
            check_jump.template operator()<OpCode_ForkJump>(opcode);
            break;
        case OpCodeId::ForkStay:
            check_jump.template operator()<OpCode_ForkStay>(opcode);
            break;
        case OpCodeId::ForkIf:
            check_jump.template operator()<OpCode_ForkIf>(opcode);
            break;
        case OpCodeId::FailForks:
            block_boundaries.append({ end_of_last_block, state.instruction_position, "FailForks"sv });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        case OpCodeId::Repeat: {
            // Repeat produces two blocks, one containing its repeated expr, and one after that.
            auto& repeat = static_cast<OpCode_Repeat<ByteCode> const&>(opcode);
            auto repeat_start = state.instruction_position - repeat.offset();
            if (repeat_start > end_of_last_block)
                block_boundaries.append({ end_of_last_block, repeat_start, "Repeat"sv });
            block_boundaries.append({ repeat_start, state.instruction_position, "Repeat after"sv });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        }
        default:
            break;
        }

        auto next_ip = state.instruction_position + opcode.size();
        if (next_ip < bytecode_size)
            state.instruction_position = next_ip;
        else
            break;
    }

    if (end_of_last_block < bytecode_size)
        block_boundaries.append({ end_of_last_block, bytecode_size, "End"sv });

    quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; });

    return block_boundaries;
}
2021-09-12 17:30:27 +04:30
// Runs every bytecode optimization pass in sequence. Each pass may rewrite
// the bytecode, so basic blocks are re-split before each subsequent pass.
template<typename Parser>
void Regex<Parser>::run_optimization_passes()
{
    // All passes operate on the tree-ish ByteCode representation; once they
    // are done, convert the result to FlatByteCode for execution.
    ScopeGuard switch_to_flat = [&] {
        parser_result.bytecode = FlatByteCode::from(move(parser_result.bytecode.template get<ByteCode>()));
    };

    rewrite_with_useless_jumps_removed();

    auto blocks = split_basic_blocks(parser_result.bytecode.get<ByteCode>());
    // If the whole pattern is a literal, no other pass is needed.
    if (attempt_rewrite_entire_match_as_substring_search(blocks)) {
        return;
    }

    // Rewrite fork loops as atomic groups
    // e.g. a*b -> (ATOMIC a*)b
    blocks = split_basic_blocks_for_atomic_groups<Parser>(parser_result.bytecode.get<ByteCode>());
    attempt_rewrite_loops_as_atomic_groups(blocks);

    // Join adjacent compares that only match single characters into a single compare that matches a string.
    blocks = split_basic_blocks(parser_result.bytecode.get<ByteCode>());
    attempt_rewrite_adjacent_compares_as_string_compare(blocks);

    // Rewrite /.*x/ as a seek to x
    blocks = split_basic_blocks(parser_result.bytecode.get<ByteCode>());
    attempt_rewrite_dot_star_sequences_as_seek(blocks);

    // Simplify compares where possible
    blocks = split_basic_blocks(parser_result.bytecode.get<ByteCode>());
    rewrite_simple_compares(blocks);

    // Finally, extract data (starting ranges etc.) the matcher can use to
    // bail out early without running the bytecode.
    fill_optimization_data(split_basic_blocks(parser_result.bytecode.template get<ByteCode>()));
}
2025-04-15 21:32:26 +02:00
// The statically-known result of interpreting a compare sequence: which code
// point ranges / character classes / Unicode properties it accepts, and which
// it explicitly rejects (the 'negated_*' members).
struct StaticallyInterpretedCompares {
    RedBlackTree<u32, u32> ranges;         // Inclusive code point ranges (from -> to) that match.
    RedBlackTree<u32, u32> negated_ranges; // Ranges that must *not* match.
    HashTable<CharClass> char_classes;
    HashTable<CharClass> negated_char_classes;

    // Set whenever any of the Unicode property/category/script tables below is populated.
    bool has_any_unicode_property = false;
    HashTable<Unicode::GeneralCategory> unicode_general_categories;
    HashTable<Unicode::Property> unicode_properties;
    HashTable<Unicode::Script> unicode_scripts;
    HashTable<Unicode::Script> unicode_script_extensions;
    HashTable<Unicode::GeneralCategory> negated_unicode_general_categories;
    HashTable<Unicode::Property> negated_unicode_properties;
    HashTable<Unicode::Script> negated_unicode_scripts;
    HashTable<Unicode::Script> negated_unicode_script_extensions;
};
2025-12-22 13:05:47 +01:00
// Statically interprets a flattened compare sequence into 'compares', keeping
// track of the (possibly temporary) inversion state as it goes.
// Returns false if the sequence uses constructs that cannot be interpreted
// statically, in which case 'compares' may be partially filled.
// 'bytecode' is only consulted for string-table lookups when 'as_follow' is
// set; NOTE(review): it is assumed non-null in that case — confirm at call sites.
static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, StaticallyInterpretedCompares& compares, ByteCodeBase const* bytecode = nullptr, bool as_follow = false)
{
    bool inverse { false };
    bool temporary_inverse { false };
    bool reset_temporary_inverse { false };
    // TemporaryInverse only applies to the single compare that follows it.
    auto current_lhs_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };

    auto& lhs_ranges = compares.ranges;
    auto& lhs_negated_ranges = compares.negated_ranges;
    auto& lhs_char_classes = compares.char_classes;
    auto& lhs_negated_char_classes = compares.negated_char_classes;

    auto& has_any_unicode_property = compares.has_any_unicode_property;
    auto& lhs_unicode_general_categories = compares.unicode_general_categories;
    auto& lhs_unicode_properties = compares.unicode_properties;
    auto& lhs_unicode_scripts = compares.unicode_scripts;
    auto& lhs_unicode_script_extensions = compares.unicode_script_extensions;
    auto& lhs_negated_unicode_general_categories = compares.negated_unicode_general_categories;
    auto& lhs_negated_unicode_properties = compares.negated_unicode_properties;
    auto& lhs_negated_unicode_scripts = compares.negated_unicode_scripts;
    auto& lhs_negated_unicode_script_extensions = compares.negated_unicode_script_extensions;

    for (auto const& pair : lhs) {
        // A TemporaryInverse set by the previous pair expires after this one.
        if (reset_temporary_inverse) {
            reset_temporary_inverse = false;
            temporary_inverse = false;
        } else {
            reset_temporary_inverse = true;
        }

        switch (pair.type) {
        case CharacterCompareType::Inverse:
            inverse = !inverse;
            break;
        case CharacterCompareType::TemporaryInverse:
            temporary_inverse = true;
            reset_temporary_inverse = false;
            break;
        case CharacterCompareType::AnyChar:
            // Special case: if not inverted, AnyChar is always in the range.
            if (!current_lhs_inversion_state())
                return false;
            break;
        case CharacterCompareType::Char:
            if (!current_lhs_inversion_state())
                lhs_ranges.insert(pair.value, pair.value);
            else
                lhs_negated_ranges.insert(pair.value, pair.value);
            break;
        case CharacterCompareType::String: {
            if (!as_follow)
                return false;
            // Only the first code point matters for follow-set purposes.
            auto string = bytecode->get_u16_string(pair.value);
            u32 ch = string.code_point_at(0);
            if (!current_lhs_inversion_state())
                lhs_ranges.insert(ch, ch);
            else
                lhs_negated_ranges.insert(ch, ch);
            break;
        }
        case CharacterCompareType::StringSet:
            return false;
        case CharacterCompareType::CharClass:
            if (!current_lhs_inversion_state())
                lhs_char_classes.set(static_cast<CharClass>(pair.value));
            else
                lhs_negated_char_classes.set(static_cast<CharClass>(pair.value));
            break;
        case CharacterCompareType::CharRange: {
            auto range = CharRange(pair.value);
            if (!current_lhs_inversion_state())
                lhs_ranges.insert(range.from, range.to);
            else
                lhs_negated_ranges.insert(range.from, range.to);
            break;
        }
        case CharacterCompareType::LookupTable:
            // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
            return false;
        case CharacterCompareType::Reference:
        case CharacterCompareType::NamedReference:
            // We've handled this before coming here.
            break;
        case CharacterCompareType::Property:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
            else
                lhs_negated_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
            break;
        case CharacterCompareType::GeneralCategory:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
            else
                lhs_negated_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
            break;
        case CharacterCompareType::Script:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
            else
                lhs_negated_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
            break;
        case CharacterCompareType::ScriptExtension:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
            else
                lhs_negated_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
            break;
        case CharacterCompareType::Or:
        case CharacterCompareType::EndAndOr:
            // These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below).
            break;
        case CharacterCompareType::And:
        case CharacterCompareType::Subtract:
            // FIXME: These are too difficult to handle, so bail out.
            return false;
        case CharacterCompareType::Undefined:
        case CharacterCompareType::RangeExpressionDummy:
            // These do not occur in valid bytecode.
            VERIFY_NOT_REACHED();
        }
    }

    return true;
}
// Inspects the first basic block of the compiled pattern and extracts data
// the matcher can use to reject inputs cheaply: the set of code point ranges
// a match could possibly start with, and whether the pattern is anchored to
// the start of a line.
template<class Parser>
void Regex<Parser>::fill_optimization_data(BasicBlockList const& blocks)
{
    if (blocks.is_empty())
        return;

    if constexpr (REGEX_DEBUG) {
        dbgln("Pulling out optimization data from bytecode:");
        RegexDebug<ByteCode> dbg;
        dbg.print_bytecode(*this);
        for (auto const& block : blocks)
            dbgln("block from {} to {} (comment: {})", block.start, block.end, block.comment);
    }

    ScopeGuard print = [&] {
        if constexpr (REGEX_DEBUG) {
            dbgln("Optimization data:");
            if (parser_result.optimization_data.starting_ranges.is_empty())
                dbgln("; - no starting ranges");
            for (auto const& range : parser_result.optimization_data.starting_ranges)
                dbgln("- starting range: {}-{}", range.from, range.to);
            dbgln("; - only start of line: {}", parser_result.optimization_data.only_start_of_line);
        }
    };

    auto& bytecode = parser_result.bytecode.get<ByteCode>();

    // Shared between the Compare and CompareSimple cases below: statically
    // interpret the compare's pairs and record the ranges a match could start
    // with (both exact and case-insensitively expanded).
    auto extract_starting_ranges = [&](auto const& flat_compares) {
        StaticallyInterpretedCompares compares;
        if (!interpret_compares(flat_compares, compares))
            return; // No idea, the bytecode is too complex.

        if (compares.has_any_unicode_property)
            return; // Faster to just run the bytecode.

        // FIXME: We should be able to handle these cases (jump ahead while...)
        if (!compares.char_classes.is_empty() || !compares.negated_char_classes.is_empty() || !compares.negated_ranges.is_empty())
            return;

        for (auto it = compares.ranges.begin(); it != compares.ranges.end(); ++it) {
            parser_result.optimization_data.starting_ranges.append({ it.key(), *it });
            for (auto range : Unicode::expand_range_case_insensitive(it.key(), *it))
                parser_result.optimization_data.starting_ranges_insensitive.append({ range.from, range.to });
        }
        quick_sort(parser_result.optimization_data.starting_ranges_insensitive, [](CharRange a, CharRange b) { return a.from < b.from; });
    };

    auto state = MatchState::only_for_enumeration();
    auto block = blocks.first();
    for (state.instruction_position = block.start; state.instruction_position < block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            auto& compare = to<OpCode_Compare>(opcode);
            if (compare.arguments_count() == 0)
                return; // This matches 'nothing', so there are no starting ranges that can satisfy it.

            extract_starting_ranges(compare.flat_compares());
            return;
        }
        case OpCodeId::CompareSimple: {
            auto& compare = to<OpCode_CompareSimple>(opcode);
            extract_starting_ranges(compare.flat_compares());
            return;
        }
        case OpCodeId::CheckBegin:
            parser_result.optimization_data.only_start_of_line = true;
            return;
        case OpCodeId::Checkpoint:
        case OpCodeId::Save:
        case OpCodeId::ClearCaptureGroup:
        case OpCodeId::SaveLeftCaptureGroup:
            // These do not 'match' anything, so look through them.
            state.instruction_position += opcode.size();
            continue;
        default:
            return;
        }
    }
}
2021-09-12 17:30:27 +04:30
template < typename Parser >
2021-12-15 08:52:07 +03:30
typename Regex < Parser > : : BasicBlockList Regex < Parser > : : split_basic_blocks ( ByteCode const & bytecode )
2021-09-12 17:30:27 +04:30
{
BasicBlockList block_boundaries ;
2026-01-05 16:14:03 +01:00
HashTable < size_t > block_starts ;
2021-09-12 17:30:27 +04:30
2021-12-21 18:08:15 +03:30
auto bytecode_size = bytecode . size ( ) ;
2026-01-05 16:14:03 +01:00
block_starts . set ( 0 ) ;
2025-04-15 15:31:08 +02:00
auto state = MatchState : : only_for_enumeration ( ) ;
2021-09-12 17:30:27 +04:30
state . instruction_position = 0 ;
2025-12-16 00:27:18 +01:00
auto check_jump = [ & ] < template < typename > class T > ( auto const & opcode ) {
auto & op = static_cast < T < ByteCode > const & > ( opcode ) ;
2021-09-12 17:30:27 +04:30
ssize_t jump_offset = op . size ( ) + op . offset ( ) ;
2026-01-05 16:14:03 +01:00
ssize_t target = state . instruction_position + jump_offset ;
block_starts . set ( target ) ;
block_starts . set ( state . instruction_position + opcode . size ( ) ) ;
2021-09-12 17:30:27 +04:30
} ;
2026-01-05 16:14:03 +01:00
2021-09-12 17:30:27 +04:30
for ( ; ; ) {
auto & opcode = bytecode . get_opcode ( state ) ;
switch ( opcode . opcode_id ( ) ) {
case OpCodeId : : Jump :
check_jump . template operator ( ) < OpCode_Jump > ( opcode ) ;
break ;
case OpCodeId : : JumpNonEmpty :
check_jump . template operator ( ) < OpCode_JumpNonEmpty > ( opcode ) ;
break ;
case OpCodeId : : ForkJump :
check_jump . template operator ( ) < OpCode_ForkJump > ( opcode ) ;
break ;
case OpCodeId : : ForkStay :
check_jump . template operator ( ) < OpCode_ForkStay > ( opcode ) ;
break ;
2025-12-31 07:50:52 +01:00
case OpCodeId : : ForkIf :
check_jump . template operator ( ) < OpCode_ForkIf > ( opcode ) ;
break ;
2021-09-12 17:30:27 +04:30
case OpCodeId : : FailForks :
2026-01-05 16:14:03 +01:00
block_starts . set ( state . instruction_position + opcode . size ( ) ) ;
2021-09-12 17:30:27 +04:30
break ;
case OpCodeId : : Repeat : {
2025-12-16 00:27:18 +01:00
auto & repeat = to < OpCode_Repeat > ( opcode ) ;
2024-12-23 12:16:08 +01:00
auto repeat_start = state . instruction_position - repeat . offset ( ) ;
2026-01-05 16:14:03 +01:00
block_starts . set ( repeat_start ) ;
block_starts . set ( state . instruction_position + opcode . size ( ) ) ;
2021-09-12 17:30:27 +04:30
break ;
}
default :
break ;
}
auto next_ip = state . instruction_position + opcode . size ( ) ;
2021-12-21 18:08:15 +03:30
if ( next_ip < bytecode_size )
2021-09-12 17:30:27 +04:30
state . instruction_position = next_ip ;
else
break ;
}
2026-01-05 16:14:03 +01:00
Vector < size_t > sorted_starts ;
for ( auto start : block_starts )
sorted_starts . append ( start ) ;
quick_sort ( sorted_starts ) ;
2021-09-12 17:30:27 +04:30
2026-01-05 16:14:03 +01:00
for ( size_t i = 0 ; i < sorted_starts . size ( ) ; + + i ) {
size_t start = sorted_starts [ i ] ;
size_t end ;
if ( i + 1 < sorted_starts . size ( ) ) {
size_t next_block_start = sorted_starts [ i + 1 ] ;
state . instruction_position = start ;
size_t last_ip = start ;
while ( state . instruction_position < next_block_start ) {
last_ip = state . instruction_position ;
auto & opcode = bytecode . get_opcode ( state ) ;
state . instruction_position + = opcode . size ( ) ;
}
end = last_ip ;
} else {
state . instruction_position = start ;
size_t last_ip = start ;
while ( state . instruction_position < bytecode_size ) {
last_ip = state . instruction_position ;
auto & opcode = bytecode . get_opcode ( state ) ;
auto next_ip = state . instruction_position + opcode . size ( ) ;
if ( next_ip > = bytecode_size )
break ;
state . instruction_position = next_ip ;
}
end = last_ip ;
}
block_boundaries . append ( { start , end , " Block " sv } ) ;
}
2021-09-12 17:30:27 +04:30
return block_boundaries ;
}
2022-07-04 21:11:15 +04:30
static bool has_overlap ( Vector < CompareTypeAndValuePair > const & lhs , Vector < CompareTypeAndValuePair > const & rhs )
{
// We have to fully interpret the two sequences to determine if they overlap (that is, keep track of inversion state and what ranges they cover).
bool inverse { false } ;
bool temporary_inverse { false } ;
bool reset_temporary_inverse { false } ;
auto current_lhs_inversion_state = [ & ] ( ) - > bool { return temporary_inverse ^ inverse ; } ;
2025-04-15 21:32:26 +02:00
StaticallyInterpretedCompares compares ;
2025-05-17 21:50:37 +02:00
auto & lhs_ranges = compares . ranges ;
auto & lhs_negated_ranges = compares . negated_ranges ;
auto & lhs_char_classes = compares . char_classes ;
auto & lhs_negated_char_classes = compares . negated_char_classes ;
2025-04-15 21:32:26 +02:00
auto & has_any_unicode_property = compares . has_any_unicode_property ;
2025-05-17 21:50:37 +02:00
auto & lhs_unicode_general_categories = compares . unicode_general_categories ;
auto & lhs_unicode_properties = compares . unicode_properties ;
auto & lhs_unicode_scripts = compares . unicode_scripts ;
auto & lhs_unicode_script_extensions = compares . unicode_script_extensions ;
auto & lhs_negated_unicode_general_categories = compares . negated_unicode_general_categories ;
auto & lhs_negated_unicode_properties = compares . negated_unicode_properties ;
auto & lhs_negated_unicode_scripts = compares . negated_unicode_scripts ;
auto & lhs_negated_unicode_script_extensions = compares . negated_unicode_script_extensions ;
2023-07-14 08:13:59 +03:30
auto any_unicode_property_matches = [ & ] ( u32 code_point ) {
if ( any_of ( lhs_negated_unicode_general_categories , [ code_point ] ( auto category ) { return Unicode : : code_point_has_general_category ( code_point , category ) ; } ) )
return false ;
if ( any_of ( lhs_negated_unicode_properties , [ code_point ] ( auto property ) { return Unicode : : code_point_has_property ( code_point , property ) ; } ) )
return false ;
if ( any_of ( lhs_negated_unicode_scripts , [ code_point ] ( auto script ) { return Unicode : : code_point_has_script ( code_point , script ) ; } ) )
return false ;
if ( any_of ( lhs_negated_unicode_script_extensions , [ code_point ] ( auto script ) { return Unicode : : code_point_has_script_extension ( code_point , script ) ; } ) )
return false ;
if ( any_of ( lhs_unicode_general_categories , [ code_point ] ( auto category ) { return Unicode : : code_point_has_general_category ( code_point , category ) ; } ) )
return true ;
if ( any_of ( lhs_unicode_properties , [ code_point ] ( auto property ) { return Unicode : : code_point_has_property ( code_point , property ) ; } ) )
return true ;
if ( any_of ( lhs_unicode_scripts , [ code_point ] ( auto script ) { return Unicode : : code_point_has_script ( code_point , script ) ; } ) )
return true ;
if ( any_of ( lhs_unicode_script_extensions , [ code_point ] ( auto script ) { return Unicode : : code_point_has_script_extension ( code_point , script ) ; } ) )
return true ;
return false ;
} ;
2022-07-04 21:11:15 +04:30
auto range_contains = [ & ] < typename T > ( T & value ) - > bool {
u32 start ;
u32 end ;
if constexpr ( IsSame < T , CharRange > ) {
start = value . from ;
end = value . to ;
} else {
start = value ;
end = value ;
}
2023-07-14 08:13:59 +03:30
if ( has_any_unicode_property ) {
// We have some properties, and a range is present
// Instead of checking every single code point in the range, assume it's a match.
return start ! = end | | any_unicode_property_matches ( start ) ;
}
2022-07-04 21:11:15 +04:30
auto * max = lhs_ranges . find_smallest_not_below ( start ) ;
return max & & * max < = end ;
} ;
auto char_class_contains = [ & ] ( CharClass const & value ) - > bool {
if ( lhs_char_classes . contains ( value ) )
return true ;
if ( lhs_negated_char_classes . contains ( value ) )
return false ;
2024-10-24 11:39:26 +02:00
if ( lhs_ranges . is_empty ( ) )
return false ;
for ( auto it = lhs_ranges . begin ( ) ; it ! = lhs_ranges . end ( ) ; + + it ) {
auto start = it . key ( ) ;
auto end = * it ;
for ( u32 ch = start ; ch < = end ; + + ch ) {
2026-02-15 15:08:52 +01:00
if ( OpCode_Compare < ByteCode > : : matches_character_class ( value , ch , false , false ) )
2024-10-24 11:39:26 +02:00
return true ;
}
}
return false ;
2022-07-04 21:11:15 +04:30
} ;
2025-04-15 21:32:26 +02:00
if ( ! interpret_compares ( lhs , compares ) )
return true ; // We can't interpret this, so we can't optimize it.
2022-07-04 21:11:15 +04:30
if constexpr ( REGEX_DEBUG ) {
dbgln ( " lhs ranges: " ) ;
for ( auto it = lhs_ranges . begin ( ) ; it ! = lhs_ranges . end ( ) ; + + it )
dbgln ( " {}..{} " , it . key ( ) , * it ) ;
dbgln ( " lhs negated ranges: " ) ;
for ( auto it = lhs_negated_ranges . begin ( ) ; it ! = lhs_negated_ranges . end ( ) ; + + it )
dbgln ( " {}..{} " , it . key ( ) , * it ) ;
}
2024-10-24 11:39:26 +02:00
temporary_inverse = false ;
reset_temporary_inverse = false ;
inverse = false ;
2025-11-01 16:47:57 +01:00
struct DisjunctionState {
bool in_or = false ; // We're in an OR block, so we should wait for the EndAndOr to decide if we would match.
bool matched_in_or = false ;
bool inverse_matched_in_or = false ;
} ;
Vector < DisjunctionState , 2 > disjunction_stack ;
disjunction_stack . empend ( ) ;
auto in_or = [ & ] - > bool & { return disjunction_stack . last ( ) . in_or ; } ;
auto matched_in_or = [ & ] - > bool & { return disjunction_stack . last ( ) . matched_in_or ; } ;
auto inverse_matched_in_or = [ & ] - > bool & { return disjunction_stack . last ( ) . inverse_matched_in_or ; } ;
2024-10-24 11:39:26 +02:00
2022-07-04 21:11:15 +04:30
for ( auto const & pair : rhs ) {
if ( reset_temporary_inverse ) {
reset_temporary_inverse = false ;
temporary_inverse = false ;
} else {
reset_temporary_inverse = true ;
}
2024-10-24 21:59:15 +02:00
if constexpr ( REGEX_DEBUG ) {
dbgln ( " check {} ({}) [inverted? {}] against {{ " , character_compare_type_name ( pair . type ) , pair . value , current_lhs_inversion_state ( ) ) ;
for ( auto it = lhs_ranges . begin ( ) ; it ! = lhs_ranges . end ( ) ; + + it )
dbgln ( " {}..{} " , it . key ( ) , * it ) ;
for ( auto it = lhs_negated_ranges . begin ( ) ; it ! = lhs_negated_ranges . end ( ) ; + + it )
dbgln ( " ^[{}..{}] " , it . key ( ) , * it ) ;
for ( auto & char_class : lhs_char_classes )
dbgln ( " {} " , character_class_name ( char_class ) ) ;
for ( auto & char_class : lhs_negated_char_classes )
dbgln ( " ^{} " , character_class_name ( char_class ) ) ;
2025-11-01 16:47:57 +01:00
dbgln ( " }}, in or: {}, matched in or: {}, inverse matched in or: {} " , in_or ( ) , matched_in_or ( ) , inverse_matched_in_or ( ) ) ;
2024-10-24 21:59:15 +02:00
}
2022-07-04 21:11:15 +04:30
switch ( pair . type ) {
case CharacterCompareType : : Inverse :
inverse = ! inverse ;
break ;
case CharacterCompareType : : TemporaryInverse :
2022-07-10 11:40:17 +04:30
temporary_inverse = true ;
2024-11-17 12:48:47 +01:00
reset_temporary_inverse = false ;
2022-07-04 21:11:15 +04:30
break ;
case CharacterCompareType : : AnyChar :
// Special case: if not inverted, AnyChar is always in the range.
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ! current_lhs_inversion_state ( ) )
2022-07-04 21:11:15 +04:30
return true ;
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) = true ;
inverse_matched_in_or ( ) = false ;
2024-10-24 21:59:15 +02:00
}
2022-07-04 21:11:15 +04:30
break ;
2024-10-24 21:59:15 +02:00
case CharacterCompareType : : Char : {
auto matched = range_contains ( pair . value ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ( current_lhs_inversion_state ( ) ^ matched ) )
2022-07-04 21:11:15 +04:30
return true ;
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) | = matched ;
inverse_matched_in_or ( ) | = ! matched ;
2024-10-24 21:59:15 +02:00
}
2022-07-04 21:11:15 +04:30
break ;
2024-10-24 21:59:15 +02:00
}
2022-07-04 21:11:15 +04:30
case CharacterCompareType : : String :
// FIXME: We just need to look at the last character of this string, but we only have the first character here.
// Just bail out to avoid false positives.
return true ;
2025-11-09 13:35:16 +01:00
case CharacterCompareType : : StringSet :
return true ;
2024-10-24 21:59:15 +02:00
case CharacterCompareType : : CharClass : {
auto contains = char_class_contains ( static_cast < CharClass > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ( current_lhs_inversion_state ( ) ^ contains ) )
2022-07-04 21:11:15 +04:30
return true ;
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) | = contains ;
inverse_matched_in_or ( ) | = ! contains ;
2024-10-24 21:59:15 +02:00
}
2022-07-04 21:11:15 +04:30
break ;
2024-10-24 21:59:15 +02:00
}
2022-07-04 21:11:15 +04:30
case CharacterCompareType : : CharRange : {
2022-07-05 04:56:21 +04:30
auto range = CharRange ( pair . value ) ;
2024-10-24 21:59:15 +02:00
auto contains = range_contains ( range ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ( contains ^ current_lhs_inversion_state ( ) ) )
2022-07-04 21:11:15 +04:30
return true ;
2024-10-24 21:59:15 +02:00
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) | = contains ;
inverse_matched_in_or ( ) | = ! contains ;
2024-10-24 21:59:15 +02:00
}
2022-07-04 21:11:15 +04:30
break ;
}
case CharacterCompareType : : LookupTable :
// We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
return true ;
case CharacterCompareType : : Reference :
2025-07-23 20:48:34 +02:00
case CharacterCompareType : : NamedReference :
2022-07-04 21:11:15 +04:30
// We've handled this before coming here.
break ;
case CharacterCompareType : : Property :
2023-07-14 08:13:59 +03:30
// The only reasonable scenario where we can check these properties without spending too much time is if:
// - the ranges are empty
// - the char classes are empty
// - the unicode properties are empty or contain only this property
if ( ! lhs_ranges . is_empty ( ) | | ! lhs_negated_ranges . is_empty ( ) | | ! lhs_char_classes . is_empty ( ) | | ! lhs_negated_char_classes . is_empty ( ) )
return true ;
if ( has_any_unicode_property & & ! lhs_unicode_properties . is_empty ( ) & & ! lhs_negated_unicode_properties . is_empty ( ) ) {
2024-10-24 21:59:15 +02:00
auto contains = lhs_unicode_properties . contains ( static_cast < Unicode : : Property > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ( current_lhs_inversion_state ( ) ^ contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2024-10-24 21:59:15 +02:00
auto inverse_contains = lhs_negated_unicode_properties . contains ( static_cast < Unicode : : Property > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ! ( current_lhs_inversion_state ( ) ^ inverse_contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2024-10-24 21:59:15 +02:00
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) | = contains ;
inverse_matched_in_or ( ) | = inverse_contains ;
2024-10-24 21:59:15 +02:00
}
2023-07-14 08:13:59 +03:30
}
break ;
2022-07-04 21:11:15 +04:30
case CharacterCompareType : : GeneralCategory :
2023-07-14 08:13:59 +03:30
if ( ! lhs_ranges . is_empty ( ) | | ! lhs_negated_ranges . is_empty ( ) | | ! lhs_char_classes . is_empty ( ) | | ! lhs_negated_char_classes . is_empty ( ) )
return true ;
if ( has_any_unicode_property & & ! lhs_unicode_general_categories . is_empty ( ) & & ! lhs_negated_unicode_general_categories . is_empty ( ) ) {
2024-10-24 21:59:15 +02:00
auto contains = lhs_unicode_general_categories . contains ( static_cast < Unicode : : GeneralCategory > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ( current_lhs_inversion_state ( ) ^ contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2024-10-24 21:59:15 +02:00
auto inverse_contains = lhs_negated_unicode_general_categories . contains ( static_cast < Unicode : : GeneralCategory > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ! ( current_lhs_inversion_state ( ) ^ inverse_contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) | = contains ;
inverse_matched_in_or ( ) | = inverse_contains ;
2024-10-24 21:59:15 +02:00
}
2023-07-14 08:13:59 +03:30
}
break ;
2022-07-04 21:11:15 +04:30
case CharacterCompareType : : Script :
2023-07-14 08:13:59 +03:30
if ( ! lhs_ranges . is_empty ( ) | | ! lhs_negated_ranges . is_empty ( ) | | ! lhs_char_classes . is_empty ( ) | | ! lhs_negated_char_classes . is_empty ( ) )
return true ;
if ( has_any_unicode_property & & ! lhs_unicode_scripts . is_empty ( ) & & ! lhs_negated_unicode_scripts . is_empty ( ) ) {
2024-10-24 21:59:15 +02:00
auto contains = lhs_unicode_scripts . contains ( static_cast < Unicode : : Script > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ( current_lhs_inversion_state ( ) ^ contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2024-10-24 21:59:15 +02:00
auto inverse_contains = lhs_negated_unicode_scripts . contains ( static_cast < Unicode : : Script > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ! ( current_lhs_inversion_state ( ) ^ inverse_contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) | = contains ;
inverse_matched_in_or ( ) | = inverse_contains ;
2024-10-24 21:59:15 +02:00
}
2023-07-14 08:13:59 +03:30
}
break ;
2022-07-04 21:11:15 +04:30
case CharacterCompareType : : ScriptExtension :
2023-07-14 08:13:59 +03:30
if ( ! lhs_ranges . is_empty ( ) | | ! lhs_negated_ranges . is_empty ( ) | | ! lhs_char_classes . is_empty ( ) | | ! lhs_negated_char_classes . is_empty ( ) )
return true ;
if ( has_any_unicode_property & & ! lhs_unicode_script_extensions . is_empty ( ) & & ! lhs_negated_unicode_script_extensions . is_empty ( ) ) {
2024-10-24 21:59:15 +02:00
auto contains = lhs_unicode_script_extensions . contains ( static_cast < Unicode : : Script > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ( current_lhs_inversion_state ( ) ^ contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2024-10-24 21:59:15 +02:00
auto inverse_contains = lhs_negated_unicode_script_extensions . contains ( static_cast < Unicode : : Script > ( pair . value ) ) ;
2025-11-01 16:47:57 +01:00
if ( ! in_or ( ) & & ! ( current_lhs_inversion_state ( ) ^ inverse_contains ) )
2023-07-14 08:13:59 +03:30
return true ;
2025-11-01 16:47:57 +01:00
if ( in_or ( ) ) {
matched_in_or ( ) | = contains ;
inverse_matched_in_or ( ) | = inverse_contains ;
2024-10-24 21:59:15 +02:00
}
2023-07-14 08:13:59 +03:30
}
break ;
2022-07-20 23:22:07 +04:30
case CharacterCompareType : : Or :
2025-11-01 16:47:57 +01:00
disjunction_stack . empend ( true ) ;
2024-10-24 21:59:15 +02:00
break ;
2025-11-01 16:47:57 +01:00
case CharacterCompareType : : EndAndOr : {
2024-10-24 21:59:15 +02:00
// FIXME: Handle And when we support it below.
2025-11-01 16:47:57 +01:00
VERIFY ( in_or ( ) ) ;
auto state = disjunction_stack . take_last ( ) ;
2024-10-24 21:59:15 +02:00
if ( current_lhs_inversion_state ( ) ) {
2025-11-01 16:47:57 +01:00
if ( ! state . inverse_matched_in_or )
2024-10-24 21:59:15 +02:00
return true ;
} else {
2025-11-01 16:47:57 +01:00
if ( state . matched_in_or )
2024-10-24 21:59:15 +02:00
return true ;
}
break ;
2025-11-01 16:47:57 +01:00
}
2024-10-24 21:59:15 +02:00
case CharacterCompareType : : And :
2025-11-09 13:35:16 +01:00
case CharacterCompareType : : Subtract :
2022-07-04 21:11:15 +04:30
// FIXME: These are too difficult to handle, so bail out.
return true ;
case CharacterCompareType : : Undefined :
case CharacterCompareType : : RangeExpressionDummy :
// These do not occur in valid bytecode.
VERIFY_NOT_REACHED ( ) ;
}
}
2025-02-01 07:59:29 +01:00
// We got to the end, just double-check that the inverse flag was not left on (which would match everything).
return current_lhs_inversion_state ( ) ;
2022-07-04 21:11:15 +04:30
}
2025-05-17 21:50:37 +02:00
// Returns true if the two statically-interpreted compare sets could match at least
// one common input; conservatively answers true whenever precise analysis is not possible.
static bool has_overlap(StaticallyInterpretedCompares const& lhs, StaticallyInterpretedCompares const& rhs)
{
    // Unicode properties and negated sets are too complex to intersect precisely,
    // so assume an overlap if either side carries any of them.
    auto is_too_complex_to_analyse = [](StaticallyInterpretedCompares const& compares) {
        return compares.has_any_unicode_property
            || !compares.negated_ranges.is_empty()
            || !compares.negated_char_classes.is_empty();
    };
    if (is_too_complex_to_analyse(lhs) || is_too_complex_to_analyse(rhs))
        return true;

    // Two closed ranges intersect iff each one begins no later than the other ends.
    for (auto left = lhs.ranges.begin(); left != lhs.ranges.end(); ++left) {
        for (auto right = rhs.ranges.begin(); right != rhs.ranges.end(); ++right) {
            if (left.key() <= *right && right.key() <= *left)
                return true;
        }
    }

    // Any character class present on both sides constitutes an overlap.
    for (auto const& char_class : lhs.char_classes) {
        if (rhs.char_classes.contains(char_class))
            return true;
    }

    return false;
}
2021-11-18 08:00:29 +03:30
enum class AtomicRewritePreconditionResult {
SatisfiedWithProperHeader ,
SatisfiedWithEmptyHeader ,
NotSatisfied ,
} ;
2025-03-09 02:37:02 +01:00
// Decides whether it is safe to rewrite `repeated_block` (a loop body) as an atomic group,
// i.e. whether giving up backtracking into the loop can never change the match outcome.
// This checks the "first(follow) must not intersect end(body)" precondition: every Compare
// in the repeated block is collected, then the first Compare of `following_block` must not
// be able to match any of those collected values.
// Returns SatisfiedWithProperHeader / SatisfiedWithEmptyHeader when safe (see the enum),
// and NotSatisfied when the rewrite must not be done.
static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block repeated_block, Block following_block, auto const& all_blocks)
{
    Vector<Vector<CompareTypeAndValuePair>> repeated_values;
    auto state = MatchState::only_for_enumeration();
    auto has_seen_actionable_opcode = false;

    // Pass 1: walk the repeated block and collect the flattened compare lists it can match.
    for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            has_seen_actionable_opcode = true;
            auto compares = to<OpCode_Compare>(opcode).flat_compares();
            // A leading AnyChar matches everything, so the follow-set check below could never pass.
            if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
                return AtomicRewritePreconditionResult::NotSatisfied;
            repeated_values.append(move(compares));
            break;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            has_seen_actionable_opcode = true;
            // An anchor before any compare means the body cannot consume input that
            // the follow block might also want.
            if (repeated_values.is_empty())
                return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
            break;
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? for now, let's fail.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::Restore:
        case OpCodeId::GoBack:
            // These rewind match state in ways this analysis does not model; bail out.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::ForkJump:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::ForkIf:
        case OpCodeId::JumpNonEmpty:
            // We could attempt to recursively resolve the follow set, but pretending that this just goes nowhere is faster.
            if (!has_seen_actionable_opcode)
                return AtomicRewritePreconditionResult::NotSatisfied;
            break;
        case OpCodeId::Jump: {
            // Just follow the jump, it's unconditional.
            auto& jump = to<OpCode_Jump>(opcode);
            auto jump_target = state.instruction_position + jump.offset() + jump.size();
            // Find the block that this jump leads to.
            auto next_block_it = find_if(all_blocks.begin(), all_blocks.end(), [jump_target](auto& block) { return block.start == jump_target; });
            if (next_block_it == all_blocks.end())
                return AtomicRewritePreconditionResult::NotSatisfied;
            repeated_block = *next_block_it;
            state.instruction_position = repeated_block.start;
            continue;
        }
        default:
            break;
        }
        state.instruction_position += opcode.size();
    }
    dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());

    // Pass 2: if the follow block is empty, chase unconditional jumps until we land on
    // a real block (or decide the shape is unsafe to rewrite).
    auto accept_empty_follow = false;
    while (following_block.start == following_block.end && !accept_empty_follow) {
        dbgln_if(REGEX_DEBUG, "Following empty block {}", following_block.start);
        // If the following block has a single instruction, it must be some kind of jump.
        // Unless it's an unconditional jump, we can't rewrite it - so bail out.
        state.instruction_position = following_block.start;
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Jump: {
            // Just follow the jump, it's unconditional.
            auto& jump = to<OpCode_Jump>(opcode);
            auto jump_target = state.instruction_position + jump.offset() + jump.size();
            if (jump_target < state.instruction_position) {
                dbgln_if(REGEX_DEBUG, "Jump to {} is backwards, I'm scared of loops", jump_target);
                return AtomicRewritePreconditionResult::NotSatisfied;
            }
            dbgln_if(REGEX_DEBUG, "Following jump to {}", jump_target);
            // Find the block that this jump leads to.
            auto next_block_it = find_if(all_blocks.begin(), all_blocks.end(), [jump_target](auto& block) { return block.start == jump_target; });
            if (next_block_it == all_blocks.end())
                return AtomicRewritePreconditionResult::NotSatisfied;
            following_block = *next_block_it;
            state.instruction_position = repeated_block.start;
            continue;
        }
        case OpCodeId::ForkJump:
        case OpCodeId::ForkIf:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::JumpNonEmpty:
            return AtomicRewritePreconditionResult::NotSatisfied;
        default:
            // No interesting effect here.
            dbgln_if(REGEX_DEBUG, "Empty follow had instruction {}", opcode.to_byte_string());
            accept_empty_follow = true;
            break;
        }
    }

    bool following_block_has_at_least_one_compare = false;
    // Pass 3: find the first compare in the following block, it must NOT match any of the values in `repeated_values'.
    auto final_instruction = following_block.start;
    for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
        final_instruction = state.instruction_position;
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            following_block_has_at_least_one_compare = true;
            // We found a compare, let's see what it has.
            auto compares = to<OpCode_Compare>(opcode).flat_compares();
            if (compares.is_empty())
                break;

            // AnyChar and backreferences can match arbitrary input, so the overlap
            // check below would not be trustworthy.
            if (any_of(compares, [&](auto& compare) {
                    return compare.type == CharacterCompareType::AnyChar || compare.type == CharacterCompareType::Reference || compare.type == CharacterCompareType::NamedReference;
                }))
                return AtomicRewritePreconditionResult::NotSatisfied;

            if (any_of(repeated_values, [&](auto& repeated_value) { return has_overlap(compares, repeated_value); }))
                return AtomicRewritePreconditionResult::NotSatisfied;

            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader; // Nothing can match the end!
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? For now, consider them a failure.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::ForkJump:
        case OpCodeId::ForkIf:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::JumpNonEmpty:
            // See note in the previous switch, same cases.
            if (!following_block_has_at_least_one_compare)
                return AtomicRewritePreconditionResult::NotSatisfied;
            break;
        default:
            break;
        }
        state.instruction_position += opcode.size();
    }

    // If the following block falls through, we can't rewrite it.
    state.instruction_position = final_instruction;
    switch (bytecode.get_opcode(state).opcode_id()) {
    case OpCodeId::Jump:
    case OpCodeId::JumpNonEmpty:
    case OpCodeId::ForkJump:
    case OpCodeId::ForkReplaceJump:
    case OpCodeId::ForkIf:
        break;
    default:
        return AtomicRewritePreconditionResult::NotSatisfied;
    }

    if (following_block_has_at_least_one_compare)
        return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
    return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
}
2023-07-28 20:59:33 +03:30
template<typename Parser>
bool Regex<Parser>::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks)
{
    // A regex that compiles to a single straight-line block of plain character
    // compares can be matched by a simple substring search instead of the VM.
    if (basic_blocks.size() > 1)
        return false; // Multiple blocks imply control flow; not a plain string.

    if (basic_blocks.is_empty()) {
        // An empty regex trivially "searches" for the empty string.
        parser_result.optimization_data.pure_substring_search.emplace();
        return true;
    }

    auto& bytecode = parser_result.bytecode.get<ByteCode>();

    // Walk the single block; every instruction must be a Compare made up
    // exclusively of single-character matches.
    Vector<u16> code_units;
    auto state = MatchState::only_for_enumeration();
    while (state.instruction_position < bytecode.size()) {
        auto& opcode = bytecode.get_opcode(state);
        if (opcode.opcode_id() != OpCodeId::Compare)
            return false;

        auto& compare = to<OpCode_Compare>(opcode);
        if (compare.arguments_count() == 0)
            return false; // This matches 'nothing', so we can't do a substring search.

        for (auto& flat_compare : compare.flat_compares()) {
            if (flat_compare.type != CharacterCompareType::Char)
                return false;
            // Append the character to the search needle as UTF-16 code unit(s).
            (void)AK::UnicodeUtils::code_point_to_utf16(flat_compare.value, [&](auto code_unit) { code_units.append(code_unit); });
        }

        state.instruction_position += opcode.size();
    }

    parser_result.optimization_data.pure_substring_search.emplace(move(code_units));
    return true;
}
2025-04-23 21:13:30 +02:00
template<class Parser>
void Regex<Parser>::rewrite_with_useless_jumps_removed()
{
    auto& bytecode = parser_result.bytecode.get<ByteCode>();

    if constexpr (REGEX_DEBUG) {
        RegexDebug<ByteCode> dbg;
        dbg.print_bytecode(*this);
    }

    BytecodeRewriter rewriter(bytecode, pattern_value);

    // A jump of any flavour whose offset is zero lands on the very next
    // instruction, so it has no effect and can be dropped from the rebuilt
    // bytecode.
    auto state = MatchState::only_for_enumeration();
    for (auto& instruction : rewriter.instructions) {
        state.instruction_position = instruction.old_ip;
        auto& opcode = bytecode.get_opcode(state);

        auto has_zero_offset = [&]() -> bool {
            switch (opcode.opcode_id()) {
            case OpCodeId::Jump:
                return to<OpCode_Jump>(opcode).offset() == 0;
            case OpCodeId::JumpNonEmpty:
                return to<OpCode_JumpNonEmpty>(opcode).offset() == 0;
            case OpCodeId::ForkJump:
            case OpCodeId::ForkReplaceJump:
                return to<OpCode_ForkJump>(opcode).offset() == 0;
            case OpCodeId::ForkStay:
            case OpCodeId::ForkReplaceStay:
                return to<OpCode_ForkStay>(opcode).offset() == 0;
            case OpCodeId::ForkIf:
                return to<OpCode_ForkIf>(opcode).offset() == 0;
            default:
                return false;
            }
        };

        instruction.skip = has_zero_offset();
    }

    parser_result.bytecode = rewriter.rebuild(bytecode);
}
2021-09-12 17:30:27 +04:30
template < typename Parser >
void Regex < Parser > : : attempt_rewrite_loops_as_atomic_groups ( BasicBlockList const & basic_blocks )
{
2025-12-16 00:27:18 +01:00
auto & bytecode = parser_result . bytecode . get < ByteCode > ( ) ;
2021-09-12 17:30:27 +04:30
if constexpr ( REGEX_DEBUG ) {
2025-12-16 00:27:18 +01:00
RegexDebug < ByteCode > dbg ;
2021-09-12 17:30:27 +04:30
dbg . print_bytecode ( * this ) ;
2021-12-21 17:38:51 +01:00
for ( auto const & block : basic_blocks )
2024-12-11 10:55:38 +01:00
dbgln ( " block from {} to {} (comment: {}) " , block . start , block . end , block . comment ) ;
2021-09-12 17:30:27 +04:30
}
// A pattern such as:
// bb0 | RE0
// | ForkX bb0
// -------------------------
// bb1 | RE1
// can be rewritten as:
// -------------------------
// bb0 | RE0
// | ForkReplaceX bb0
// -------------------------
// bb1 | RE1
// provided that first(RE1) not-in end(RE0), which is to say
// that RE1 cannot start with whatever RE0 has matched (ever).
//
// Alternatively, a second form of this pattern can also occur:
// bb0 | *
// | ForkX bb2
// ------------------------
// bb1 | RE0
// | Jump bb0
// ------------------------
// bb2 | RE1
// which can be transformed (with the same preconditions) to:
// bb0 | *
// | ForkReplaceX bb2
// ------------------------
// bb1 | RE0
// | Jump bb0
// ------------------------
// bb2 | RE1
enum class AlternateForm {
2021-11-18 08:00:29 +03:30
DirectLoopWithoutHeader , // loop without proper header, a block forking to itself. i.e. the first form.
DirectLoopWithoutHeaderAndEmptyFollow , // loop without proper header, a block forking to itself. i.e. the first form but with RE1 being empty.
DirectLoopWithHeader , // loop with proper header, i.e. the second form.
2021-09-12 17:30:27 +04:30
} ;
struct CandidateBlock {
Block forking_block ;
Optional < Block > new_target_block ;
AlternateForm form ;
} ;
Vector < CandidateBlock > candidate_blocks ;
2025-06-24 16:41:28 +02:00
auto state = MatchState : : only_for_enumeration ( ) ;
2021-09-12 17:30:27 +04:30
2025-12-16 00:27:18 +01:00
auto is_an_eligible_jump = [ & state ] ( auto & opcode , size_t ip , size_t block_start , AlternateForm alternate_form ) {
2025-06-24 16:41:28 +02:00
opcode . set_state ( state ) ;
2021-09-12 17:30:27 +04:30
switch ( opcode . opcode_id ( ) ) {
case OpCodeId : : JumpNonEmpty : {
2025-12-16 00:27:18 +01:00
auto const & op = to < OpCode_JumpNonEmpty > ( opcode ) ;
2021-09-12 17:30:27 +04:30
auto form = op . form ( ) ;
if ( form ! = OpCodeId : : Jump & & alternate_form = = AlternateForm : : DirectLoopWithHeader )
return false ;
if ( form ! = OpCodeId : : ForkJump & & form ! = OpCodeId : : ForkStay & & alternate_form = = AlternateForm : : DirectLoopWithoutHeader )
return false ;
return op . offset ( ) + ip + opcode . size ( ) = = block_start ;
}
case OpCodeId : : ForkJump :
if ( alternate_form = = AlternateForm : : DirectLoopWithHeader )
return false ;
2025-12-16 00:27:18 +01:00
return to < OpCode_ForkJump > ( opcode ) . offset ( ) + ip + opcode . size ( ) = = block_start ;
2021-09-12 17:30:27 +04:30
case OpCodeId : : ForkStay :
if ( alternate_form = = AlternateForm : : DirectLoopWithHeader )
return false ;
2025-12-16 00:27:18 +01:00
return to < OpCode_ForkStay > ( opcode ) . offset ( ) + ip + opcode . size ( ) = = block_start ;
2021-09-12 17:30:27 +04:30
case OpCodeId : : Jump :
// Infinite loop does *not* produce forks.
if ( alternate_form = = AlternateForm : : DirectLoopWithoutHeader )
return false ;
if ( alternate_form = = AlternateForm : : DirectLoopWithHeader )
2025-12-16 00:27:18 +01:00
return to < OpCode_Jump > ( opcode ) . offset ( ) + ip + opcode . size ( ) = = block_start ;
2021-09-12 17:30:27 +04:30
VERIFY_NOT_REACHED ( ) ;
default :
return false ;
}
} ;
for ( size_t i = 0 ; i < basic_blocks . size ( ) ; + + i ) {
auto forking_block = basic_blocks [ i ] ;
Optional < Block > fork_fallback_block ;
if ( i + 1 < basic_blocks . size ( ) )
fork_fallback_block = basic_blocks [ i + 1 ] ;
// Check if the last instruction in this block is a jump to the block itself:
{
state . instruction_position = forking_block . end ;
auto & opcode = bytecode . get_opcode ( state ) ;
if ( is_an_eligible_jump ( opcode , state . instruction_position , forking_block . start , AlternateForm : : DirectLoopWithoutHeader ) ) {
// We've found RE0 (and RE1 is just the following block, if any), let's see if the precondition applies.
// if RE1 is empty, there's no first(RE1), so this is an automatic pass.
2023-02-15 10:14:13 +03:30
if ( ! fork_fallback_block . has_value ( )
2025-03-09 02:37:02 +01:00
| | ( fork_fallback_block - > end = = fork_fallback_block - > start & & block_satisfies_atomic_rewrite_precondition ( bytecode , forking_block , * fork_fallback_block , basic_blocks ) ! = AtomicRewritePreconditionResult : : NotSatisfied ) ) {
2021-09-12 17:30:27 +04:30
candidate_blocks . append ( { forking_block , fork_fallback_block , AlternateForm : : DirectLoopWithoutHeader } ) ;
break ;
}
2025-03-09 02:37:02 +01:00
auto precondition = block_satisfies_atomic_rewrite_precondition ( bytecode , forking_block , * fork_fallback_block , basic_blocks ) ;
2021-11-18 08:00:29 +03:30
if ( precondition = = AtomicRewritePreconditionResult : : SatisfiedWithProperHeader ) {
2021-09-12 17:30:27 +04:30
candidate_blocks . append ( { forking_block , fork_fallback_block , AlternateForm : : DirectLoopWithoutHeader } ) ;
break ;
}
2021-11-18 08:00:29 +03:30
if ( precondition = = AtomicRewritePreconditionResult : : SatisfiedWithEmptyHeader ) {
candidate_blocks . append ( { forking_block , fork_fallback_block , AlternateForm : : DirectLoopWithoutHeaderAndEmptyFollow } ) ;
break ;
}
2021-09-12 17:30:27 +04:30
}
}
// Check if the last instruction in the last block is a direct jump to this block
if ( fork_fallback_block . has_value ( ) ) {
state . instruction_position = fork_fallback_block - > end ;
auto & opcode = bytecode . get_opcode ( state ) ;
if ( is_an_eligible_jump ( opcode , state . instruction_position , forking_block . start , AlternateForm : : DirectLoopWithHeader ) ) {
// We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
state . instruction_position = forking_block . end ;
auto & opcode = bytecode . get_opcode ( state ) ;
if ( opcode . opcode_id ( ) = = OpCodeId : : ForkJump | | opcode . opcode_id ( ) = = OpCodeId : : ForkStay ) {
Optional < Block > block_following_fork_fallback ;
if ( i + 2 < basic_blocks . size ( ) )
block_following_fork_fallback = basic_blocks [ i + 2 ] ;
2021-11-18 08:00:29 +03:30
if ( ! block_following_fork_fallback . has_value ( )
2025-03-09 02:37:02 +01:00
| | block_satisfies_atomic_rewrite_precondition ( bytecode , * fork_fallback_block , * block_following_fork_fallback , basic_blocks ) ! = AtomicRewritePreconditionResult : : NotSatisfied ) {
2021-09-12 17:30:27 +04:30
candidate_blocks . append ( { forking_block , { } , AlternateForm : : DirectLoopWithHeader } ) ;
break ;
}
}
}
2024-10-24 21:59:15 +02:00
// We've found a slightly degenerate case, where the next block jumps back to the _jump_ instruction in the forking block.
// This is a direct loop without a proper header that is posing as a loop with a header.
if ( is_an_eligible_jump ( opcode , state . instruction_position , forking_block . end , AlternateForm : : DirectLoopWithHeader ) ) {
// We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
state . instruction_position = forking_block . end ;
auto & opcode = bytecode . get_opcode ( state ) ;
if ( opcode . opcode_id ( ) = = OpCodeId : : ForkJump | | opcode . opcode_id ( ) = = OpCodeId : : ForkStay ) {
Optional < Block > block_following_fork_fallback ;
if ( i + 2 < basic_blocks . size ( ) )
block_following_fork_fallback = basic_blocks [ i + 2 ] ;
if ( ! block_following_fork_fallback . has_value ( )
2025-03-09 02:37:02 +01:00
| | block_satisfies_atomic_rewrite_precondition ( bytecode , * fork_fallback_block , * block_following_fork_fallback , basic_blocks ) ! = AtomicRewritePreconditionResult : : NotSatisfied ) {
2024-10-24 21:59:15 +02:00
candidate_blocks . append ( { forking_block , { } , AlternateForm : : DirectLoopWithoutHeader } ) ;
break ;
}
}
}
2021-09-12 17:30:27 +04:30
}
}
dbgln_if ( REGEX_DEBUG , " Found {} candidate blocks " , candidate_blocks . size ( ) ) ;
2025-03-09 02:37:02 +01:00
if constexpr ( REGEX_DEBUG ) {
for ( auto const & candidate : candidate_blocks ) {
dbgln ( " Candidate block from {} to {} (comment: {}) " , candidate . forking_block . start , candidate . forking_block . end , candidate . forking_block . comment ) ;
if ( candidate . new_target_block . has_value ( ) )
dbgln ( " with target block from {} to {} (comment: {}) " , candidate . new_target_block - > start , candidate . new_target_block - > end , candidate . new_target_block - > comment ) ;
switch ( candidate . form ) {
case AlternateForm : : DirectLoopWithoutHeader :
dbgln ( " form: DirectLoopWithoutHeader " ) ;
break ;
case AlternateForm : : DirectLoopWithoutHeaderAndEmptyFollow :
dbgln ( " form: DirectLoopWithoutHeaderAndEmptyFollow " ) ;
break ;
case AlternateForm : : DirectLoopWithHeader :
dbgln ( " form: DirectLoopWithHeader " ) ;
break ;
default :
dbgln ( " form: Unknown " ) ;
break ;
}
}
}
2021-09-12 17:30:27 +04:30
if ( candidate_blocks . is_empty ( ) ) {
dbgln_if ( REGEX_DEBUG , " Failed to find anything for {} " , pattern_value ) ;
return ;
}
RedBlackTree < size_t , size_t > needed_patches ;
// Reverse the blocks, so we can patch the bytecode without messing with the latter patches.
quick_sort ( candidate_blocks , [ ] ( auto & a , auto & b ) { return b . forking_block . start > a . forking_block . start ; } ) ;
for ( auto & candidate : candidate_blocks ) {
// Note that both forms share a ForkReplace patch in forking_block.
// Patch the ForkX in forking_block to be a ForkReplaceX instead.
auto & opcode_id = bytecode [ candidate . forking_block . end ] ;
if ( opcode_id = = ( ByteCodeValueType ) OpCodeId : : ForkStay ) {
opcode_id = ( ByteCodeValueType ) OpCodeId : : ForkReplaceStay ;
} else if ( opcode_id = = ( ByteCodeValueType ) OpCodeId : : ForkJump ) {
opcode_id = ( ByteCodeValueType ) OpCodeId : : ForkReplaceJump ;
} else if ( opcode_id = = ( ByteCodeValueType ) OpCodeId : : JumpNonEmpty ) {
auto & jump_opcode_id = bytecode [ candidate . forking_block . end + 3 ] ;
if ( jump_opcode_id = = ( ByteCodeValueType ) OpCodeId : : ForkStay )
jump_opcode_id = ( ByteCodeValueType ) OpCodeId : : ForkReplaceStay ;
else if ( jump_opcode_id = = ( ByteCodeValueType ) OpCodeId : : ForkJump )
jump_opcode_id = ( ByteCodeValueType ) OpCodeId : : ForkReplaceJump ;
else
VERIFY_NOT_REACHED ( ) ;
} else {
VERIFY_NOT_REACHED ( ) ;
}
}
if ( ! needed_patches . is_empty ( ) ) {
2025-04-15 15:31:08 +02:00
auto state = MatchState : : only_for_enumeration ( ) ;
2021-12-21 18:08:15 +03:30
auto bytecode_size = bytecode . size ( ) ;
2021-09-12 17:30:27 +04:30
state . instruction_position = 0 ;
struct Patch {
ssize_t value ;
size_t offset ;
bool should_negate { false } ;
} ;
for ( ; ; ) {
2021-12-21 18:08:15 +03:30
if ( state . instruction_position > = bytecode_size )
2021-09-12 17:30:27 +04:30
break ;
auto & opcode = bytecode . get_opcode ( state ) ;
Stack < Patch , 2 > patch_points ;
switch ( opcode . opcode_id ( ) ) {
case OpCodeId : : Jump :
2025-12-16 00:27:18 +01:00
patch_points . push ( { to < OpCode_Jump > ( opcode ) . offset ( ) , state . instruction_position + 1 } ) ;
2021-09-12 17:30:27 +04:30
break ;
case OpCodeId : : JumpNonEmpty :
2025-12-16 00:27:18 +01:00
patch_points . push ( { to < OpCode_JumpNonEmpty > ( opcode ) . offset ( ) , state . instruction_position + 1 } ) ;
patch_points . push ( { to < OpCode_JumpNonEmpty > ( opcode ) . checkpoint ( ) , state . instruction_position + 2 } ) ;
2021-09-12 17:30:27 +04:30
break ;
case OpCodeId : : ForkJump :
2025-12-16 00:27:18 +01:00
patch_points . push ( { to < OpCode_ForkJump > ( opcode ) . offset ( ) , state . instruction_position + 1 } ) ;
2021-09-12 17:30:27 +04:30
break ;
case OpCodeId : : ForkStay :
2025-12-16 00:27:18 +01:00
patch_points . push ( { to < OpCode_ForkStay > ( opcode ) . offset ( ) , state . instruction_position + 1 } ) ;
2021-09-12 17:30:27 +04:30
break ;
2025-12-31 07:50:52 +01:00
case OpCodeId : : ForkIf :
patch_points . push ( { to < OpCode_ForkIf > ( opcode ) . offset ( ) , state . instruction_position + 1 } ) ;
break ;
2021-09-12 17:30:27 +04:30
case OpCodeId : : Repeat :
2025-12-16 00:27:18 +01:00
patch_points . push ( { - ( ssize_t ) to < OpCode_Repeat > ( opcode ) . offset ( ) , state . instruction_position + 1 , true } ) ;
2021-09-12 17:30:27 +04:30
break ;
default :
break ;
}
while ( ! patch_points . is_empty ( ) ) {
auto & patch_point = patch_points . top ( ) ;
auto target_offset = patch_point . value + state . instruction_position + opcode . size ( ) ;
constexpr auto do_patch = [ ] ( auto & patch_it , auto & patch_point , auto & target_offset , auto & bytecode , auto ip ) {
if ( patch_it . key ( ) = = ip )
return ;
2022-01-21 12:33:49 +03:30
if ( patch_point . value < 0 & & target_offset < = patch_it . key ( ) & & ip > patch_it . key ( ) )
2021-09-12 17:30:27 +04:30
bytecode [ patch_point . offset ] + = ( patch_point . should_negate ? 1 : - 1 ) * ( * patch_it ) ;
2022-01-21 12:33:49 +03:30
else if ( patch_point . value > 0 & & target_offset > = patch_it . key ( ) & & ip < patch_it . key ( ) )
2021-09-12 17:30:27 +04:30
bytecode [ patch_point . offset ] + = ( patch_point . should_negate ? - 1 : 1 ) * ( * patch_it ) ;
} ;
if ( auto patch_it = needed_patches . find_largest_not_above_iterator ( target_offset ) ; ! patch_it . is_end ( ) )
do_patch ( patch_it , patch_point , target_offset , bytecode , state . instruction_position ) ;
else if ( auto patch_it = needed_patches . find_largest_not_above_iterator ( state . instruction_position ) ; ! patch_it . is_end ( ) )
do_patch ( patch_it , patch_point , target_offset , bytecode , state . instruction_position ) ;
patch_points . pop ( ) ;
}
state . instruction_position + = opcode . size ( ) ;
}
}
if constexpr ( REGEX_DEBUG ) {
warnln ( " Transformed to: " ) ;
2025-12-16 00:27:18 +01:00
RegexDebug < ByteCode > dbg ;
2021-09-12 17:30:27 +04:30
dbg . print_bytecode ( * this ) ;
}
}
2025-12-16 00:33:28 +01:00
template<typename Parser>
void Regex<Parser>::attempt_rewrite_adjacent_compares_as_string_compare(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode.get<ByteCode>();
    if (basic_blocks.is_empty())
        return;

    // A run of adjacent Compare instructions that each match exactly one Char
    // can be folded into a single string compare.
    struct StringSequence {
        size_t start_ip;
        size_t end_ip;
        Vector<u32> characters;
    };
    Vector<StringSequence> found_runs;

    for (auto const& block : basic_blocks) {
        auto state = MatchState::only_for_enumeration();
        Vector<u32> pending_characters;
        size_t run_start_ip = 0;
        bool run_active = false;

        // Record the current run, if any; runs shorter than two characters
        // are not worth rewriting and are dropped.
        auto flush_run = [&](size_t end_ip) {
            if (run_active && pending_characters.size() >= 2) {
                found_runs.append({ run_start_ip, end_ip, move(pending_characters) });
                pending_characters.clear();
            }
        };

        for (state.instruction_position = block.start; state.instruction_position <= block.end;) {
            auto ip = state.instruction_position;
            auto& opcode = bytecode.get_opcode(state);

            // A candidate is a Compare with exactly one flat Char argument.
            Optional<u32> single_character;
            if (opcode.opcode_id() == OpCodeId::Compare) {
                auto& compare = to<OpCode_Compare>(opcode);
                auto flat_compares = compare.flat_compares();
                if (flat_compares.size() == 1
                    && flat_compares[0].type == CharacterCompareType::Char)
                    single_character = flat_compares[0].value;
            }

            if (single_character.has_value()) {
                if (!run_active) {
                    run_start_ip = ip;
                    pending_characters.clear();
                    run_active = true;
                }
                pending_characters.append(*single_character);
            } else {
                flush_run(ip);
                run_active = false;
            }

            state.instruction_position += opcode.size();
        }

        // The block may end while a run is still open.
        flush_run(state.instruction_position);
    }

    if (found_runs.is_empty())
        return;

    BytecodeRewriter rewriter(bytecode, pattern_value);

    // Build a UTF-16 string-compare replacement for each discovered run.
    Vector<ByteCode> replacements;
    replacements.ensure_capacity(found_runs.size());
    for (auto const& run : found_runs) {
        StringBuilder string_builder(StringBuilder::Mode::UTF16);
        for (auto code_point : run.characters)
            string_builder.append_code_point(code_point);
        ByteCode replacement;
        replacement.insert_bytecode_compare_string(string_builder.to_utf16_string());
        replacements.append(move(replacement));
    }

    parser_result.bytecode = rewriter.rebuild(bytecode, found_runs.span(), replacements.span());
}
2025-12-22 13:05:47 +01:00
// Rewrite /.*C/-style loops into a direct "seek to C" instruction, replacing
// the fork/checkpoint/compare-anychar/jump loop with RSeekTo + a single fork.
template<typename Parser>
void Regex<Parser>::attempt_rewrite_dot_star_sequences_as_seek(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode.get<ByteCode>();
    if (basic_blocks.is_empty()) {
        dbgln_if(REGEX_DEBUG, "No basic blocks, skipping /.*/ rewrite");
        return;
    }

    // If a /.*/ sequence is followed by a compare C (with some non-matching ops {O} in between), we can rewrite:
    // bbN: {O0} (optional non-matching ops before the pattern)
    //      ForkStay bbM
    //      Checkpoint p
    //      Compare AnyChar
    //      FailIfEmpty (optional, noop for .*)
    //      JumpNonEmpty (back to ForkStay) p
    // bbM: {O1} (optional non-matching ops)
    //      Compare C
    // as
    // bbN: {O0}
    // bbR: RSeekTo C
    //      ForkStay bbR
    // bbM: {O1}
    //      Compare C
    //
    // Note: bbM is determined by the ForkStay's target, not necessarily the next sequential block
    // Note: The pattern may span across multiple basic blocks
    struct DotStarCandidate {
        size_t fork_ip;              // IP of the loop's ForkStay
        size_t checkpoint_ip;        // IP of the Checkpoint inside the loop
        size_t compare_ip;           // IP of the Compare AnyChar
        size_t jump_ip;              // IP of the back-edge JumpNonEmpty
        size_t following_block_start; // start of the block the ForkStay targets (bbM)
        u64 checkpoint_id;
        u32 seek_code_point;         // the single code point C to seek to
    };

    // Keyed by fork_ip, so rediscovering the same loop just overwrites its entry;
    // ordered so replacements below are emitted in discovery order.
    OrderedHashMap<size_t, DotStarCandidate> candidates;

    auto state = MatchState::only_for_enumeration();
    for (size_t i = 0; i < basic_blocks.size(); ++i) {
        auto const& block = basic_blocks[i];
        state.instruction_position = block.start;
        if (state.instruction_position > block.end)
            continue;

        // Skip non-matching ops at the start of the block
        // (the `continue` re-enters this while loop, not the outer for).
        while (state.instruction_position <= block.end) {
            auto& op = bytecode.get_opcode(state);
            switch (op.opcode_id()) {
            case OpCodeId::Checkpoint:
            case OpCodeId::Save:
            case OpCodeId::SaveLeftCaptureGroup:
            case OpCodeId::SaveRightCaptureGroup:
            case OpCodeId::SaveRightNamedCaptureGroup:
            case OpCodeId::ClearCaptureGroup:
                state.instruction_position += op.size();
                continue;
            default:
                goto found_potential_fork;
            }
        }
        // We skipped past the end of the block; only proceed if the opcode at the
        // boundary is the ForkStay we're looking for.
        {
            if (state.instruction_position >= bytecode.size())
                continue;
            auto& op_at_boundary = bytecode.get_opcode(state);
            if (op_at_boundary.opcode_id() != OpCodeId::ForkStay)
                continue;
        }
    found_potential_fork:
        // (1) ForkStay bbM
        dbgln_if(REGEX_DEBUG, "Examining block {} from {} to {}", i, block.start, block.end);
        auto& first_op = bytecode.get_opcode(state);
        if (first_op.opcode_id() != OpCodeId::ForkStay) {
            dbgln_if(REGEX_DEBUG, "did not find ForkStay at {}", state.instruction_position);
            continue;
        }
        auto fork_ip = state.instruction_position;
        auto& fork_op = to<OpCode_ForkStay>(first_op);
        // Find the actual following block by the fork target
        auto fork_target = fork_ip + fork_op.size() + fork_op.offset();
        size_t following_block_idx = 0;
        bool found_following_block = false;
        for (size_t j = 0; j < basic_blocks.size(); ++j) {
            if (basic_blocks[j].start == fork_target) {
                // Only accept a non-empty block as the fork target.
                if (basic_blocks[j].start <= basic_blocks[j].end) {
                    following_block_idx = j;
                    found_following_block = true;
                    break;
                }
                continue;
            }
        }
        if (!found_following_block) {
            dbgln_if(REGEX_DEBUG, "did not find non-empty following block for fork target {}", fork_target);
            continue;
        }
        auto const& following_block = basic_blocks[following_block_idx];
        dbgln_if(REGEX_DEBUG, "Fork target {} is in block {} (from {} to {})", fork_target, following_block_idx, following_block.start, following_block.end);
        state.instruction_position += first_op.size();
        // (2) Checkpoint p
        auto& second_op = bytecode.get_opcode(state);
        if (second_op.opcode_id() != OpCodeId::Checkpoint) {
            dbgln_if(REGEX_DEBUG, "did not find Checkpoint at {} (found opcode {})", state.instruction_position, (int)second_op.opcode_id());
            continue;
        }
        auto checkpoint_ip = state.instruction_position;
        auto checkpoint_id = to<OpCode_Checkpoint>(second_op).id();
        state.instruction_position += second_op.size();
        // (3) Compare AnyChar
        auto& third_op = bytecode.get_opcode(state);
        if (third_op.opcode_id() != OpCodeId::Compare) {
            dbgln_if(REGEX_DEBUG, "did not find Compare at {} (found opcode {})", state.instruction_position, (int)third_op.opcode_id());
            continue;
        }
        auto compare_ip = state.instruction_position;
        auto& compare_op = to<OpCode_Compare>(third_op);
        auto flat_compares = compare_op.flat_compares();
        if (flat_compares.size() != 1 || flat_compares[0].type != CharacterCompareType::AnyChar) {
            dbgln_if(REGEX_DEBUG, "Compare at {} is not AnyChar", state.instruction_position);
            continue;
        }
        state.instruction_position += third_op.size();
        // (3.5) Skip FailIfEmpty if present
        {
            auto& maybe_fail_op = bytecode.get_opcode(state);
            if (maybe_fail_op.opcode_id() == OpCodeId::FailIfEmpty)
                state.instruction_position += maybe_fail_op.size();
        }
        // (4) JumpNonEmpty back to ForkStay
        auto& fourth_op = bytecode.get_opcode(state);
        if (fourth_op.opcode_id() != OpCodeId::JumpNonEmpty) {
            dbgln_if(REGEX_DEBUG, "did not find JumpNonEmpty at {} (found opcode {})", state.instruction_position, (int)fourth_op.opcode_id());
            continue;
        }
        auto jump_ip = state.instruction_position;
        auto& jump_op = to<OpCode_JumpNonEmpty>(fourth_op);
        if (jump_ip + jump_op.size() + jump_op.offset() != fork_ip) {
            dbgln_if(REGEX_DEBUG, "JumpNonEmpty at {} does not jump back to ForkStay at {} (instead jumps to {})",
                state.instruction_position, fork_ip, jump_ip + jump_op.size() + jump_op.offset());
            continue;
        }
        // The loop's back-edge must be guarded by the same checkpoint we saw in (2).
        if ((size_t)jump_op.checkpoint() != checkpoint_id) {
            dbgln_if(REGEX_DEBUG, "JumpNonEmpty at {} does not reference Checkpoint id {} (instead references {})",
                state.instruction_position, checkpoint_id, jump_op.checkpoint());
            continue;
        }
        dbgln_if(REGEX_DEBUG, "Found .* pattern from IP {} to {}", fork_ip, jump_ip + jump_op.size());
        // The following block must contain a Compare C, with only non-matching ops in between
        state.instruction_position = following_block.start;
        while (state.instruction_position <= following_block.end) {
            auto& op = bytecode.get_opcode(state);
            switch (op.opcode_id()) {
            case OpCodeId::Checkpoint:
            case OpCodeId::Save:
            case OpCodeId::SaveLeftCaptureGroup:
            case OpCodeId::SaveRightCaptureGroup:
            case OpCodeId::SaveRightNamedCaptureGroup:
            case OpCodeId::ClearCaptureGroup:
                state.instruction_position += op.size();
                continue;
            case OpCodeId::Compare: {
                auto& following_compare_op = to<OpCode_Compare>(op);
                auto following_compares = following_compare_op.flat_compares();
                StaticallyInterpretedCompares compares;
                if (!interpret_compares(following_compares, compares, &bytecode, true)) {
                    dbgln_if(REGEX_DEBUG, "could not statically interpret compares at {} in following block", state.instruction_position);
                    goto next_block;
                }
                // Must be able to pull a single code point from this compare (i.e. a single 1-long range, no negations, no char classes, and no unicode properties)
                if (compares.ranges.size() != 1 || !compares.negated_ranges.is_empty()
                    || !compares.char_classes.is_empty() || !compares.negated_char_classes.is_empty()
                    || compares.has_any_unicode_property
                    || !compares.unicode_general_categories.is_empty()
                    || !compares.unicode_properties.is_empty()
                    || !compares.unicode_scripts.is_empty()
                    || !compares.unicode_script_extensions.is_empty()
                    || !compares.negated_unicode_general_categories.is_empty()
                    || !compares.negated_unicode_properties.is_empty()
                    || !compares.negated_unicode_scripts.is_empty()
                    || !compares.negated_unicode_script_extensions.is_empty()) {
                    dbgln_if(REGEX_DEBUG, "compares at {} in following block are too complex to rewrite as SeekTo", state.instruction_position);
                    goto next_block;
                }
                auto it = compares.ranges.begin();
                if (it.key() != *it) {
                    // Not a single code point
                    dbgln_if(REGEX_DEBUG, "compares at {} in following block are a range, not a single code point ({}..{})", state.instruction_position, it.key(), *it);
                    goto next_block;
                }
                auto seeked_code_point = it.key();
                candidates.set(fork_ip, { fork_ip, checkpoint_ip, compare_ip, jump_ip, following_block.start, checkpoint_id, seeked_code_point });
                dbgln_if(REGEX_DEBUG, "Found sequence from {} to {} followed by Compare '{}' at {}, can rewrite as SeekTo",
                    fork_ip, jump_ip + 4, (char)seeked_code_point, state.instruction_position);
                goto next_block;
            }
            default:
                dbgln_if(REGEX_DEBUG, "Hit non-matching, non-skippable opcode {} at {} in following block", (int)op.opcode_id(), state.instruction_position);
                goto next_block;
            }
        }
    next_block:
        continue;
    }
    dbgln_if(REGEX_DEBUG, "Found {} dot-star sequences to rewrite as SeekTo", candidates.size());
    if (candidates.is_empty())
        return;

    BytecodeRewriter rewriter(bytecode, pattern_value);

    struct Range {
        size_t start_ip;
        size_t end_ip;
    };

    Vector<Range> ranges_to_skip;
    Vector<ByteCode> replacements;
    ranges_to_skip.ensure_capacity(candidates.size());
    replacements.ensure_capacity(candidates.size());

    for (auto& [_, candidate] : candidates) {
        // NOTE(review): the +4/-4 constants assume JumpNonEmpty and the
        // RSeekTo+ForkStay pair each occupy 4 bytecode words — confirm against
        // the opcode encodings if those ever change.
        ranges_to_skip.empend(candidate.fork_ip, candidate.jump_ip + 4); // JumpNonEmpty = 4
        ByteCode replacement;
        replacement.empend(static_cast<ByteCodeValueType>(OpCodeId::RSeekTo));
        replacement.empend(candidate.seek_code_point);
        replacement.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
        replacement.empend(static_cast<ByteCodeValueType>(-4)); // Offset back to RSeekTo
        replacements.append(move(replacement));
    }

    parser_result.bytecode = rewriter.rebuild(bytecode, ranges_to_skip.span(), replacements.span());

    if constexpr (REGEX_DEBUG) {
        dbgln("After dot-star rewrite as SeekTo:");
        RegexDebug<ByteCode> dbg;
        dbg.print_bytecode(*this);
    }
}
2025-12-22 14:00:19 +01:00
template<typename Parser>
void Regex<Parser>::rewrite_simple_compares(BasicBlockList const& basic_blocks)
{
    // If a Compare opcode only has a single compare and that's a match opcode
    // we can rewrite it as a CompareSimple to avoid the overhead of handling multiple compares:
    // Compare argc=1 args=S
    //     Char 'a'
    // -->
    // CompareSimple args=S
    //     Char 'a'
    auto& bytecode = parser_result.bytecode.get<ByteCode>();
    if (basic_blocks.is_empty())
        return;

    struct SimpleCompareCandidate {
        size_t compare_ip;
        Vector<ByteCodeValueType> compare_data;
    };
    Vector<SimpleCompareCandidate> candidates;

    auto state = MatchState::only_for_enumeration();
    for (auto const& block : basic_blocks) {
        for (state.instruction_position = block.start; state.instruction_position <= block.end;) {
            auto ip = state.instruction_position;
            auto& opcode = bytecode.get_opcode(state);
            if (opcode.opcode_id() == OpCodeId::Compare) {
                auto& compare = to<OpCode_Compare>(opcode);
                auto flat_compares = compare.flat_compares();
                // Combinator / negation / undefined compares still need the generic handler.
                auto is_plain_single_compare = flat_compares.size() == 1
                    && !first_is_one_of(flat_compares[0].type,
                        CharacterCompareType::And, CharacterCompareType::Or,
                        CharacterCompareType::Inverse, CharacterCompareType::TemporaryInverse,
                        CharacterCompareType::Subtract, CharacterCompareType::Undefined);
                if (is_plain_single_compare) {
                    // Copy the compare's payload; +2 skips the opcode id and argc slots.
                    auto payload = bytecode.spans().slice(ip + 2, opcode.size() - 2);
                    Vector<ByteCodeValueType> copied_payload;
                    copied_payload.ensure_capacity(payload.size());
                    for (auto value : payload)
                        copied_payload.append(value);
                    candidates.append({ ip, move(copied_payload) });
                }
            }
            state.instruction_position += opcode.size();
        }
    }

    if (candidates.is_empty())
        return;

    dbgln_if(REGEX_DEBUG, "Found {} simple compare candidates to rewrite", candidates.size());

    BytecodeRewriter rewriter(bytecode, pattern_value);

    // Drop every original Compare; the CompareSimple replacement is emitted in
    // its place by insert_replacement below.
    for (auto& candidate : candidates) {
        auto& instruction = *rewriter.instructions.find_if([&](auto& entry) { return entry.old_ip == candidate.compare_ip; });
        instruction.skip = true;
    }

    // Candidates were collected in instruction order, so a single cursor into
    // the list suffices while the rewriter walks the bytecode.
    size_t next_candidate = 0;
    auto insert_replacement = [&](auto const& instruction, ByteCode& result) {
        for (; next_candidate < candidates.size(); ++next_candidate) {
            auto& candidate = candidates[next_candidate];
            if (instruction.old_ip < candidate.compare_ip)
                return;
            if (instruction.old_ip == candidate.compare_ip) {
                result.empend(static_cast<ByteCodeValueType>(OpCodeId::CompareSimple));
                result.extend(move(candidate.compare_data));
                ++next_candidate;
                return;
            }
        }
    };
    parser_result.bytecode = rewriter.rebuild(bytecode, move(insert_replacement));

    if constexpr (REGEX_DEBUG) {
        dbgln("After simple compare rewrite:");
        RegexDebug<ByteCode> dbg;
        dbg.print_bytecode(*this);
    }
}
2021-09-13 23:45:22 +04:30
void Optimizer : : append_alternation ( ByteCode & target , ByteCode & & left , ByteCode & & right )
2021-09-12 17:30:27 +04:30
{
2022-02-20 03:25:21 +03:30
Array < ByteCode , 2 > alternatives ;
alternatives [ 0 ] = move ( left ) ;
alternatives [ 1 ] = move ( right ) ;
append_alternation ( target , alternatives ) ;
}
2023-07-28 21:02:34 +03:30
// Trie expects a map template taking exactly (K, V, KTraits); this alias binds
// OrderedHashMap's remaining defaulted parameters so it matches that arity.
template<typename K, typename V, typename KTraits>
using OrderedHashMapForTrie = OrderedHashMap<K, V, KTraits>;
2022-02-20 03:25:21 +03:30
void Optimizer : : append_alternation ( ByteCode & target , Span < ByteCode > alternatives )
{
2025-05-17 18:01:44 +02:00
// Assume we have N alternatives A0..AN, each with M basic blocks bb0..bbM, each with I instructions 0..I (denoted Ai.bbj[k])
// We can create the alternation is two ways:
// - Lay them out sequentially, such that A0 is tried, then A1, then A2, etc.
// - Generate a prefix tree for A*.bb*[*], and walk the tree at runtime.
// For the first case, assuming we have two A0.bb0[0..2] and A1.bb0[0..2]:
// out.bb0:
// ForkStay out.bb1
// A0.bb0[*]
// Jump out.bb2
// out.bb1:
// A1.bb0[*]
// out.bb2:
// <end>
// For the second case, assuming the following alternatives:
// A0.bb0:
// Compare 'a'
// Compare 'b'
// Compare 'd'
// A1.bb0:
// Compare 'a'
// Compare 'c'
// Compare 'd'
// We can first generate a prefix tree (trie here), with each node denoted by [insn, insn*]:
// (root)
// |- [A0.bb0[0], A1.bb0[0]]
// | |- [A0.bb0[1]]
// | | |- [A0.bb0[2]]
// | |- [A1.bb0[1]]
// | | |- [A1.bb0[2]]
// i.e. the first instruction of A0 and A1 are the same, so we can merge them into one node;
// everything following that is different (A1.bb0[2] is not considered equivalent to A0.bb0[2] as they are jumped-to by different instructions,
// in this case their previous instruction)
// Then, each trie node N { insn, children } can be represented as:
// out for N:
// N.insn[*]
// ForkJump out for N.children[0]
// ForkJump out for N.children[1]
// ...
// or if there's a single child, we can directly jump to it:
// out for N: // if N.children.size() == 1
// N.insn[*]
// Jump out for N.children[0]
// For our example, this would yield:
// out for root:
// Jump out for [A0.bb0[0], A1.bb0[0]]
// out for [A0.bb0[0], A1.bb0[0]]:
// Compare 'a'
// ForkJump out for A0.bb0[1]
// ForkJump out for A1.bb0[1]
// out for A0.bb0[1]:
// Compare 'b'
// Jump out for A0.bb0[2]
// out for A1.bb0[1]:
// Compare 'c'
// Jump out for A1.bb0[2]
// out for A0.bb0[2]:
// Compare 'd'
// out for A1.bb0[2]:
// Compare 'd'
2022-02-20 03:25:21 +03:30
if ( alternatives . size ( ) = = 0 )
return ;
if ( alternatives . size ( ) = = 1 )
return target . extend ( move ( alternatives [ 0 ] ) ) ;
2025-04-06 02:10:01 +02:00
target . merge_string_tables_from ( alternatives ) ;
2022-02-20 03:25:21 +03:30
if ( all_of ( alternatives , [ ] ( auto & x ) { return x . is_empty ( ) ; } ) )
2021-09-12 17:30:27 +04:30
return ;
2022-02-20 03:25:21 +03:30
for ( auto & entry : alternatives )
entry . flatten ( ) ;
# if REGEX_DEBUG
ScopeLogger < true > log ;
warnln ( " Alternations: " ) ;
2025-12-16 00:27:18 +01:00
RegexDebug < ByteCode > dbg ;
2022-02-20 03:25:21 +03:30
for ( auto & entry : alternatives ) {
warnln ( " ---------- " ) ;
dbg . print_bytecode ( entry ) ;
2021-09-12 17:30:27 +04:30
}
2022-02-20 03:25:21 +03:30
ScopeGuard print_at_end {
[ & ] {
warnln ( " ====================== " ) ;
2025-12-16 00:27:18 +01:00
RegexDebug < ByteCode > dbg ;
2022-02-20 03:25:21 +03:30
dbg . print_bytecode ( target ) ;
}
} ;
# endif
2021-09-12 17:30:27 +04:30
2023-07-28 21:02:34 +03:30
// First, find incoming jump edges.
// We need them for two reasons:
// - We need to distinguish between insn-A-jumped-to-by-insn-B and insn-A-jumped-to-by-insn-C (as otherwise we'd break trie invariants)
// - We need to know which jumps to patch when we're done
2021-12-15 08:52:07 +03:30
2023-07-28 21:02:34 +03:30
struct JumpEdge {
Span < ByteCodeValueType const > jump_insn ;
} ;
Vector < HashMap < size_t , Vector < JumpEdge > > > incoming_jump_edges_for_each_alternative ;
incoming_jump_edges_for_each_alternative . resize ( alternatives . size ( ) ) ;
2021-12-15 08:52:07 +03:30
2023-08-05 17:32:47 +03:30
auto has_any_backwards_jump = false ;
2025-04-15 15:31:08 +02:00
auto state = MatchState : : only_for_enumeration ( ) ;
2022-02-20 03:25:21 +03:30
2023-07-28 21:02:34 +03:30
for ( size_t i = 0 ; i < alternatives . size ( ) ; + + i ) {
auto & alternative = alternatives [ i ] ;
// Add a jump to the "end" of the block; this is implicit in the bytecode, but we need it to be explicit in the trie.
// Jump{offset=0}
alternative . append ( static_cast < ByteCodeValueType > ( OpCodeId : : Jump ) ) ;
alternative . append ( 0 ) ;
2021-09-12 17:30:27 +04:30
2023-07-28 21:02:34 +03:30
auto & incoming_jump_edges = incoming_jump_edges_for_each_alternative [ i ] ;
auto alternative_bytes = alternative . spans < 1 > ( ) . singular_span ( ) ;
for ( state . instruction_position = 0 ; state . instruction_position < alternative . size ( ) ; ) {
auto & opcode = alternative . get_opcode ( state ) ;
auto opcode_bytes = alternative_bytes . slice ( state . instruction_position , opcode . size ( ) ) ;
switch ( opcode . opcode_id ( ) ) {
2025-07-29 01:25:28 +12:00
case OpCodeId : : Jump : {
2025-12-16 00:27:18 +01:00
auto const & cast_opcode = to < OpCode_Jump > ( opcode ) ;
2025-07-29 01:25:28 +12:00
incoming_jump_edges . ensure ( cast_opcode . offset ( ) + cast_opcode . size ( ) + state . instruction_position ) . append ( { opcode_bytes } ) ;
has_any_backwards_jump | = cast_opcode . offset ( ) < 0 ;
2023-07-28 21:02:34 +03:30
break ;
2025-07-29 01:25:28 +12:00
}
case OpCodeId : : JumpNonEmpty : {
2025-12-16 00:27:18 +01:00
auto const & cast_opcode = to < OpCode_JumpNonEmpty > ( opcode ) ;
2025-07-29 01:25:28 +12:00
incoming_jump_edges . ensure ( cast_opcode . offset ( ) + cast_opcode . size ( ) + state . instruction_position ) . append ( { opcode_bytes } ) ;
has_any_backwards_jump | = cast_opcode . offset ( ) < 0 ;
2023-07-28 21:02:34 +03:30
break ;
2025-07-29 01:25:28 +12:00
}
case OpCodeId : : ForkJump : {
2025-12-16 00:27:18 +01:00
auto const & cast_opcode = to < OpCode_ForkJump > ( opcode ) ;
2025-07-29 01:25:28 +12:00
incoming_jump_edges . ensure ( cast_opcode . offset ( ) + cast_opcode . size ( ) + state . instruction_position ) . append ( { opcode_bytes } ) ;
has_any_backwards_jump | = cast_opcode . offset ( ) < 0 ;
2023-07-28 21:02:34 +03:30
break ;
2025-07-29 01:25:28 +12:00
}
case OpCodeId : : ForkStay : {
2025-12-16 00:27:18 +01:00
auto const & cast_opcode = to < OpCode_ForkStay > ( opcode ) ;
2025-07-29 01:25:28 +12:00
incoming_jump_edges . ensure ( cast_opcode . offset ( ) + cast_opcode . size ( ) + state . instruction_position ) . append ( { opcode_bytes } ) ;
has_any_backwards_jump | = cast_opcode . offset ( ) < 0 ;
2023-07-28 21:02:34 +03:30
break ;
2025-07-29 01:25:28 +12:00
}
case OpCodeId : : ForkReplaceJump : {
2025-12-16 00:27:18 +01:00
auto const & cast_opcode = to < OpCode_ForkReplaceJump > ( opcode ) ;
2025-07-29 01:25:28 +12:00
incoming_jump_edges . ensure ( cast_opcode . offset ( ) + cast_opcode . size ( ) + state . instruction_position ) . append ( { opcode_bytes } ) ;
has_any_backwards_jump | = cast_opcode . offset ( ) < 0 ;
2023-07-28 21:02:34 +03:30
break ;
2025-07-29 01:25:28 +12:00
}
case OpCodeId : : ForkReplaceStay : {
2025-12-16 00:27:18 +01:00
auto const & cast_opcode = to < OpCode_ForkReplaceStay > ( opcode ) ;
2025-07-29 01:25:28 +12:00
incoming_jump_edges . ensure ( cast_opcode . offset ( ) + cast_opcode . size ( ) + state . instruction_position ) . append ( { opcode_bytes } ) ;
has_any_backwards_jump | = cast_opcode . offset ( ) < 0 ;
2023-07-28 21:02:34 +03:30
break ;
2025-07-29 01:25:28 +12:00
}
2025-12-31 07:50:52 +01:00
case OpCodeId : : ForkIf : {
auto const & cast_opcode = to < OpCode_ForkIf > ( opcode ) ;
incoming_jump_edges . ensure ( cast_opcode . offset ( ) + cast_opcode . size ( ) + state . instruction_position ) . append ( { opcode_bytes } ) ;
has_any_backwards_jump | = cast_opcode . offset ( ) < 0 ;
break ;
}
2025-07-29 01:25:28 +12:00
case OpCodeId : : Repeat : {
2025-12-16 00:27:18 +01:00
auto const & cast_opcode = to < OpCode_Repeat > ( opcode ) ;
2025-07-29 01:25:28 +12:00
incoming_jump_edges . ensure ( state . instruction_position - cast_opcode . offset ( ) ) . append ( { opcode_bytes } ) ;
2023-08-05 17:32:47 +03:30
has_any_backwards_jump = true ;
2023-07-28 21:02:34 +03:30
break ;
2025-07-29 01:25:28 +12:00
}
2023-07-28 21:02:34 +03:30
default :
2022-02-20 03:25:21 +03:30
break ;
}
2023-07-28 21:02:34 +03:30
state . instruction_position + = opcode . size ( ) ;
2023-06-13 23:25:00 +03:30
}
}
2023-07-28 21:02:34 +03:30
struct QualifiedIP {
size_t alternative_index ;
size_t instruction_position ;
2025-11-08 10:59:02 +01:00
bool operator = = ( QualifiedIP const & other ) const = default ;
2023-07-28 21:02:34 +03:30
} ;
2025-05-17 21:50:37 +02:00
struct NodeMetadataEntry {
QualifiedIP ip ;
NonnullOwnPtr < StaticallyInterpretedCompares > first_compare_from_here ;
} ;
using Tree = Trie < DisjointSpans < ByteCodeValueType const > , Vector < NodeMetadataEntry > , Traits < DisjointSpans < ByteCodeValueType const > > , void , OrderedHashMapForTrie > ;
2023-07-28 21:02:34 +03:30
Tree trie { { } } ; // Root node is empty, key{ instruction_bytes, dependent_instruction_bytes... } -> IP
size_t common_hits = 0 ;
size_t total_nodes = 0 ;
size_t total_bytecode_entries_in_tree = 0 ;
for ( size_t i = 0 ; i < alternatives . size ( ) ; + + i ) {
auto & alternative = alternatives [ i ] ;
auto & incoming_jump_edges = incoming_jump_edges_for_each_alternative [ i ] ;
auto * active_node = & trie ;
auto alternative_span = alternative . spans < 1 > ( ) . singular_span ( ) ;
for ( state . instruction_position = 0 ; state . instruction_position < alternative_span . size ( ) ; ) {
total_nodes + = 1 ;
auto & opcode = alternative . get_opcode ( state ) ;
auto opcode_bytes = alternative_span . slice ( state . instruction_position , opcode . size ( ) ) ;
Vector < Span < ByteCodeValueType const > > node_key_bytes ;
node_key_bytes . append ( opcode_bytes ) ;
if ( auto edges = incoming_jump_edges . get ( state . instruction_position ) ; edges . has_value ( ) ) {
for ( auto & edge : * edges )
node_key_bytes . append ( edge . jump_insn ) ;
2023-06-13 23:25:00 +03:30
}
2021-09-12 17:30:27 +04:30
2025-11-20 23:47:21 +01:00
active_node = static_cast < decltype ( active_node ) > ( MUST ( active_node - > ensure_child ( DisjointSpans < ByteCodeValueType const > { move ( node_key_bytes ) } , [ ] - > Vector < NodeMetadataEntry > { return { } ; } ) ) ) ;
2021-09-12 17:30:27 +04:30
2025-05-17 21:50:37 +02:00
auto next_compare = [ & alternative , & state ] ( StaticallyInterpretedCompares & compares ) {
TemporaryChange state_change { state . instruction_position , state . instruction_position } ;
auto * opcode = & alternative . get_opcode ( state ) ;
auto opcode_id = opcode - > opcode_id ( ) ;
while ( opcode_id = = OpCodeId : : Checkpoint | | opcode_id = = OpCodeId : : SaveLeftCaptureGroup
| | opcode_id = = OpCodeId : : SaveRightCaptureGroup | | opcode_id = = OpCodeId : : SaveRightNamedCaptureGroup
| | opcode_id = = OpCodeId : : Save ) {
state . instruction_position + = opcode - > size ( ) ;
opcode = & alternative . get_opcode ( state ) ;
opcode_id = opcode - > opcode_id ( ) ;
}
// We found something functional, if it's a compare, we need to care.
if ( opcode_id ! = OpCodeId : : Compare )
return ;
2025-12-16 00:27:18 +01:00
auto flat_compares = to < OpCode_Compare > ( * opcode ) . flat_compares ( ) ;
2025-05-17 21:50:37 +02:00
interpret_compares ( flat_compares , compares ) ;
} ;
auto node_metadata = NodeMetadataEntry { { i , state . instruction_position } , make < StaticallyInterpretedCompares > ( ) } ;
2025-11-08 10:59:02 +01:00
auto & metadata = active_node - > metadata_value ( ) ;
if ( metadata . is_empty ( ) )
2023-07-28 21:02:34 +03:30
total_bytecode_entries_in_tree + = opcode . size ( ) ;
2025-11-08 10:59:02 +01:00
else
common_hits + + ;
metadata . append ( move ( node_metadata ) ) ;
2025-05-17 21:50:37 +02:00
next_compare ( * active_node - > metadata_value ( ) . last ( ) . first_compare_from_here ) ;
2023-07-28 21:02:34 +03:30
state . instruction_position + = opcode . size ( ) ;
2022-02-20 03:25:21 +03:30
}
2021-09-13 23:45:22 +04:30
}
2021-09-12 17:30:27 +04:30
2023-07-28 21:02:34 +03:30
if constexpr ( REGEX_DEBUG ) {
Function < void ( decltype ( trie ) & , size_t ) > print_tree = [ & ] ( decltype ( trie ) & node , size_t indent = 0 ) mutable {
2023-12-16 17:49:34 +03:30
ByteString name = " (no ip) " ;
ByteString insn ;
2023-07-28 21:02:34 +03:30
if ( node . has_metadata ( ) ) {
2023-12-16 17:49:34 +03:30
name = ByteString : : formatted (
2023-07-28 21:02:34 +03:30
" {}@{} ({} node{}) " ,
2025-05-17 21:50:37 +02:00
node . metadata_value ( ) . first ( ) . ip . instruction_position ,
node . metadata_value ( ) . first ( ) . ip . alternative_index ,
2023-07-28 21:02:34 +03:30
node . metadata_value ( ) . size ( ) ,
node . metadata_value ( ) . size ( ) = = 1 ? " " : " s " ) ;
2025-04-15 15:31:08 +02:00
auto state = MatchState : : only_for_enumeration ( ) ;
2025-05-17 21:50:37 +02:00
state . instruction_position = node . metadata_value ( ) . first ( ) . ip . instruction_position ;
auto & opcode = alternatives [ node . metadata_value ( ) . first ( ) . ip . alternative_index ] . get_opcode ( state ) ;
2023-12-16 17:49:34 +03:30
insn = ByteString : : formatted ( " {} {} " , opcode . to_byte_string ( ) , opcode . arguments_string ( ) ) ;
2023-07-28 21:02:34 +03:30
}
dbgln ( " {:->{}}| {} -- {} " , " " , indent * 2 , name , insn ) ;
for ( auto & child : node . children ( ) )
print_tree ( static_cast < decltype ( trie ) & > ( * child . value ) , indent + 1 ) ;
} ;
2021-09-12 17:30:27 +04:30
2023-07-28 21:02:34 +03:30
print_tree ( trie , 0 ) ;
2022-02-20 03:25:21 +03:30
}
2021-09-12 17:30:27 +04:30
2023-07-28 21:02:34 +03:30
// This is really only worth it if we don't blow up the size by the 2-extra-instruction-per-node scheme, similarly, if no nodes are shared, we're better off not using a tree.
auto tree_cost = ( total_nodes - common_hits ) * 2 ;
2025-04-23 14:20:48 +02:00
auto chain_cost = total_bytecode_entries_in_tree + alternatives . size ( ) * 2 ;
2023-07-28 21:02:34 +03:30
dbgln_if ( REGEX_DEBUG , " Total nodes: {}, common hits: {} (tree cost = {}, chain cost = {}) " , total_nodes , common_hits , tree_cost , chain_cost ) ;
2025-05-17 21:50:37 +02:00
// Make sure we're not breaking the order requirements (a should be tried before b in a|b)
Queue < Tree : : DetailTrie * > nodes_to_visit ;
nodes_to_visit . enqueue ( & trie ) ;
while ( ! nodes_to_visit . is_empty ( ) ) {
auto & node = * nodes_to_visit . dequeue ( ) ;
auto & children = node . children ( ) ;
for ( auto & entry : children )
nodes_to_visit . enqueue ( entry . value . ptr ( ) ) ;
// If the children are not sorted right, we've got a problem.
if ( children . size ( ) < = 1 )
continue ;
size_t max_index = 0 ;
NodeMetadataEntry const * child_with_max_index = nullptr ;
for ( auto & entry : children ) {
auto & child = * entry . value ;
if ( child . has_metadata ( ) ) {
for ( auto & child_entry : child . metadata_value ( ) ) {
if ( max_index > child_entry . ip . alternative_index ) {
// We have a problem, an alternative later in the list is being tried before an earlier one.
// we can't use this trie...unless the first compare in this child is not the same as the one in the entry with max-index
// then there's no overlap and the order doesn't matter anyhow.
if ( ! has_overlap ( * child_with_max_index - > first_compare_from_here , * child_entry . first_compare_from_here ) ) {
// We can use this trie after all.
continue ;
}
tree_cost = NumericLimits < size_t > : : max ( ) ;
goto exit_useless_loop ;
}
max_index = child_entry . ip . alternative_index ;
child_with_max_index = & child_entry ;
}
}
}
continue ;
exit_useless_loop :
break ;
}
2023-07-28 21:02:34 +03:30
if ( common_hits = = 0 | | tree_cost > chain_cost ) {
2026-01-06 18:10:48 +01:00
dbgln_if ( REGEX_DEBUG , " Choosing sequential alternation layout over trie-based layout " ) ;
2023-07-28 21:02:34 +03:30
// It's better to lay these out as a normal sequence of instructions.
2025-12-31 07:50:52 +01:00
// We can avoid trying alternatives that we know cannot match in certain cases:
// - If the alternative starts with an assertion, we can lift the assertion to the fork op itself (currently only ^).
Vector < ForkIfCondition > fork_conditions ;
fork_conditions . resize_with_default_value ( alternatives . size ( ) , ForkIfCondition : : Invalid ) ;
for ( size_t i = 0 ; i < alternatives . size ( ) ; + + i ) {
auto & alternative = alternatives [ i ] ;
auto state = MatchState : : only_for_enumeration ( ) ;
state . instruction_position = 0 ;
auto & first_opcode = alternative . get_opcode ( state ) ;
if ( first_opcode . opcode_id ( ) = = OpCodeId : : CheckBegin )
fork_conditions [ i ] = ForkIfCondition : : AtStartOfLine ;
2022-02-20 03:25:21 +03:30
}
2021-09-12 17:30:27 +04:30
2025-12-31 07:50:52 +01:00
Vector < size_t > jump_op_positions ;
Vector < size_t > jump_sizes ;
jump_op_positions . resize ( alternatives . size ( ) - 1 ) ;
jump_sizes . resize ( alternatives . size ( ) - 1 ) ;
for ( size_t i = 1 ; i < alternatives . size ( ) ; + + i ) {
jump_op_positions [ i - 1 ] = target . size ( ) ;
if ( fork_conditions [ i - 1 ] ! = ForkIfCondition : : Invalid ) {
jump_sizes [ i - 1 ] = 4 ;
target . empend ( static_cast < ByteCodeValueType > ( OpCodeId : : ForkIf ) ) ;
target . empend ( 0u ) ; // To be filled later.
target . empend ( static_cast < ByteCodeValueType > ( OpCodeId : : ForkJump ) ) ;
target . empend ( static_cast < ByteCodeValueType > ( fork_conditions [ i - 1 ] ) ) ;
} else {
jump_sizes [ i - 1 ] = 2 ;
target . empend ( static_cast < ByteCodeValueType > ( OpCodeId : : ForkJump ) ) ;
target . empend ( 0u ) ; // To be filled later.
2023-07-28 21:02:34 +03:30
}
2025-12-31 07:50:52 +01:00
}
2022-02-20 03:25:21 +03:30
2025-12-31 07:50:52 +01:00
bool seen_one_empty = false ;
2022-02-20 03:25:21 +03:30
2025-12-31 07:50:52 +01:00
Vector < size_t > jump_to_end_patch_positions ;
jump_to_end_patch_positions . resize_with_default_value ( alternatives . size ( ) , NumericLimits < size_t > : : max ( ) ) ;
2021-09-12 17:30:27 +04:30
2023-07-28 21:02:34 +03:30
for ( size_t i = alternatives . size ( ) ; i > 0 ; - - i ) {
auto & chunk = alternatives [ i - 1 ] ;
if ( chunk . is_empty ( ) ) {
if ( seen_one_empty )
2022-02-20 03:25:21 +03:30
continue ;
2023-07-28 21:02:34 +03:30
seen_one_empty = true ;
}
2025-12-31 07:50:52 +01:00
if ( i < alternatives . size ( ) ) {
auto this_block_start = target . size ( ) ;
auto position = jump_op_positions [ i - 1 ] ;
auto jump_size = jump_sizes [ i - 1 ] ;
target [ position + 1 ] = static_cast < ByteCodeValueType > ( this_block_start - position - jump_size ) ;
2023-07-28 21:02:34 +03:30
}
target . extend ( move ( chunk ) ) ;
target . empend ( static_cast < ByteCodeValueType > ( OpCodeId : : Jump ) ) ;
2025-12-31 07:50:52 +01:00
target . empend ( 0u ) ; // Jump to the _END label
jump_to_end_patch_positions [ i - 1 ] = target . size ( ) - 1 ;
}
auto end_position = target . size ( ) ;
for ( size_t i = 0 ; i < alternatives . size ( ) ; + + i ) {
if ( auto & position = jump_to_end_patch_positions [ i ] ; position ! = NumericLimits < size_t > : : max ( ) )
target [ position ] = static_cast < ByteCodeValueType > ( end_position - ( position + 1 ) ) ;
2023-07-28 21:02:34 +03:30
}
} else {
2026-01-06 18:10:48 +01:00
dbgln_if ( REGEX_DEBUG , " Choosing trie-based alternation layout " ) ;
2023-07-28 21:02:34 +03:30
target . ensure_capacity ( total_bytecode_entries_in_tree + common_hits * 6 ) ;
auto node_is = [ ] ( Tree const * node , QualifiedIP ip ) {
2025-11-08 10:59:02 +01:00
return node - > metadata_value ( ) . span ( ) . first_matching ( [ & ] ( auto & entry ) { return entry . ip = = ip ; } ) . has_value ( ) ;
2023-07-28 21:02:34 +03:30
} ;
struct Patch {
QualifiedIP source_ip ;
size_t target_ip ;
2026-01-06 18:10:48 +01:00
size_t size_delta { 0 } ;
2023-07-28 21:02:34 +03:30
bool done { false } ;
} ;
Vector < Patch > patch_locations ;
patch_locations . ensure_capacity ( total_nodes ) ;
2023-08-05 17:32:47 +03:30
HashMap < size_t , NonnullOwnPtr < RedBlackTree < u64 , u64 > > > instruction_positions ;
if ( has_any_backwards_jump )
MUST ( instruction_positions . try_ensure_capacity ( alternatives . size ( ) ) ) ;
auto ip_mapping_for_alternative = [ & ] ( size_t i ) - > RedBlackTree < u64 , u64 > & {
return * instruction_positions . ensure ( i , [ ] {
return make < RedBlackTree < u64 , u64 > > ( ) ;
} ) ;
} ;
2025-04-23 18:20:49 +02:00
auto add_patch_point = [ & ] ( Tree const * node , size_t target_ip ) {
if ( ! node - > has_metadata ( ) )
return ;
2025-05-17 21:50:37 +02:00
patch_locations . append ( { node - > metadata_value ( ) . first ( ) . ip , target_ip } ) ;
2025-04-23 18:20:49 +02:00
} ;
Vector < Tree * > nodes_to_visit ;
nodes_to_visit . append ( & trie ) ;
2023-07-28 21:02:34 +03:30
// each node:
// node.re
// forkjump child1
// forkjump child2
// ...
while ( ! nodes_to_visit . is_empty ( ) ) {
2025-02-24 08:41:04 +01:00
auto const * node = nodes_to_visit . take_last ( ) ;
2023-07-28 21:02:34 +03:30
for ( auto & patch : patch_locations ) {
if ( ! patch . done & & node_is ( node , patch . source_ip ) ) {
2026-01-06 18:10:48 +01:00
auto value = static_cast < ByteCodeValueType > ( target . size ( ) - patch . target_ip - 1 - patch . size_delta ) ;
2024-11-17 16:01:15 +01:00
if ( value = = 0 )
target [ patch . target_ip - 1 ] = static_cast < ByteCodeValueType > ( OpCodeId : : Jump ) ;
2023-07-28 21:02:34 +03:30
target [ patch . target_ip ] = value ;
patch . done = true ;
}
}
if ( ! node - > value ( ) . individual_spans ( ) . is_empty ( ) ) {
auto insn_bytes = node - > value ( ) . individual_spans ( ) . first ( ) ;
target . ensure_capacity ( target . size ( ) + insn_bytes . size ( ) ) ;
state . instruction_position = target . size ( ) ;
target . append ( insn_bytes ) ;
2023-08-05 17:32:47 +03:30
if ( has_any_backwards_jump ) {
2025-05-17 21:50:37 +02:00
for ( auto & entry : node - > metadata_value ( ) )
ip_mapping_for_alternative ( entry . ip . alternative_index ) . insert ( entry . ip . instruction_position , state . instruction_position ) ;
2023-08-05 17:32:47 +03:30
}
2023-07-28 21:02:34 +03:30
auto & opcode = target . get_opcode ( state ) ;
ssize_t jump_offset ;
auto is_jump = true ;
auto patch_location = state . instruction_position + 1 ;
2024-12-11 10:56:32 +01:00
bool should_negate = false ;
2026-01-06 18:10:48 +01:00
size_t size_delta = opcode . size ( ) - 2 ;
2023-07-28 21:02:34 +03:30
switch ( opcode . opcode_id ( ) ) {
case OpCodeId : : Jump :
2025-12-16 00:27:18 +01:00
jump_offset = to < OpCode_Jump > ( opcode ) . offset ( ) ;
2023-07-28 21:02:34 +03:30
break ;
case OpCodeId : : JumpNonEmpty :
2025-12-16 00:27:18 +01:00
jump_offset = to < OpCode_JumpNonEmpty > ( opcode ) . offset ( ) ;
2023-07-28 21:02:34 +03:30
break ;
case OpCodeId : : ForkJump :
2025-12-16 00:27:18 +01:00
jump_offset = to < OpCode_ForkJump > ( opcode ) . offset ( ) ;
2023-07-28 21:02:34 +03:30
break ;
case OpCodeId : : ForkStay :
2025-12-16 00:27:18 +01:00
jump_offset = to < OpCode_ForkStay > ( opcode ) . offset ( ) ;
2023-07-28 21:02:34 +03:30
break ;
case OpCodeId : : ForkReplaceJump :
2025-12-16 00:27:18 +01:00
jump_offset = to < OpCode_ForkReplaceJump > ( opcode ) . offset ( ) ;
2023-07-28 21:02:34 +03:30
break ;
case OpCodeId : : ForkReplaceStay :
2025-12-16 00:27:18 +01:00
jump_offset = to < OpCode_ForkReplaceStay > ( opcode ) . offset ( ) ;
2023-07-28 21:02:34 +03:30
break ;
2025-12-31 07:50:52 +01:00
case OpCodeId : : ForkIf :
jump_offset = to < OpCode_ForkIf > ( opcode ) . offset ( ) ;
break ;
2023-07-28 21:02:34 +03:30
case OpCodeId : : Repeat :
2025-12-16 00:27:18 +01:00
jump_offset = static_cast < ssize_t > ( 0 ) - static_cast < ssize_t > ( to < OpCode_Repeat > ( opcode ) . offset ( ) ) - static_cast < ssize_t > ( opcode . size ( ) ) ;
2024-12-11 10:56:32 +01:00
should_negate = true ;
2023-07-28 21:02:34 +03:30
break ;
default :
is_jump = false ;
break ;
}
if ( is_jump ) {
VERIFY ( node - > has_metadata ( ) ) ;
2025-04-23 18:20:49 +02:00
if ( node - > metadata_value ( ) . size ( ) > 1 )
target [ patch_location ] = static_cast < ByteCodeValueType > ( 0 ) ; // Fall through instead.
auto only_one = node - > metadata_value ( ) . size ( ) = = 1 ;
auto patch_size = opcode . size ( ) - 1 ;
2025-05-17 21:50:37 +02:00
for ( auto & entry : node - > metadata_value ( ) ) {
auto & [ alternative_index , instruction_position ] = entry . ip ;
2025-04-23 18:20:49 +02:00
if ( ! only_one ) {
target . append ( static_cast < ByteCodeValueType > ( OpCodeId : : ForkJump ) ) ;
patch_location = target . size ( ) ;
should_negate = false ;
2025-04-25 20:43:33 +02:00
patch_size = 1 ;
2025-04-23 18:20:49 +02:00
target . append ( static_cast < ByteCodeValueType > ( 0 ) ) ;
}
2023-08-05 17:32:47 +03:30
2025-04-23 18:20:49 +02:00
auto intended_jump_ip = instruction_position + jump_offset + opcode . size ( ) ;
if ( jump_offset < 0 ) {
VERIFY ( has_any_backwards_jump ) ;
// We should've already seen this instruction, so we can just patch it in.
auto & ip_mapping = ip_mapping_for_alternative ( alternative_index ) ;
auto target_ip = ip_mapping . find ( intended_jump_ip ) ;
if ( ! target_ip ) {
2025-12-16 00:27:18 +01:00
RegexDebug < ByteCode > dbg ;
2025-04-23 18:20:49 +02:00
size_t x = 0 ;
for ( auto & entry : alternatives ) {
warnln ( " ----------- {} ---------- " , x + + ) ;
dbg . print_bytecode ( entry ) ;
}
dbgln ( " Regex Tree / Unknown backwards jump: {}@{} -> {} " ,
instruction_position ,
alternative_index ,
intended_jump_ip ) ;
VERIFY_NOT_REACHED ( ) ;
}
ssize_t target_value = * target_ip - patch_location - patch_size ;
if ( should_negate )
2025-04-24 08:22:44 +02:00
target_value = - target_value - opcode . size ( ) ;
2025-04-23 18:20:49 +02:00
target [ patch_location ] = static_cast < ByteCodeValueType > ( target_value ) ;
} else {
2026-01-06 18:10:48 +01:00
patch_locations . append ( { QualifiedIP { alternative_index , intended_jump_ip } , patch_location , size_delta } ) ;
2023-08-05 17:32:47 +03:30
}
}
2023-07-28 21:02:34 +03:30
}
}
for ( auto const & child : node - > children ( ) ) {
auto * child_node = static_cast < Tree * > ( child . value . ptr ( ) ) ;
target . append ( static_cast < ByteCodeValueType > ( OpCodeId : : ForkJump ) ) ;
add_patch_point ( child_node , target . size ( ) ) ;
target . append ( static_cast < ByteCodeValueType > ( 0 ) ) ;
2025-02-24 08:41:04 +01:00
nodes_to_visit . append ( child_node ) ;
2022-02-20 03:25:21 +03:30
}
}
2023-07-28 21:02:34 +03:30
for ( auto & patch : patch_locations ) {
if ( patch . done )
continue ;
auto & alternative = alternatives [ patch . source_ip . alternative_index ] ;
if ( patch . source_ip . instruction_position > = alternative . size ( ) ) {
// This just wants to jump to the end of the alternative, which is fine.
// Patch it to jump to the end of the target instead.
target [ patch . target_ip ] = static_cast < ByteCodeValueType > ( target . size ( ) - patch . target_ip - 1 ) ;
continue ;
}
2021-09-12 17:30:27 +04:30
2023-07-28 21:02:34 +03:30
dbgln ( " Regex Tree / Unpatched jump: {}@{} -> {}@{} " ,
patch . source_ip . instruction_position ,
patch . source_ip . alternative_index ,
patch . target_ip ,
target [ patch . target_ip ] ) ;
VERIFY_NOT_REACHED ( ) ;
}
2022-02-20 03:25:21 +03:30
}
2021-09-12 17:30:27 +04:30
}
2021-10-03 19:01:25 +03:30
// Result of attempting to fold a single compare pair into a range lookup table.
// Produced by insert_into_lookup_table() and consumed by append_character_class().
enum class LookupTableInsertionOutcome {
    Successful,               // Pair was range-representable and has been inserted.
    ReplaceWithAnyChar,       // Pair matches any character; tables are discarded in favour of AnyChar.
    TemporaryInversionNeeded, // Switch to the inverted table for the next insertion only.
    PermanentInversionNeeded, // The whole comparison is inverted from this point on.
    FlushOnInsertion,         // Flush tables now and before every following insertion (And/Subtract).
    FinishFlushOnInsertion,   // Flush tables now and stop the flush-on-every-insertion mode (EndAndOr).
    CannotPlaceInTable,       // Pair is not range-representable; caller emits it verbatim.
};
// Attempts to fold a single compare pair into the given range lookup table.
// Range-representable pairs (Char, CharRange) are inserted directly; every
// other pair kind yields an outcome describing how the caller must handle it.
static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
{
    switch (pair.type) {
    case CharacterCompareType::Char: {
        // A single character is a degenerate one-element range.
        auto code_point = static_cast<u32>(pair.value);
        table.insert(pair.value, { code_point, code_point });
        return LookupTableInsertionOutcome::Successful;
    }
    case CharacterCompareType::CharRange: {
        CharRange range { pair.value };
        table.insert(range.from, range);
        return LookupTableInsertionOutcome::Successful;
    }
    case CharacterCompareType::Inverse:
        return LookupTableInsertionOutcome::PermanentInversionNeeded;
    case CharacterCompareType::TemporaryInverse:
        return LookupTableInsertionOutcome::TemporaryInversionNeeded;
    case CharacterCompareType::AnyChar:
        return LookupTableInsertionOutcome::ReplaceWithAnyChar;
    case CharacterCompareType::EndAndOr:
        return LookupTableInsertionOutcome::FinishFlushOnInsertion;
    case CharacterCompareType::And:
    case CharacterCompareType::Subtract:
        return LookupTableInsertionOutcome::FlushOnInsertion;
    case CharacterCompareType::CharClass:
    case CharacterCompareType::Reference:
    case CharacterCompareType::NamedReference:
    case CharacterCompareType::Property:
    case CharacterCompareType::GeneralCategory:
    case CharacterCompareType::Script:
    case CharacterCompareType::ScriptExtension:
    case CharacterCompareType::StringSet:
    case CharacterCompareType::Or:
        // None of these can be expressed as plain code point ranges.
        return LookupTableInsertionOutcome::CannotPlaceInTable;
    case CharacterCompareType::Undefined:
    case CharacterCompareType::RangeExpressionDummy:
    case CharacterCompareType::String:
    case CharacterCompareType::LookupTable:
        // These forms are never fed into this pass.
        VERIFY_NOT_REACHED();
    }
    return LookupTableInsertionOutcome::Successful;
}
// Appends a single Compare opcode to `target` that implements the given character
// class, folding runs of Char/CharRange pairs into sorted LookupTable arguments
// (with a pre-expanded case-insensitive variant) where that is possible.
void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
{
    ByteCode arguments;
    size_t argument_count = 0;

    if (pairs.size() <= 1) {
        // With at most one pair there is nothing to fold; emit it verbatim.
        for (auto& pair : pairs) {
            arguments.append(to_underlying(pair.type));
            // These compare types carry no payload value.
            if (pair.type != CharacterCompareType::AnyChar
                && pair.type != CharacterCompareType::TemporaryInverse
                && pair.type != CharacterCompareType::Inverse
                && pair.type != CharacterCompareType::And
                && pair.type != CharacterCompareType::Or
                && pair.type != CharacterCompareType::Subtract
                && pair.type != CharacterCompareType::EndAndOr)
                arguments.append(pair.value);
            ++argument_count;
        }
    } else {
        // Two tables: one for plain ranges, one for ranges inside a temporary
        // inversion. `current_table` points at whichever is active right now.
        RedBlackTree<ByteCodeValueType, CharRange> table;
        RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
        auto* current_table = &table;
        auto* current_inverted_table = &inverted_table;
        bool invert_for_next_iteration = false;
        bool is_currently_inverted = false;

        // Emits both accumulated tables as LookupTable arguments and clears them.
        auto flush_tables = [&] {
            // Coalesces adjacent/overlapping ranges of a sorted source into a
            // flat vector of non-overlapping CharRanges.
            auto merge_overlapping_ranges = [](auto& source) {
                Optional<CharRange> active_range;
                Vector<ByteCodeValueType> result;
                for (auto& range : source) {
                    if (!active_range.has_value()) {
                        active_range = CharRange(range);
                        continue;
                    }
                    CharRange char_range(range);
                    // Merge if the ranges touch or overlap (hence the +1s).
                    if (char_range.from <= active_range->to + 1 && char_range.to + 1 >= active_range->from) {
                        active_range = CharRange { min(char_range.from, active_range->from), max(char_range.to, active_range->to) };
                    } else {
                        result.append(active_range.release_value());
                        active_range = char_range;
                    }
                }
                if (active_range.has_value())
                    result.append(active_range.release_value());
                return result;
            };
            auto append_table = [&](auto& table) {
                ++argument_count;
                arguments.append(to_underlying(CharacterCompareType::LookupTable));
                // Reserve two slots for the sizes; they are patched in below
                // once the merged range data is known.
                auto sensitive_size_index = arguments.size();
                auto insensitive_size_index = sensitive_size_index + 1;
                arguments.append(0);
                arguments.append(0);
                auto range_data = merge_overlapping_ranges(table);
                arguments.extend(range_data);
                arguments[sensitive_size_index] = range_data.size();
                // Pre-expand a case-insensitive version of the same ranges so
                // insensitive matching needs no work at runtime.
                Vector<ByteCodeValueType> insensitive_data;
                for (CharRange range : range_data) {
                    for (auto expanded : Unicode::expand_range_case_insensitive(range.from, range.to))
                        insensitive_data.append(CharRange { expanded.from, expanded.to });
                }
                quick_sort(insensitive_data, [](CharRange a, CharRange b) { return a.from < b.from; });
                auto merged_data = merge_overlapping_ranges(insensitive_data);
                arguments.extend(merged_data);
                arguments[insensitive_size_index] = merged_data.size();
            };
            auto contains_regular_table = !table.is_empty();
            auto contains_inverted_table = !inverted_table.is_empty();
            if (contains_regular_table)
                append_table(table);
            if (contains_inverted_table) {
                // The inverted table is emitted under a TemporaryInverse marker.
                ++argument_count;
                arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
                append_table(inverted_table);
            }
            table.clear();
            inverted_table.clear();
        };

        auto flush_on_every_insertion = false;
        for (auto& value : pairs) {
            auto should_invert_after_this_iteration = invert_for_next_iteration;
            invert_for_next_iteration = false;
            auto insertion_result = insert_into_lookup_table(*current_table, value);
            switch (insertion_result) {
            case LookupTableInsertionOutcome::Successful:
                // Inside an And/Subtract sequence the tables may not accumulate.
                if (flush_on_every_insertion)
                    flush_tables();
                break;
            case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
                // AnyChar subsumes everything collected so far.
                table.clear();
                inverted_table.clear();
                arguments.append(to_underlying(CharacterCompareType::AnyChar));
                ++argument_count;
                break;
            }
            case LookupTableInsertionOutcome::TemporaryInversionNeeded:
                // Route only the next insertion into the inverted table.
                swap(current_table, current_inverted_table);
                invert_for_next_iteration = true;
                is_currently_inverted = !is_currently_inverted;
                break;
            case LookupTableInsertionOutcome::PermanentInversionNeeded:
                flush_tables();
                arguments.append(to_underlying(CharacterCompareType::Inverse));
                ++argument_count;
                break;
            case LookupTableInsertionOutcome::FlushOnInsertion:
            case LookupTableInsertionOutcome::FinishFlushOnInsertion:
                flush_tables();
                flush_on_every_insertion = insertion_result == LookupTableInsertionOutcome::FlushOnInsertion;
                [[fallthrough]];
            case LookupTableInsertionOutcome::CannotPlaceInTable:
                // Emit the pair verbatim, re-applying any pending inversion.
                if (is_currently_inverted) {
                    arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
                    ++argument_count;
                }
                arguments.append(to_underlying(value.type));
                // Same payload-free types as in the single-pair path above.
                if (value.type != CharacterCompareType::AnyChar
                    && value.type != CharacterCompareType::TemporaryInverse
                    && value.type != CharacterCompareType::Inverse
                    && value.type != CharacterCompareType::And
                    && value.type != CharacterCompareType::Or
                    && value.type != CharacterCompareType::Subtract
                    && value.type != CharacterCompareType::EndAndOr)
                    arguments.append(value.value);
                ++argument_count;
                break;
            }
            if (should_invert_after_this_iteration) {
                // The one-shot inversion is over; swap back to the regular table.
                swap(current_table, current_inverted_table);
                is_currently_inverted = !is_currently_inverted;
            }
        }
        flush_tables();
    }
    target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    target.empend(argument_count); // number of arguments
    target.empend(arguments.size()); // size of arguments
    target.extend(move(arguments));
}
2021-09-12 17:30:27 +04:30
// Explicit instantiations of the optimization entry point for each supported
// parser flavour.
template void Regex<PosixBasicParser>::run_optimization_passes();
template void Regex<PosixExtendedParser>::run_optimization_passes();
template void Regex<ECMA262Parser>::run_optimization_passes();
2025-05-13 07:06:33 -04:00
2021-09-12 17:30:27 +04:30
}