ladybird/AK/GenericLexer.h
Timothy Flynn 99d7e08dff AK: Templatize GenericLexer for UTF-16 strings
We now define GenericLexer as a template to allow using it with UTF-16
strings. To keep existing users happy, the template is defined in the
Detail namespace. Then AK::GenericLexer is an alias for a char-based
view, and AK::Utf16GenericLexer is an alias for a char16-based view.
2025-08-13 09:56:13 -04:00

528 lines
14 KiB
C++

/*
* Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Assertions.h>
#include <AK/Forward.h>
#include <AK/NonnullOwnPtr.h>
#include <AK/RedBlackTree.h>
#include <AK/Result.h>
#include <AK/ScopeGuard.h>
#include <AK/StringView.h>
#include <AK/Utf16View.h>
namespace AK {
// Builds a predicate that answers whether the character handed to it occurs in `values`.
// The view is captured by value, so the predicate can be stored and passed around on its own.
constexpr auto is_any_of(StringView values)
{
    auto matches_one_of = [values](auto candidate) {
        return values.contains(candidate);
    };
    return matches_one_of;
}
// Builds a predicate that answers whether the character handed to it does NOT occur in `values`.
// This is the negation of the predicate produced by is_any_of() for the same `values`.
constexpr auto is_not_any_of(StringView values)
{
    auto matches_none_of = [values](auto candidate) {
        bool const is_listed = values.contains(candidate);
        return !is_listed;
    };
    return matches_none_of;
}
// Ready-made predicates for use with the lexer's predicate-taking members (next_is, consume_while, ...).
// They are declared `inline` so that every translation unit including this header refers to the same object;
// a plain namespace-scope `constexpr` variable has internal linkage and would be a distinct entity per
// translation unit, even though it is used from inline/template code defined in this header.

// Matches a forward slash or a backslash.
inline constexpr auto is_path_separator = is_any_of("/\\"sv);
// Matches a single quote or a double quote.
inline constexpr auto is_quote = is_any_of("'\""sv);
// Failure reasons reported by the lexer's Unicode escape decoding (consume_escaped_code_point and its helpers).
enum class UnicodeEscapeError {
// The `\\u` prefix was absent, or a hexadecimal digit was expected but not found.
MalformedUnicodeEscape,
// The value wrapped around while its digits were being accumulated, or the final value was rejected by the
// is_unicode() check.
UnicodeEscapeOverflow,
};
namespace Detail {
template<typename CharType>
class GenericLexer {
static_assert(IsOneOf<CharType, char, char16_t>);
public:
using ViewType = Detail::Conditional<IsSame<CharType, char>, StringView, Utf16View>;
constexpr explicit GenericLexer(ViewType input)
: m_input(input)
{
}
constexpr size_t tell() const { return m_index; }
constexpr size_t tell_remaining() const { return input_length() - m_index; }
constexpr ViewType remaining() const { return m_input.substring_view(m_index); }
constexpr ViewType input() const { return m_input; }
constexpr bool is_eof() const { return m_index >= input_length(); }
constexpr CharType peek(size_t offset = 0) const
{
return (m_index + offset < input_length()) ? code_unit_at(m_index + offset) : '\0';
}
constexpr Optional<ViewType> peek_string(size_t length, size_t offset = 0) const
{
if (m_index + offset + length > input_length())
return {};
return m_input.substring_view(m_index + offset, length);
}
constexpr bool next_is(CharType expected) const
{
return peek() == expected;
}
constexpr bool next_is(char expected) const
requires(IsSame<CharType, char16_t>)
{
return peek() == expected;
}
constexpr bool next_is(ViewType expected) const
{
size_t length = 0;
if constexpr (IsSame<CharType, char16_t>)
length = expected.length_in_code_units();
else
length = expected.length();
return peek_string(length) == expected;
}
constexpr bool next_is(StringView expected) const
requires(IsSame<CharType, char16_t>)
{
return peek_string(expected.length()) == expected;
}
constexpr void retreat()
{
VERIFY(m_index > 0);
--m_index;
}
constexpr void retreat(size_t count)
{
VERIFY(m_index >= count);
m_index -= count;
}
constexpr CharType consume()
{
VERIFY(!is_eof());
return code_unit_at(m_index++);
}
constexpr bool consume_specific(CharType next)
{
if (!next_is(next))
return false;
ignore();
return true;
}
constexpr bool consume_specific(char next)
requires(IsSame<CharType, char16_t>)
{
return consume_specific(static_cast<char16_t>(next));
}
constexpr bool consume_specific(ViewType next)
{
if (!next_is(next))
return false;
if constexpr (IsSame<CharType, char16_t>)
ignore(next.length_in_code_units());
else
ignore(next.length());
return true;
}
constexpr bool consume_specific(StringView next)
requires(IsSame<CharType, char16_t>)
{
if (!next_is(next))
return false;
ignore(next.length());
return true;
}
constexpr CharType consume_escaped_character(CharType escape_char = '\\', StringView escape_map = "n\nr\rt\tb\bf\f"sv)
{
if (!consume_specific(escape_char))
return consume();
auto c = consume();
for (size_t i = 0; i < escape_map.length(); i += 2) {
if (c == escape_map[i])
return escape_map[i + 1];
}
return c;
}
// Consume a number of characters
constexpr ViewType consume(size_t count)
{
auto start = m_index;
auto length = min(count, input_length() - m_index);
m_index += length;
return m_input.substring_view(start, length);
}
// Consume the rest of the input
constexpr ViewType consume_all()
{
auto rest = m_input.substring_view(m_index, input_length() - m_index);
m_index = input_length();
return rest;
}
// Consume until a new line is found
constexpr ViewType consume_line()
{
auto start = m_index;
while (!is_eof() && peek() != '\r' && peek() != '\n')
m_index++;
auto length = m_index - start;
consume_specific('\r');
consume_specific('\n');
return m_input.substring_view(start, length);
}
// Consume and return characters until `stop` is peeked
constexpr ViewType consume_until(CharType stop)
{
auto start = m_index;
while (!is_eof() && peek() != stop)
m_index++;
auto length = m_index - start;
return m_input.substring_view(start, length);
}
constexpr ViewType consume_until(char stop)
requires(IsSame<CharType, char16_t>)
{
return consume_until(static_cast<char16_t>(stop));
}
// Consume and return characters until the string `stop` is found
constexpr ViewType consume_until(ViewType stop)
{
auto start = m_index;
while (!is_eof() && !next_is(stop))
m_index++;
auto length = m_index - start;
return m_input.substring_view(start, length);
}
// Consume a string surrounded by single or double quotes. The returned ViewType does not include the quotes. An
// escape character can be provided to capture the enclosing quotes. Please note that the escape character will
// still be in the resulting ViewType.
constexpr ViewType consume_quoted_string(CharType escape_char = 0)
{
if (!next_is(is_quote))
return {};
auto quote_char = consume();
auto start = m_index;
while (!is_eof()) {
if (next_is(escape_char))
m_index++;
else if (next_is(quote_char))
break;
m_index++;
}
auto length = m_index - start;
if (peek() != quote_char) {
// Restore the index in case the string is unterminated
m_index = start - 1;
return {};
}
// Ignore closing quote
ignore();
return m_input.substring_view(start, length);
}
template<Integral T>
ErrorOr<T> consume_decimal_integer()
{
using UnsignedT = MakeUnsigned<T>;
ArmedScopeGuard rollback { [&, rollback_position = m_index]() {
m_index = rollback_position;
} };
bool has_minus_sign = false;
if (next_is('+') || next_is('-'))
if (consume() == '-')
has_minus_sign = true;
auto number_view = consume_while(is_ascii_digit);
if (number_view.is_empty())
return Error::from_errno(EINVAL);
auto maybe_number = number_view.template to_number<UnsignedT>(TrimWhitespace::No);
if (!maybe_number.has_value())
return Error::from_errno(ERANGE);
auto number = maybe_number.value();
if (!has_minus_sign) {
if (NumericLimits<T>::max() < number) // This is only possible in a signed case.
return Error::from_errno(ERANGE);
rollback.disarm();
return number;
}
if constexpr (IsUnsigned<T>) {
if (number != 0)
return Error::from_errno(ERANGE);
rollback.disarm();
return 0;
} else {
static constexpr UnsignedT max_value = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1;
if (number > max_value)
return Error::from_errno(ERANGE);
rollback.disarm();
return -number;
}
}
Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true)
{
if (!consume_specific("\\u"sv))
return UnicodeEscapeError::MalformedUnicodeEscape;
if (next_is('{'))
return decode_code_point();
return decode_single_or_paired_surrogate(combine_surrogate_pairs);
}
constexpr void ignore(size_t count = 1)
{
count = min(count, input_length() - m_index);
m_index += count;
}
constexpr void ignore_until(CharType stop)
{
while (!is_eof() && peek() != stop)
++m_index;
}
constexpr void ignore_until(char stop)
requires(IsSame<CharType, char16_t>)
{
return ignore_until(static_cast<char16_t>(stop));
}
// Conditions are used to match arbitrary characters. You can use lambdas, ctype functions, or is_any_of() and its
// derivatives (see below).
//
// A few examples:
// - `if (lexer.next_is(isdigit))`
// - `auto name = lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });`
// - `lexer.ignore_until(is_any_of("<^>"));`
// Test the next character against a Condition
template<typename TPredicate>
constexpr bool next_is(TPredicate pred) const
{
return pred(peek());
}
// Consume and return characters while `pred` returns true
template<typename TPredicate>
constexpr ViewType consume_while(TPredicate pred)
{
auto start = m_index;
while (!is_eof() && pred(peek()))
++m_index;
auto length = m_index - start;
return m_input.substring_view(start, length);
}
// Consume and return characters until `pred` return true
template<typename TPredicate>
constexpr ViewType consume_until(TPredicate pred)
{
auto start = m_index;
while (!is_eof() && !pred(peek()))
++m_index;
auto length = m_index - start;
return m_input.substring_view(start, length);
}
template<typename TPredicate>
constexpr bool consume_specific_with_predicate(TPredicate pred)
{
if (is_eof() || !pred(peek()))
return false;
ignore();
return true;
}
// Ignore characters while `pred` returns true
template<typename TPredicate>
constexpr void ignore_while(TPredicate pred)
{
while (!is_eof() && pred(peek()))
++m_index;
}
// Ignore characters until `pred` returns true
template<typename TPredicate>
constexpr void ignore_until(TPredicate pred)
{
while (!is_eof() && !pred(peek()))
++m_index;
}
protected:
Result<u32, UnicodeEscapeError> decode_code_point()
{
bool starts_with_open_bracket = consume_specific('{');
VERIFY(starts_with_open_bracket);
u32 code_point = 0;
while (true) {
if (!next_is(is_ascii_hex_digit))
return UnicodeEscapeError::MalformedUnicodeEscape;
auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
if (new_code_point < code_point)
return UnicodeEscapeError::UnicodeEscapeOverflow;
code_point = new_code_point;
if (consume_specific('}'))
break;
}
if (is_unicode(code_point))
return code_point;
return UnicodeEscapeError::UnicodeEscapeOverflow;
}
Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs = true)
{
constexpr size_t surrogate_length = 4;
auto decode_one_surrogate = [&]() -> Optional<u16> {
u16 surrogate = 0;
for (size_t i = 0; i < surrogate_length; ++i) {
if (!next_is(is_ascii_hex_digit))
return {};
surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
}
return surrogate;
};
auto high_surrogate = decode_one_surrogate();
if (!high_surrogate.has_value())
return UnicodeEscapeError::MalformedUnicodeEscape;
if (!UnicodeUtils::is_utf16_high_surrogate(*high_surrogate))
return *high_surrogate;
if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
return *high_surrogate;
auto low_surrogate = decode_one_surrogate();
if (!low_surrogate.has_value())
return UnicodeEscapeError::MalformedUnicodeEscape;
if (UnicodeUtils::is_utf16_low_surrogate(*low_surrogate))
return UnicodeUtils::decode_utf16_surrogate_pair(*high_surrogate, *low_surrogate);
retreat(6);
return *high_surrogate;
}
constexpr size_t input_length() const
{
if constexpr (IsSame<CharType, char16_t>)
return m_input.length_in_code_units();
else
return m_input.length();
}
constexpr CharType code_unit_at(size_t index) const
{
if constexpr (IsSame<CharType, char16_t>)
return m_input.code_unit_at(index);
else
return m_input[index];
}
ViewType m_input;
size_t m_index { 0 };
};
}
// A GenericLexer that can additionally translate offsets into line/column positions. Line starts are kept in a
// tree keyed by offset; position_for() is defined outside of this header.
class LineTrackingLexer : public GenericLexer {
public:
    struct Position {
        size_t offset { 0 };
        size_t line { 0 };
        size_t column { 0 };
    };

    // Creates a lexer over `input`, remembering `start_position` as the position of the first line's start.
    LineTrackingLexer(StringView input, Position start_position)
        : GenericLexer(input)
        , m_first_line_start_position(start_position)
        , m_line_start_positions(make<RedBlackTree<size_t, size_t>>())
    {
        // Line index 0 starts at offset 0.
        m_line_start_positions->insert(0, 0);

        // Line index 1 starts right behind the first '\n'. If the input has no newline at all, the end of the
        // input is recorded instead.
        auto const newline_offset = input.find('\n');
        size_t second_line_start = input.length();
        if (newline_offset.has_value())
            second_line_start = newline_offset.value() + 1;

        m_line_start_positions->insert(second_line_start, 1);
        m_largest_known_line_start_position = second_line_start;
    }

    // Creates a lexer over `input` whose first line starts at offset 0, line 1, column 1.
    LineTrackingLexer(StringView input)
        : LineTrackingLexer(input, Position { 0, 1, 1 })
    {
    }

    // Returns the position that corresponds to the given offset into the input.
    Position position_for(size_t) const;

    // Returns the position that corresponds to the lexer's current index.
    Position current_position() const { return position_for(m_index); }

protected:
    Position m_first_line_start_position;
    // offset -> line index
    mutable NonnullOwnPtr<RedBlackTree<size_t, size_t>> m_line_start_positions;
    mutable size_t m_largest_known_line_start_position { 0 };
};
}
#if USING_AK_GLOBALLY
using AK::GenericLexer;
using AK::is_any_of;
using AK::is_path_separator;
using AK::is_quote;
using AK::LineTrackingLexer;
#endif