mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-10-19 07:33:20 +00:00

We now define GenericLexer as a template to allow using it with UTF-16 strings. To keep existing users happy, the template is defined in the Detail namespace. Then AK::GenericLexer is an alias for a char-based view, and AK::Utf16GenericLexer is an alias for a char16-based view.
528 lines
14 KiB
C++
528 lines
14 KiB
C++
/*
|
|
* Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <AK/Assertions.h>
|
|
#include <AK/Forward.h>
|
|
#include <AK/NonnullOwnPtr.h>
|
|
#include <AK/RedBlackTree.h>
|
|
#include <AK/Result.h>
|
|
#include <AK/ScopeGuard.h>
|
|
#include <AK/StringView.h>
|
|
#include <AK/Utf16View.h>
|
|
|
|
namespace AK {
|
|
|
|
constexpr auto is_any_of(StringView values)
|
|
{
|
|
return [values](auto c) { return values.contains(c); };
|
|
}
|
|
|
|
constexpr auto is_not_any_of(StringView values)
|
|
{
|
|
return [values](auto c) { return !values.contains(c); };
|
|
}
|
|
|
|
// Predicate that is true for a forward slash or a backslash.
constexpr auto is_path_separator = is_any_of("/\\"sv);
|
|
// Predicate that is true for a single quote (') or a double quote (").
constexpr auto is_quote = is_any_of("'\""sv);
|
|
|
|
// Failure reasons reported when decoding a `\u` escape sequence.
enum class UnicodeEscapeError {
    // The input does not have the shape of a `\u` escape (missing prefix or missing hex digits).
    MalformedUnicodeEscape,
    // The escape is well-formed, but its value overflows or is not a Unicode code point.
    UnicodeEscapeOverflow,
};
|
|
|
|
namespace Detail {
|
|
|
|
template<typename CharType>
|
|
class GenericLexer {
|
|
static_assert(IsOneOf<CharType, char, char16_t>);
|
|
|
|
public:
|
|
// The string view type this lexer reads from and returns: StringView when CharType is char,
// Utf16View otherwise (i.e. for char16_t).
using ViewType = Detail::Conditional<IsSame<CharType, char>, StringView, Utf16View>;
|
|
|
|
// Creates a lexer positioned at the start of `input`.
// Only the view is stored, not a copy of the underlying characters.
constexpr explicit GenericLexer(ViewType input)
    : m_input(input)
{
    // m_index starts at 0 through its default member initializer.
}
|
|
|
|
// Returns the current offset into the input, counted in code units.
constexpr size_t tell() const
{
    return m_index;
}
|
|
// Returns how many code units lie between the current offset and the end of the input.
constexpr size_t tell_remaining() const
{
    auto const total = input_length();
    return total - m_index;
}
|
|
|
|
// Returns a view of everything from the current offset to the end of the input, without consuming it.
constexpr ViewType remaining() const
{
    return m_input.substring_view(m_index);
}
|
|
// Returns the complete input view the lexer was constructed with, regardless of the current offset.
constexpr ViewType input() const
{
    return m_input;
}
|
|
|
|
// Returns true once the current offset has reached (or passed) the end of the input.
constexpr bool is_eof() const
{
    return input_length() <= m_index;
}
|
|
|
|
// Returns the code unit `offset` positions ahead of the current offset without consuming anything.
//
// Returns '\0' when that position lies at or beyond the end of the input. A NUL result therefore
// does not by itself distinguish "end of input" from a NUL code unit in the data; use is_eof()
// when that distinction matters.
constexpr CharType peek(size_t offset = 0) const
{
    auto const length = input_length();

    // Compare against the remaining length instead of computing `m_index + offset`, which would
    // wrap around for very large offsets and read an unrelated position.
    if (m_index >= length || offset >= length - m_index)
        return '\0';

    return code_unit_at(m_index + offset);
}
|
|
|
|
// Returns a view of `length` code units starting `offset` positions ahead of the current offset,
// without consuming anything. Returns an empty Optional when the requested range does not fit
// entirely inside the input.
constexpr Optional<ViewType> peek_string(size_t length, size_t offset = 0) const
{
    auto const total = input_length();
    if (m_index > total)
        return {};

    // Check each operand against what is left rather than summing `m_index + offset + length`,
    // which could wrap around for very large arguments and wrongly pass the bounds check.
    auto const available = total - m_index;
    if (offset > available || length > available - offset)
        return {};

    return m_input.substring_view(m_index + offset, length);
}
|
|
|
|
// Tests whether the next code unit equals `expected`, without consuming it.
// peek() yields '\0' at the end of the input, so `expected == '\0'` also matches at EOF.
constexpr bool next_is(CharType expected) const
{
    auto const upcoming = peek();
    return upcoming == expected;
}
|
|
|
|
constexpr bool next_is(char expected) const
|
|
requires(IsSame<CharType, char16_t>)
|
|
{
|
|
return peek() == expected;
|
|
}
|
|
|
|
// Tests whether the upcoming input starts with `expected`, without consuming anything.
// If fewer code units remain than `expected` holds, peek_string() yields an empty Optional,
// which is then compared against `expected`.
constexpr bool next_is(ViewType expected) const
{
    size_t expected_length = 0;

    // The two view types expose their code unit count under different names.
    if constexpr (IsSame<CharType, char>)
        expected_length = expected.length();
    else
        expected_length = expected.length_in_code_units();

    auto const upcoming = peek_string(expected_length);
    return upcoming == expected;
}
|
|
|
|
// Convenience overload for UTF-16 lexers: tests whether the upcoming input equals the given
// StringView, without consuming anything. The number of code units peeked is `expected.length()`.
constexpr bool next_is(StringView expected) const
requires(IsSame<CharType, char16_t>)
{
    auto const upcoming = peek_string(expected.length());
    return upcoming == expected;
}
|
|
|
|
constexpr void retreat()
|
|
{
|
|
VERIFY(m_index > 0);
|
|
--m_index;
|
|
}
|
|
|
|
// Steps the lexer back by `count` code units. `count` must not exceed the current offset
// (enforced by VERIFY).
constexpr void retreat(size_t count)
{
    VERIFY(m_index >= count);
    m_index = m_index - count;
}
|
|
|
|
// Consumes and returns the next code unit. Must not be called at the end of the input
// (enforced by VERIFY).
constexpr CharType consume()
{
    VERIFY(!is_eof());

    auto const unit = code_unit_at(m_index);
    ++m_index;
    return unit;
}
|
|
|
|
// Consumes the next code unit if it equals `next`. Returns whether the match succeeded.
// Note: peek() yields '\0' at the end of the input, so `consume_specific('\0')` reports true at
// EOF even though ignore() has nothing left to skip.
constexpr bool consume_specific(CharType next)
{
    if (next_is(next)) {
        ignore();
        return true;
    }
    return false;
}
|
|
|
|
constexpr bool consume_specific(char next)
|
|
requires(IsSame<CharType, char16_t>)
|
|
{
|
|
return consume_specific(static_cast<char16_t>(next));
|
|
}
|
|
|
|
// Consumes the string `next` if the upcoming input starts with it. Returns whether it matched;
// on a mismatch nothing is consumed.
constexpr bool consume_specific(ViewType next)
{
    if (!next_is(next))
        return false;

    // Skip exactly as many code units as the matched string holds.
    if constexpr (IsSame<CharType, char>)
        ignore(next.length());
    else
        ignore(next.length_in_code_units());

    return true;
}
|
|
|
|
// Convenience overload for UTF-16 lexers: consumes the given StringView if the upcoming input
// matches it. Returns whether it matched; on a mismatch nothing is consumed.
constexpr bool consume_specific(StringView next)
requires(IsSame<CharType, char16_t>)
{
    bool const matched = next_is(next);
    if (matched)
        ignore(next.length());
    return matched;
}
|
|
|
|
// Consumes one possibly-escaped character and returns its unescaped value.
//
// If the next code unit is not `escape_char`, it is consumed and returned unchanged. Otherwise
// the escape character and the code unit after it are both consumed, and the latter is looked up
// in `escape_map`: a flat sequence of (key, replacement) pairs, e.g. "n\n" maps 'n' to a newline.
// A code unit without a mapping is returned as-is.
//
// consume() VERIFYs that input remains, so this must not be called at the end of the input, and
// an escape character must not be the very last code unit.
constexpr CharType consume_escaped_character(CharType escape_char = '\\', StringView escape_map = "n\nr\rt\tb\bf\f"sv)
{
    if (!consume_specific(escape_char))
        return consume();

    auto const escaped = consume();

    // Only visit complete (key, replacement) pairs. A dangling key at the end of an odd-length
    // map has no replacement, so it is ignored instead of indexing past the end of the map.
    for (size_t key_index = 0; key_index + 1 < escape_map.length(); key_index += 2) {
        if (escaped == escape_map[key_index])
            return escape_map[key_index + 1];
    }

    return escaped;
}
|
|
|
|
// Consume a number of characters
|
|
constexpr ViewType consume(size_t count)
|
|
{
|
|
auto start = m_index;
|
|
auto length = min(count, input_length() - m_index);
|
|
m_index += length;
|
|
|
|
return m_input.substring_view(start, length);
|
|
}
|
|
|
|
// Consume the rest of the input
|
|
constexpr ViewType consume_all()
|
|
{
|
|
auto rest = m_input.substring_view(m_index, input_length() - m_index);
|
|
m_index = input_length();
|
|
return rest;
|
|
}
|
|
|
|
// Consume until a new line is found
|
|
constexpr ViewType consume_line()
|
|
{
|
|
auto start = m_index;
|
|
while (!is_eof() && peek() != '\r' && peek() != '\n')
|
|
m_index++;
|
|
|
|
auto length = m_index - start;
|
|
consume_specific('\r');
|
|
consume_specific('\n');
|
|
|
|
return m_input.substring_view(start, length);
|
|
}
|
|
|
|
// Consume and return characters until `stop` is peeked
|
|
constexpr ViewType consume_until(CharType stop)
|
|
{
|
|
auto start = m_index;
|
|
while (!is_eof() && peek() != stop)
|
|
m_index++;
|
|
|
|
auto length = m_index - start;
|
|
return m_input.substring_view(start, length);
|
|
}
|
|
|
|
// Convenience overload for UTF-16 lexers: converts `stop` to char16_t and forwards to the
// code unit overload.
constexpr ViewType consume_until(char stop)
requires(IsSame<CharType, char16_t>)
{
    char16_t const widened = static_cast<char16_t>(stop);
    return consume_until(widened);
}
|
|
|
|
// Consume and return characters until the string `stop` is found
|
|
constexpr ViewType consume_until(ViewType stop)
|
|
{
|
|
auto start = m_index;
|
|
while (!is_eof() && !next_is(stop))
|
|
m_index++;
|
|
|
|
auto length = m_index - start;
|
|
return m_input.substring_view(start, length);
|
|
}
|
|
|
|
// Consume a string surrounded by single or double quotes. The returned ViewType does not include the quotes. An
|
|
// escape character can be provided to capture the enclosing quotes. Please note that the escape character will
|
|
// still be in the resulting ViewType.
|
|
constexpr ViewType consume_quoted_string(CharType escape_char = 0)
|
|
{
|
|
if (!next_is(is_quote))
|
|
return {};
|
|
|
|
auto quote_char = consume();
|
|
auto start = m_index;
|
|
while (!is_eof()) {
|
|
if (next_is(escape_char))
|
|
m_index++;
|
|
else if (next_is(quote_char))
|
|
break;
|
|
m_index++;
|
|
}
|
|
auto length = m_index - start;
|
|
|
|
if (peek() != quote_char) {
|
|
// Restore the index in case the string is unterminated
|
|
m_index = start - 1;
|
|
return {};
|
|
}
|
|
|
|
// Ignore closing quote
|
|
ignore();
|
|
|
|
return m_input.substring_view(start, length);
|
|
}
|
|
|
|
template<Integral T>
|
|
ErrorOr<T> consume_decimal_integer()
|
|
{
|
|
using UnsignedT = MakeUnsigned<T>;
|
|
|
|
ArmedScopeGuard rollback { [&, rollback_position = m_index]() {
|
|
m_index = rollback_position;
|
|
} };
|
|
|
|
bool has_minus_sign = false;
|
|
|
|
if (next_is('+') || next_is('-'))
|
|
if (consume() == '-')
|
|
has_minus_sign = true;
|
|
|
|
auto number_view = consume_while(is_ascii_digit);
|
|
if (number_view.is_empty())
|
|
return Error::from_errno(EINVAL);
|
|
|
|
auto maybe_number = number_view.template to_number<UnsignedT>(TrimWhitespace::No);
|
|
if (!maybe_number.has_value())
|
|
return Error::from_errno(ERANGE);
|
|
auto number = maybe_number.value();
|
|
|
|
if (!has_minus_sign) {
|
|
if (NumericLimits<T>::max() < number) // This is only possible in a signed case.
|
|
return Error::from_errno(ERANGE);
|
|
|
|
rollback.disarm();
|
|
return number;
|
|
}
|
|
|
|
if constexpr (IsUnsigned<T>) {
|
|
if (number != 0)
|
|
return Error::from_errno(ERANGE);
|
|
|
|
rollback.disarm();
|
|
return 0;
|
|
} else {
|
|
static constexpr UnsignedT max_value = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1;
|
|
if (number > max_value)
|
|
return Error::from_errno(ERANGE);
|
|
|
|
rollback.disarm();
|
|
return -number;
|
|
}
|
|
}
|
|
|
|
// Consumes a `\u` escape sequence and returns the value it encodes.
//
// After the `\u` prefix, an opening brace selects the braced form, handled by decode_code_point();
// anything else is handled by decode_single_or_paired_surrogate(), which receives
// `combine_surrogate_pairs`. Returns MalformedUnicodeEscape if the input does not start with `\u`.
Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true)
{
    bool const has_prefix = consume_specific("\\u"sv);
    if (!has_prefix)
        return UnicodeEscapeError::MalformedUnicodeEscape;

    return next_is('{')
        ? decode_code_point()
        : decode_single_or_paired_surrogate(combine_surrogate_pairs);
}
|
|
|
|
// Skips up to `count` code units (one by default). Never advances past the end of the input.
constexpr void ignore(size_t count = 1)
{
    auto const remaining_units = input_length() - m_index;
    m_index += min(count, remaining_units);
}
|
|
|
|
// Skips code units until `stop` is the next code unit or the end of the input is reached.
// `stop` itself is not skipped.
constexpr void ignore_until(CharType stop)
{
    for (; !is_eof(); ++m_index) {
        if (peek() == stop)
            break;
    }
}
|
|
|
|
constexpr void ignore_until(char stop)
|
|
requires(IsSame<CharType, char16_t>)
|
|
{
|
|
return ignore_until(static_cast<char16_t>(stop));
|
|
}
|
|
|
|
// Conditions are used to match arbitrary characters. You can use lambdas, ctype functions, or is_any_of() and its
|
|
// derivatives (defined near the top of this file).
|
|
//
|
|
// A few examples:
|
|
// - `if (lexer.next_is(isdigit))`
|
|
// - `auto name = lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });`
|
|
// - `lexer.ignore_until(is_any_of("<^>"));`
|
|
|
|
// Test the next character against a Condition
|
|
template<typename TPredicate>
|
|
constexpr bool next_is(TPredicate pred) const
|
|
{
|
|
return pred(peek());
|
|
}
|
|
|
|
// Consume and return characters while `pred` returns true
|
|
template<typename TPredicate>
|
|
constexpr ViewType consume_while(TPredicate pred)
|
|
{
|
|
auto start = m_index;
|
|
while (!is_eof() && pred(peek()))
|
|
++m_index;
|
|
|
|
auto length = m_index - start;
|
|
return m_input.substring_view(start, length);
|
|
}
|
|
|
|
// Consume and return characters until `pred` return true
|
|
template<typename TPredicate>
|
|
constexpr ViewType consume_until(TPredicate pred)
|
|
{
|
|
auto start = m_index;
|
|
while (!is_eof() && !pred(peek()))
|
|
++m_index;
|
|
|
|
auto length = m_index - start;
|
|
return m_input.substring_view(start, length);
|
|
}
|
|
|
|
template<typename TPredicate>
|
|
constexpr bool consume_specific_with_predicate(TPredicate pred)
|
|
{
|
|
if (is_eof() || !pred(peek()))
|
|
return false;
|
|
|
|
ignore();
|
|
return true;
|
|
}
|
|
|
|
// Ignore characters while `pred` returns true
|
|
template<typename TPredicate>
|
|
constexpr void ignore_while(TPredicate pred)
|
|
{
|
|
while (!is_eof() && pred(peek()))
|
|
++m_index;
|
|
}
|
|
|
|
// Ignore characters until `pred` returns true
|
|
template<typename TPredicate>
|
|
constexpr void ignore_until(TPredicate pred)
|
|
{
|
|
while (!is_eof() && !pred(peek()))
|
|
++m_index;
|
|
}
|
|
|
|
protected:
|
|
// Decodes the braced part of a `\u{...}` escape: an opening brace, one or more hexadecimal
// digits, and a closing brace. The lexer must be positioned on the opening brace (enforced by
// VERIFY).
//
// Returns MalformedUnicodeEscape if a hexadecimal digit is expected but not found, and
// UnicodeEscapeOverflow if the accumulated value wraps around or is rejected by is_unicode().
Result<u32, UnicodeEscapeError> decode_code_point()
{
    bool starts_with_open_bracket = consume_specific('{');
    VERIFY(starts_with_open_bracket);

    u32 accumulated = 0;

    // At least one digit is required; digits are then folded in until the closing brace.
    do {
        if (!next_is(is_ascii_hex_digit))
            return UnicodeEscapeError::MalformedUnicodeEscape;

        auto const widened = (accumulated << 4u) | parse_ascii_hex_digit(consume());

        // A result smaller than the previous value means the shift dropped significant bits.
        if (widened < accumulated)
            return UnicodeEscapeError::UnicodeEscapeOverflow;

        accumulated = widened;
    } while (!consume_specific('}'));

    if (!is_unicode(accumulated))
        return UnicodeEscapeError::UnicodeEscapeOverflow;
    return accumulated;
}
|
|
|
|
// Decodes the four hexadecimal digits of a `\uXXXX` escape (the `\u` prefix has already been
// consumed by the caller).
//
// If the decoded unit is a UTF-16 high surrogate, `combine_surrogate_pairs` is set, and another
// `\uXXXX` escape holding a low surrogate follows directly, both escapes are consumed and the
// combined code point is returned. If the following escape is not a low surrogate, it is left
// unconsumed and the lone high surrogate is returned.
//
// Returns MalformedUnicodeEscape if fewer than four hexadecimal digits are available for an
// escape that is being decoded.
Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs = true)
{
    constexpr size_t surrogate_length = 4;
    constexpr size_t escape_prefix_length = 2; // The two code units of `\u`.

    // Reads exactly `surrogate_length` hexadecimal digits into a 16-bit unit.
    auto read_unit = [&]() -> Optional<u16> {
        u16 unit = 0;
        size_t digits_read = 0;

        while (digits_read < surrogate_length) {
            if (!next_is(is_ascii_hex_digit))
                return {};

            unit = (unit << 4u) | parse_ascii_hex_digit(consume());
            ++digits_read;
        }

        return unit;
    };

    auto lead = read_unit();
    if (!lead.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    // Anything that cannot start a pair, or is not followed by another escape, stands alone.
    if (!UnicodeUtils::is_utf16_high_surrogate(*lead) || !combine_surrogate_pairs || !consume_specific("\\u"sv))
        return *lead;

    auto trail = read_unit();
    if (!trail.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    if (!UnicodeUtils::is_utf16_low_surrogate(*trail)) {
        // Un-consume the second escape (prefix plus digits) so the caller can lex it separately.
        retreat(escape_prefix_length + surrogate_length);
        return *lead;
    }

    return UnicodeUtils::decode_utf16_surrogate_pair(*lead, *trail);
}
|
|
|
|
// Returns the length of the input in code units, papering over the differently named length
// accessors of the two view types.
constexpr size_t input_length() const
{
    if constexpr (IsSame<CharType, char>)
        return m_input.length();
    else
        return m_input.length_in_code_units();
}
|
|
|
|
// Returns the code unit stored at `index`, papering over the different element accessors of the
// two view types. Performs no bounds check of its own.
constexpr CharType code_unit_at(size_t index) const
{
    if constexpr (IsSame<CharType, char>)
        return m_input[index];
    else
        return m_input.code_unit_at(index);
}
|
|
|
|
// The complete input being lexed; set once by the constructor.
ViewType m_input;
|
|
// Offset of the next code unit to be read from m_input.
size_t m_index { 0 };
|
|
};
|
|
|
|
}
|
|
|
|
class LineTrackingLexer : public GenericLexer {
|
|
public:
|
|
struct Position {
|
|
size_t offset { 0 };
|
|
size_t line { 0 };
|
|
size_t column { 0 };
|
|
};
|
|
|
|
LineTrackingLexer(StringView input, Position start_position)
|
|
: GenericLexer(input)
|
|
, m_first_line_start_position(start_position)
|
|
, m_line_start_positions(make<RedBlackTree<size_t, size_t>>())
|
|
{
|
|
m_line_start_positions->insert(0, 0);
|
|
auto first_newline = input.find('\n').map([](auto x) { return x + 1; }).value_or(input.length());
|
|
m_line_start_positions->insert(first_newline, 1);
|
|
m_largest_known_line_start_position = first_newline;
|
|
}
|
|
|
|
LineTrackingLexer(StringView input)
|
|
: LineTrackingLexer(input, { 0, 1, 1 })
|
|
{
|
|
}
|
|
|
|
Position position_for(size_t) const;
|
|
Position current_position() const { return position_for(m_index); }
|
|
|
|
protected:
|
|
Position m_first_line_start_position;
|
|
mutable NonnullOwnPtr<RedBlackTree<size_t, size_t>> m_line_start_positions; // offset -> line index
|
|
mutable size_t m_largest_known_line_start_position { 0 };
|
|
};
|
|
|
|
}
|
|
|
|
#if USING_AK_GLOBALLY
|
|
using AK::GenericLexer;
|
|
using AK::is_any_of;
|
|
using AK::is_path_separator;
|
|
using AK::is_quote;
|
|
using AK::LineTrackingLexer;
|
|
#endif
|