/* * Copyright (c) 2020, Andreas Kling * Copyright (c) 2022, Linus Groh * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include #include #include #include namespace Web::HTML { static Vector code_points_from_string(String const& string) { Vector code_points; code_points.ensure_capacity(string.bytes().size()); for (auto code_point : string.code_points()) code_points.append(code_point); return code_points; } static RustFfiTokenizerHandle* create_tokenizer_from_utf8(StringView utf8_bytes) { auto* bytes = reinterpret_cast(utf8_bytes.characters_without_null_termination()); if (bytes == nullptr) bytes = reinterpret_cast(""); return rust_html_tokenizer_create_from_utf8(bytes, utf8_bytes.length()); } static String decoded_string_for_utf8_tokenizer(StringView input) { Utf8View utf8_view { input }; if (utf8_view.validate(AllowLonelySurrogates::No)) return String::from_utf8_without_validation(input.bytes()); // Decoded strings may come from WTF-16 JS strings. Rust's UTF-8 path // requires scalar-value UTF-8, so replace lone surrogates but keep BOMs. VERIFY(utf8_view.validate()); StringBuilder builder(input.length()); for (auto code_point : utf8_view) builder.append_code_point(is_unicode_surrogate(code_point) ? AK::UnicodeUtils::REPLACEMENT_CODE_POINT : code_point); return builder.to_string_without_validation(); } static Vector build_interned_name_table(size_t count, void (*fetch)(uint16_t, uint8_t const**, size_t*)) { Vector table; // Slot 0 is unused (id 0 means "not interned"); store an empty FlyString there. table.append(FlyString {}); table.ensure_capacity(count + 1); for (size_t i = 0; i < count; ++i) { uint8_t const* ptr = nullptr; size_t len = 0; fetch(static_cast(i + 1), &ptr, &len); if (ptr == nullptr || len == 0) { table.append(FlyString {}); continue; } table.append(MUST(FlyString::from_utf8(StringView { ptr, len }))); } return table; } static FlyString const& interned_rust_tag_name(uint16_t id) { static NeverDestroyed> table { build_interned_name_table( rust_html_tokenizer_interned_tag_name_count(), rust_html_tokenizer_interned_tag_name) }; if (id == 0 || id >= table->size()) return (*table)[0]; return (*table)[id]; } static FlyString const& interned_rust_attr_name(uint16_t id) { static NeverDestroyed> table { build_interned_name_table( rust_html_tokenizer_interned_attr_name_count(), rust_html_tokenizer_interned_attr_name) }; if (id == 0 || id >= table->size()) return (*table)[0]; return (*table)[id]; } HTMLTokenizer::HTMLTokenizer() { m_tokenizer = create_tokenizer_from_utf8({}); rust_html_tokenizer_set_input_stream_closed(m_tokenizer, false); } HTMLTokenizer::~HTMLTokenizer() { if (m_tokenizer) rust_html_tokenizer_destroy(m_tokenizer); } HTMLTokenizer::HTMLTokenizer(StringView input, ByteString const& encoding, InputType input_type) { if (input_type == InputType::EncodedBytes) { auto decoder = TextCodec::decoder_for(encoding); VERIFY(decoder.has_value()); m_source = MUST(decoder->to_utf8(input)); } else { m_source = decoded_string_for_utf8_tokenizer(input); } m_input_stream_closed = true; m_tokenizer = create_tokenizer_from_utf8(m_source.bytes_as_string_view()); } Optional HTMLTokenizer::next_token(StopAtInsertionPoint stop_at_insertion_point) { RustFfiToken ffi; bool stop = stop_at_insertion_point == StopAtInsertionPoint::Yes; bool cdata_allowed = false; if (!rust_html_tokenizer_next_token(m_tokenizer, &ffi, stop, cdata_allowed)) return {}; HTMLToken::Type type; switch (ffi.token_type) { case 1: type = HTMLToken::Type::DOCTYPE; break; case 2: type = HTMLToken::Type::StartTag; break; case 3: type = HTMLToken::Type::EndTag; break; case 4: type = HTMLToken::Type::Comment; break; case 5: type = HTMLToken::Type::Character; break; case 6: type = HTMLToken::Type::EndOfFile; break; default: VERIFY_NOT_REACHED(); } HTMLToken token { type }; token.set_start_position({}, { ffi.start_line, ffi.start_column }); token.set_end_position({}, { ffi.end_line, ffi.end_column }); switch (type) { case HTMLToken::Type::Character: token.set_code_point(ffi.code_point); break; case HTMLToken::Type::StartTag: case HTMLToken::Type::EndTag: { if (ffi.tag_name_id != 0) token.set_tag_name(interned_rust_tag_name(ffi.tag_name_id)); else token.set_tag_name(MUST(FlyString::from_utf8(ffi_string_view(ffi.tag_name_ptr, ffi.tag_name_len)))); token.set_self_closing(ffi.self_closing); for (size_t i = 0; i < ffi.attributes_len; ++i) { auto const& ffi_attribute = ffi.attributes_ptr[i]; HTMLToken::Attribute attribute; if (ffi_attribute.name_id != 0) attribute.local_name = interned_rust_attr_name(ffi_attribute.name_id); else attribute.local_name = MUST(FlyString::from_utf8(ffi_string_view(ffi_attribute.name_ptr, ffi_attribute.name_len))); attribute.value = MUST(String::from_utf8(ffi_string_view(ffi_attribute.value_ptr, ffi_attribute.value_len))); attribute.name_start_position = { ffi_attribute.name_start_line, ffi_attribute.name_start_column }; attribute.name_end_position = { ffi_attribute.name_end_line, ffi_attribute.name_end_column }; attribute.value_start_position = { ffi_attribute.value_start_line, ffi_attribute.value_start_column }; attribute.value_end_position = { ffi_attribute.value_end_line, ffi_attribute.value_end_column }; token.add_attribute(move(attribute)); } token.normalize_attributes(); if (ffi.had_duplicate_attribute) token.set_had_duplicate_attribute({}); break; } case HTMLToken::Type::Comment: token.set_comment(MUST(String::from_utf8(ffi_string_view(ffi.comment_ptr, ffi.comment_len)))); break; case HTMLToken::Type::DOCTYPE: { auto& doctype = token.ensure_doctype_data(); if (!ffi.missing_name) { doctype.name = MUST(String::from_utf8(ffi_string_view(ffi.doctype_name_ptr, ffi.doctype_name_len))); doctype.missing_name = false; } if (!ffi.missing_public_id) { doctype.public_identifier = MUST(String::from_utf8(ffi_string_view(ffi.public_id_ptr, ffi.public_id_len))); doctype.missing_public_identifier = false; } if (!ffi.missing_system_id) { doctype.system_identifier = MUST(String::from_utf8(ffi_string_view(ffi.system_id_ptr, ffi.system_id_len))); doctype.missing_system_identifier = false; } doctype.force_quirks = ffi.force_quirks; break; } case HTMLToken::Type::EndOfFile: break; case HTMLToken::Type::Invalid: VERIFY_NOT_REACHED(); } return token; } void HTMLTokenizer::parser_did_run(Badge) { rust_html_tokenizer_parser_did_run(m_tokenizer); } String HTMLTokenizer::unparsed_input() const { uint8_t const* ptr = nullptr; size_t len = 0; rust_html_tokenizer_unparsed_input(m_tokenizer, &ptr, &len); return MUST(String::from_utf8(ffi_string_view(ptr, len))); } void HTMLTokenizer::append_to_input_stream(StringView input) { if (input.is_empty()) return; auto utf8_input = MUST(String::from_utf8(input)); auto code_points = code_points_from_string(utf8_input); rust_html_tokenizer_append_input(m_tokenizer, code_points.data(), code_points.size()); } void HTMLTokenizer::close_input_stream() { m_input_stream_closed = true; rust_html_tokenizer_set_input_stream_closed(m_tokenizer, true); } void HTMLTokenizer::insert_input_at_insertion_point(StringView input) { auto utf8_input = MUST(String::from_utf8(input)); auto code_points = code_points_from_string(utf8_input); rust_html_tokenizer_insert_input(m_tokenizer, code_points.data(), code_points.size()); } void HTMLTokenizer::insert_eof() { close_input_stream(); rust_html_tokenizer_insert_eof(m_tokenizer); } bool HTMLTokenizer::is_insertion_point_defined() const { return rust_html_tokenizer_is_insertion_point_defined(m_tokenizer); } bool HTMLTokenizer::is_insertion_point_reached() { return rust_html_tokenizer_is_insertion_point_reached(m_tokenizer); } void HTMLTokenizer::undefine_insertion_point() { rust_html_tokenizer_undefine_insertion_point(m_tokenizer); } void HTMLTokenizer::store_insertion_point() { rust_html_tokenizer_store_insertion_point(m_tokenizer); } void HTMLTokenizer::restore_insertion_point() { rust_html_tokenizer_restore_insertion_point(m_tokenizer); } void HTMLTokenizer::update_insertion_point() { rust_html_tokenizer_update_insertion_point(m_tokenizer); } void HTMLTokenizer::abort() { rust_html_tokenizer_abort(m_tokenizer); } void HTMLTokenizer::switch_to(State new_state) { dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Switch to {}", state_name(m_state), state_name(new_state)); m_state = new_state; rust_html_tokenizer_switch_state(m_tokenizer, static_cast(new_state)); } }