From 3593c3b687a7a67ace5787aa5e9a8a1b22a5bfcf Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Thu, 23 Oct 2025 21:45:00 +0200 Subject: [PATCH] LibWeb: Throw out decoded UTF-32 data in HTMLTokenizer after parser runs This ends up saving quite a bit of memory on many pages, since UTF-32 uses 4 bytes per code point. As an example, it reduces the footprint on https://gymgrossisten.com/ by 2 MiB. --- Libraries/LibWeb/HTML/Parser/HTMLParser.cpp | 2 ++ Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp | 14 ++++++++++++++ Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h | 2 ++ 3 files changed, 18 insertions(+) diff --git a/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp b/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp index 94a5b86fbfa..d4043934049 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp @@ -259,6 +259,8 @@ void HTMLParser::run(HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point } flush_character_insertions(); + + m_tokenizer.parser_did_run({}); } void HTMLParser::run(URL::URL const& url, HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point) diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index e61985512d8..ccf08bfa9df 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -2895,6 +2895,20 @@ HTMLTokenizer::HTMLTokenizer(StringView input, ByteString const& encoding) m_source_positions.empend(0u, 0u); } +void HTMLTokenizer::parser_did_run(Badge<HTMLParser>) +{ + // OPTIMIZATION: If we've consumed all input and the insertion point is at the start, + // we can throw away the decoded input buffer to save memory.
+ if (m_current_offset > 0 + && static_cast<size_t>(m_current_offset) == m_decoded_input.size() + && (!m_insertion_point.has_value() || *m_insertion_point == 0) + && (!m_old_insertion_point.has_value() || *m_old_insertion_point == 0)) { + m_decoded_input.clear(); + m_current_offset = 0; + m_prev_offset = 0; + } +} + void HTMLTokenizer::insert_input_at_insertion_point(StringView input) { Vector<u32> new_decoded_input; diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h index 7c1c827da95..fc664d5417f 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h @@ -145,6 +145,8 @@ public: // This permanently cuts off the tokenizer input stream. void abort() { m_aborted = true; } + void parser_did_run(Badge<HTMLParser>); + private: void skip(size_t count); Optional<u32> next_code_point(StopAtInsertionPoint);