| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> | 
					
						
							| 
									
										
										
										
											2022-02-15 18:45:10 +00:00
										 |  |  |  * Copyright (c) 2022, Linus Groh <linusg@serenityos.org> | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |  * | 
					
						
							| 
									
										
										
										
											2021-04-22 01:24:48 -07:00
										 |  |  |  * SPDX-License-Identifier: BSD-2-Clause | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #pragma once
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-26 15:50:05 +02:00
										 |  |  | #include <AK/Queue.h>
 | 
					
						
							| 
									
										
										
										
											2021-05-23 09:31:40 +02:00
										 |  |  | #include <AK/StringBuilder.h>
 | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | #include <AK/StringView.h>
 | 
					
						
							|  |  |  | #include <AK/Types.h>
 | 
					
						
							| 
									
										
										
										
											2020-06-04 21:06:54 +02:00
										 |  |  | #include <AK/Utf8View.h>
 | 
					
						
							| 
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 |  |  | #include <LibWeb/Forward.h>
 | 
					
						
							| 
									
										
										
										
											2020-07-28 19:18:23 +02:00
										 |  |  | #include <LibWeb/HTML/Parser/HTMLToken.h>
 | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-28 18:20:36 +02:00
										 |  |  | namespace Web::HTML { | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | #define ENUMERATE_TOKENIZER_STATES                                        \
 | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(Data)                                     \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RCDATA)                                   \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RAWTEXT)                                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptData)                               \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(PLAINTEXT)                                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(TagOpen)                                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(EndTagOpen)                               \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(TagName)                                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign)                       \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen)                         \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName)                         \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign)                      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen)                        \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName)                        \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign)                   \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen)                     \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName)                     \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart)                    \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped)                        \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash)                    \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign)            \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen)              \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName)              \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart)              \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped)                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash)              \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash)          \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign)      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName)                      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AttributeName)                            \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AfterAttributeName)                       \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue)                     \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted)               \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted)               \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted)                   \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag)                      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BogusComment)                             \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen)                    \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentStart)                             \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentStartDash)                         \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(Comment)                                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign)                      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang)                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash)              \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash)          \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentEndDash)                           \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentEnd)                               \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CommentEndBang)                           \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DOCTYPE)                                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName)                        \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DOCTYPEName)                              \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName)                         \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier)            \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted)      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted)      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier)             \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier)            \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted)      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted)      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier)             \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE)                             \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CDATASection)                             \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket)                      \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd)                          \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(CharacterReference)                       \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference)                  \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand)                       \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart)       \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart)           \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference)            \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference)                \ | 
					
						
							|  |  |  |     __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class HTMLTokenizer { | 
					
						
							|  |  |  | public: | 
					
						
							| 
									
										
										
										
											2022-02-19 15:58:21 +01:00
										 |  |  |     explicit HTMLTokenizer(); | 
					
						
							| 
									
										
										
										
											2021-11-11 00:55:02 +01:00
										 |  |  |     explicit HTMLTokenizer(StringView input, String const& encoding); | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 |  |  |     enum class State { | 
					
						
							|  |  |  | #define __ENUMERATE_TOKENIZER_STATE(state) state,
 | 
					
						
							|  |  |  |         ENUMERATE_TOKENIZER_STATES | 
					
						
							|  |  |  | #undef __ENUMERATE_TOKENIZER_STATE
 | 
					
						
							|  |  |  |     }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-24 00:14:23 +02:00
										 |  |  |     Optional<HTMLToken> next_token(); | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-15 18:45:10 +00:00
										 |  |  |     void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-25 23:15:48 +02:00
										 |  |  |     void switch_to(Badge<HTMLParser>, State new_state); | 
					
						
							| 
									
										
										
										
											2021-05-20 23:15:33 +04:30
										 |  |  |     void switch_to(State new_state) | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         m_state = new_state; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-27 23:01:04 +02:00
										 |  |  |     void set_blocked(bool b) { m_blocked = b; } | 
					
						
							|  |  |  |     bool is_blocked() const { return m_blocked; } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-28 12:35:19 +02:00
										 |  |  |     String source() const { return m_decoded_input; } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-19 15:58:21 +01:00
										 |  |  |     void insert_input_at_insertion_point(String const& input); | 
					
						
							|  |  |  |     void insert_eof(); | 
					
						
							|  |  |  |     bool is_eof_inserted(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     bool is_insertion_point_defined() const { return m_insertion_point.defined; } | 
					
						
							|  |  |  |     bool is_insertion_point_reached() | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         return m_insertion_point.defined && m_insertion_point.position >= m_utf8_view.iterator_offset(m_utf8_iterator); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     void undefine_insertion_point() { m_insertion_point.defined = false; } | 
					
						
							|  |  |  |     void store_insertion_point() { m_old_insertion_point = m_insertion_point; } | 
					
						
							|  |  |  |     void restore_insertion_point() { m_insertion_point = m_old_insertion_point; } | 
					
						
							|  |  |  |     void update_insertion_point() | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         m_insertion_point.defined = true; | 
					
						
							|  |  |  |         m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-20 21:08:14 +02:00
										 |  |  |     // This permanently cuts off the tokenizer input stream.
 | 
					
						
							|  |  |  |     void abort() { m_aborted = true; } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | private: | 
					
						
							| 
									
										
										
										
											2021-05-20 23:11:41 +04:30
										 |  |  |     void skip(size_t count); | 
					
						
							| 
									
										
										
										
											2020-08-05 16:31:20 -04:00
										 |  |  |     Optional<u32> next_code_point(); | 
					
						
							|  |  |  |     Optional<u32> peek_code_point(size_t offset) const; | 
					
						
							| 
									
										
										
										
											2021-11-11 00:55:02 +01:00
										 |  |  |     bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive); | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |     void create_new_token(HTMLToken::Type); | 
					
						
							| 
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 |  |  |     bool current_end_tag_token_is_appropriate() const; | 
					
						
							| 
									
										
										
										
											2021-05-23 08:20:03 +02:00
										 |  |  |     String consume_current_builder(); | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-12 12:44:21 +02:00
										 |  |  |     static char const* state_name(State state) | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |     { | 
					
						
							|  |  |  |         switch (state) { | 
					
						
							|  |  |  | #define __ENUMERATE_TOKENIZER_STATE(state) \
 | 
					
						
							|  |  |  |     case State::state:                     \ | 
					
						
							|  |  |  |         return #state; | 
					
						
							|  |  |  |             ENUMERATE_TOKENIZER_STATES | 
					
						
							|  |  |  | #undef __ENUMERATE_TOKENIZER_STATE
 | 
					
						
							|  |  |  |         }; | 
					
						
							| 
									
										
										
										
											2021-02-23 20:42:32 +01:00
										 |  |  |         VERIFY_NOT_REACHED(); | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 |  |  |     void will_emit(HTMLToken&); | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |     void will_switch_to(State); | 
					
						
							|  |  |  |     void will_reconsume_in(State); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-27 16:16:23 +02:00
										 |  |  |     bool consumed_as_part_of_an_attribute() const; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-12 12:44:21 +02:00
										 |  |  |     void restore_to(Utf8CodePointIterator const& new_iterator); | 
					
						
							| 
									
										
										
										
											2021-06-04 11:31:43 +02:00
										 |  |  |     HTMLToken::Position nth_last_position(size_t n = 0); | 
					
						
							| 
									
										
										
										
											2021-05-20 23:11:41 +04:30
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-15 18:45:10 +00:00
										 |  |  |     HTMLParser* m_parser { nullptr }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |     State m_state { State::Data }; | 
					
						
							|  |  |  |     State m_return_state { State::Data }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-27 16:16:23 +02:00
										 |  |  |     Vector<u32> m_temporary_buffer; | 
					
						
							| 
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-28 12:35:19 +02:00
										 |  |  |     String m_decoded_input; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-19 15:58:21 +01:00
										 |  |  |     struct InsertionPoint { | 
					
						
							|  |  |  |         size_t position { 0 }; | 
					
						
							|  |  |  |         bool defined { false }; | 
					
						
							|  |  |  |     }; | 
					
						
							|  |  |  |     InsertionPoint m_insertion_point {}; | 
					
						
							|  |  |  |     InsertionPoint m_old_insertion_point {}; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-04 21:06:54 +02:00
										 |  |  |     Utf8View m_utf8_view; | 
					
						
							| 
									
										
										
										
											2021-06-01 09:45:52 +02:00
										 |  |  |     Utf8CodePointIterator m_utf8_iterator; | 
					
						
							|  |  |  |     Utf8CodePointIterator m_prev_utf8_iterator; | 
					
						
							| 
									
										
										
										
											2020-06-04 21:06:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  |     HTMLToken m_current_token; | 
					
						
							| 
									
										
										
										
											2021-05-23 08:20:03 +02:00
										 |  |  |     StringBuilder m_current_builder; | 
					
						
							| 
									
										
										
										
											2020-05-24 00:49:22 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-05-22 20:17:09 +02:00
										 |  |  |     Optional<String> m_last_emitted_start_tag_name; | 
					
						
							| 
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-19 15:58:21 +01:00
										 |  |  |     bool m_explicit_eof_inserted { false }; | 
					
						
							| 
									
										
										
										
											2020-05-24 00:49:22 +02:00
										 |  |  |     bool m_has_emitted_eof { false }; | 
					
						
							| 
									
										
										
										
											2020-05-26 15:50:05 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     Queue<HTMLToken> m_queued_tokens; | 
					
						
							| 
									
										
										
										
											2020-05-27 16:16:23 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     u32 m_character_reference_code { 0 }; | 
					
						
							| 
									
										
										
										
											2020-05-27 23:01:04 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     bool m_blocked { false }; | 
					
						
							| 
									
										
										
										
											2021-05-20 23:11:41 +04:30
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-20 21:08:14 +02:00
										 |  |  |     bool m_aborted { false }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-05-20 23:11:41 +04:30
										 |  |  |     Vector<HTMLToken::Position> m_source_positions; | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | }; | 
					
						
							| 
									
										
										
										
											2020-05-27 23:01:04 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 |  |  | } |