2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								/*
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 *
							 | 
						
					
						
							
								
									
										
										
										
											2021-04-22 01:24:48 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								 * SPDX-License-Identifier: BSD-2-Clause
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 */
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#pragma once
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-26 15:50:05 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								#include <AK/Queue.h>
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-23 09:31:40 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								#include <AK/StringBuilder.h>
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#include <AK/StringView.h>
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#include <AK/Types.h>
							 | 
						
					
						
							
								
									
										
										
										
											2020-06-04 21:06:54 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								#include <AK/Utf8View.h>
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								#include <LibWeb/Forward.h>
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-28 19:18:23 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								#include <LibWeb/HTML/Parser/HTMLToken.h>
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-07-28 18:20:36 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								namespace Web::HTML {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#define ENUMERATE_TOKENIZER_STATES                                        \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(Data)                                     \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RCDATA)                                   \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RAWTEXT)                                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptData)                               \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(PLAINTEXT)                                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(TagOpen)                                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(EndTagOpen)                               \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(TagName)                                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign)                       \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen)                         \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName)                         \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign)                      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen)                        \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName)                        \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign)                   \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen)                     \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName)                     \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart)                    \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped)                        \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash)                    \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign)            \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen)              \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName)              \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart)              \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped)                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash)              \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash)          \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign)      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName)                      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AttributeName)                            \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AfterAttributeName)                       \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue)                     \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted)               \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted)               \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted)                   \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag)                      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BogusComment)                             \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen)                    \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentStart)                             \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentStartDash)                         \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(Comment)                                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign)                      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang)                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash)              \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash)          \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentEndDash)                           \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentEnd)                               \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CommentEndBang)                           \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DOCTYPE)                                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName)                        \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DOCTYPEName)                              \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName)                         \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier)            \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted)      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted)      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier)             \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier)            \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted)      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted)      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier)             \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE)                             \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CDATASection)                             \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket)                      \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd)                          \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(CharacterReference)                       \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference)                  \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand)                       \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart)       \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart)           \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference)            \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference)                \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class HTMLTokenizer {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								public:
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-12 12:44:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    explicit HTMLTokenizer(StringView const& input, String const& encoding);
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    enum class State {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#define __ENUMERATE_TOKENIZER_STATE(state) state,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ENUMERATE_TOKENIZER_STATES
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#undef __ENUMERATE_TOKENIZER_STATE
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    };
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 00:14:23 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Optional<HTMLToken> next_token();
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    void switch_to(Badge<HTMLDocumentParser>, State new_state);
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-20 23:15:33 +04:30
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    void switch_to(State new_state)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        m_state = new_state;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-27 23:01:04 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    void set_blocked(bool b) { m_blocked = b; }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    bool is_blocked() const { return m_blocked; }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-28 12:35:19 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    String source() const { return m_decoded_input; }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								private:
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-20 23:11:41 +04:30
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    void skip(size_t count);
							 | 
						
					
						
							
								
									
										
										
										
											2020-08-05 16:31:20 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Optional<u32> next_code_point();
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    Optional<u32> peek_code_point(size_t offset) const;
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-12 12:44:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    bool consume_next_if_match(StringView const&, CaseSensitivity = CaseSensitivity::CaseSensitive);
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    void create_new_token(HTMLToken::Type);
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    bool current_end_tag_token_is_appropriate() const;
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-23 08:20:03 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    String consume_current_builder();
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-12 12:44:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    static char const* state_name(State state)
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        switch (state) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#define __ENUMERATE_TOKENIZER_STATE(state) \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    case State::state:                     \
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return #state;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ENUMERATE_TOKENIZER_STATES
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								#undef __ENUMERATE_TOKENIZER_STATE
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        };
							 | 
						
					
						
							
								
									
										
										
										
											2021-02-23 20:42:32 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        VERIFY_NOT_REACHED();
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    void will_emit(HTMLToken&);
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    void will_switch_to(State);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    void will_reconsume_in(State);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-27 16:16:23 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    bool consumed_as_part_of_an_attribute() const;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-07-12 12:44:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    void restore_to(Utf8CodePointIterator const& new_iterator);
							 | 
						
					
						
							
								
									
										
										
										
											2021-06-04 11:31:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    HTMLToken::Position nth_last_position(size_t n = 0);
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-20 23:11:41 +04:30
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    State m_state { State::Data };
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    State m_return_state { State::Data };
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-27 16:16:23 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Vector<u32> m_temporary_buffer;
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-28 12:35:19 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    String m_decoded_input;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-06-04 21:06:54 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Utf8View m_utf8_view;
							 | 
						
					
						
							
								
									
										
										
										
											2021-06-01 09:45:52 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Utf8CodePointIterator m_utf8_iterator;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    Utf8CodePointIterator m_prev_utf8_iterator;
							 | 
						
					
						
							
								
									
										
										
										
											2020-06-04 21:06:54 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    HTMLToken m_current_token;
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-23 08:20:03 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    StringBuilder m_current_builder;
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 00:49:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-22 20:17:09 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Optional<String> m_last_emitted_start_tag_name;
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 20:24:43 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-24 00:49:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    bool m_has_emitted_eof { false };
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-26 15:50:05 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    Queue<HTMLToken> m_queued_tokens;
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-27 16:16:23 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    u32 m_character_reference_code { 0 };
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-27 23:01:04 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    bool m_blocked { false };
							 | 
						
					
						
							
								
									
										
										
										
											2021-05-20 23:11:41 +04:30
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    Vector<HTMLToken::Position> m_source_positions;
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								};
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-27 23:01:04 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2020-05-22 21:46:13 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								}
							 |