ladybird/Libraries/LibWeb/HTML/Parser/Rust/build.rs
Andreas Kling 171e3adf01 LibWeb: Replace the HTML tokenizer with Rust
Replace the C++ HTML tokenizer with a Rust implementation behind the
existing HTMLTokenizer API.

Keep the parser-facing integration points for streaming input,
insertion points, document.write(), EOF insertion, parser aborts,
speculative parser input, and last start tag tracking. The generated
FFI handle stays an implementation detail of HTMLTokenizer, so callers
keep a single tokenizer class.

Preserve duplicate attributes through FFI so C++ token normalization can
record the duplicate-attribute signal used by CSP nonce checks. Keep
bulk tag-name and attribute scans capped at the active insertion point
so streamed parser input is spliced at the right offset.

Use generated DAFSA tables for named character references and intern
common tag and attribute names to reduce FFI marshalling overhead. This
also fixes attribute name source positions, nested old insertion points,
and aborted fast-path handling.

TestHTMLTokenizer covers duplicate attributes and insertion points in
fast tag-name, attribute-name, and quoted-value scans. A CSP text test
covers duplicate nonce attributes on parser-created script elements.
The tokenizer dump fixtures still match, TestHTMLTokenizer passes, and
the full release test-web run passes with 6981 tests and 226 skipped.
2026-05-15 21:01:40 +02:00

821 lines
28 KiB
Rust

/*
* Copyright (c) 2026-present, the Ladybird developers.
*
* SPDX-License-Identifier: BSD-2-Clause
*/
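//! Build script that generates a DAFSA (Deterministic Acyclic Finite State Automaton)
//! for named character reference matching. This is a Rust port of the C++ generator at
//! Meta/Lagom/Tools/CodeGenerators/LibWeb/GenerateNamedCharacterReferences.cpp.
//!
//! The script also runs cbindgen to emit the `HTMLTokenizerRustFFI.h` header and generates
//! interned tag/attribute name tables from `TagNames.h` and `AttributeNames.h`.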
use std::cell::RefCell;
use std::collections::HashMap;
use std::env;
use std::fs;
use std::path::{Path, PathBuf};
use std::rc::Rc;
const FFI_HEADER: &str = "HTMLTokenizerRustFFI.h";
/// Build-script entry point.
///
/// In order, this:
/// 1. Runs cbindgen over the crate and writes `HTMLTokenizerRustFFI.h` into `OUT_DIR`
///    (and into `FFI_OUTPUT_DIR` as well when that variable names a different directory).
/// 2. Extracts the tag and attribute name strings from `TagNames.h` / `AttributeNames.h`
///    and writes `interned_names_generated.rs`.
/// 3. Parses `Entities.json`, builds and minimizes the DAFSA, checks that every entity
///    gets a distinct unique index, and writes `named_character_references.rs`.
///
/// # Panics
///
/// Panics if a required environment variable or input file is missing, if cbindgen
/// fails with anything other than a syntax-parse error, or if one of the consistency
/// assertions on the generated automaton fails.
fn main() {
let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed=cbindgen.toml");
println!("cargo:rerun-if-changed=src");
println!("cargo:rerun-if-env-changed=FFI_OUTPUT_DIR");
// Optional second destination for the generated header; defaults to OUT_DIR.
let ffi_out_dir = env::var("FFI_OUTPUT_DIR")
.map(PathBuf::from)
.unwrap_or_else(|_| out_dir.clone());
// A cbindgen syntax-parse error is deliberately ignored (no header is written in
// that case); every other cbindgen error aborts the build.
cbindgen::generate(&manifest_dir).map_or_else(
|error| match error {
cbindgen::Error::ParseSyntaxError { .. } => {}
e => panic!("{e:?}"),
},
|bindings| {
bindings.write_to_file(out_dir.join(FFI_HEADER));
if ffi_out_dir != out_dir {
bindings.write_to_file(ffi_out_dir.join(FFI_HEADER));
}
},
);
// Generate interned name tables from the existing C++ headers.
let tag_names_header = Path::new(&manifest_dir).join("../../TagNames.h");
let attr_names_header = Path::new(&manifest_dir).join("../../AttributeNames.h");
println!("cargo:rerun-if-changed={}", tag_names_header.display());
println!("cargo:rerun-if-changed={}", attr_names_header.display());
let tag_names = parse_enumerate_macro(
&fs::read_to_string(&tag_names_header).expect("Failed to read TagNames.h"),
"__ENUMERATE_HTML_TAG",
);
let attr_names = parse_enumerate_macro(
&fs::read_to_string(&attr_names_header).expect("Failed to read AttributeNames.h"),
"__ENUMERATE_HTML_ATTRIBUTE",
);
emit_interned_names(&out_dir.join("interned_names_generated.rs"), &tag_names, &attr_names);
let json_path = Path::new(&manifest_dir).join("../Entities.json");
println!("cargo:rerun-if-changed={}", json_path.display());
let json_str = fs::read_to_string(&json_path).expect("Failed to read Entities.json");
// `entities` is sorted by name, which `DafsaBuilder::insert` requires.
let entities = parse_entities_json(&json_str);
// Build DAFSA.
let mut builder = DafsaBuilder::new();
for (name, _, _) in &entities {
builder.insert(name);
}
// Minimize whatever is still pending after the last insertion.
builder.minimize(0);
builder.calc_numbers();
// Verify minimal perfect hashing (no collisions).
// Unique indices are 1-based (see the `idx - 1` below), hence the extra slot.
let mut seen: Vec<bool> = vec![false; entities.len() + 1];
for (name, _, _) in &entities {
let idx = builder.get_unique_index(name).unwrap();
assert!(!seen[idx], "Hash collision at index {idx} for '{name}'");
seen[idx] = true;
}
// Build codepoints lookup table indexed by unique_index.
let mut index_to_codepoints = vec![(0u32, 0u32); entities.len()];
for (name, first, second) in &entities {
let idx = builder.get_unique_index(name).unwrap();
index_to_codepoints[idx - 1] = (*first, *second);
}
// Extract DAFSA layers.
let root = &builder.root;
let root_ref = root.borrow();
// Per first-layer child: running sum of the `number` of all earlier first-layer children.
let mut first_layer: Vec<u16> = Vec::new();
// Per first-layer child: bit mask of its children plus its starting offset into the second layer.
let mut first_to_second_layer: Vec<(u64, u16)> = Vec::new();
let mut first_layer_tally: u16 = 0;
let mut second_layer_offset: u16 = 0;
for c in 0u8..128 {
if root_ref.children[c as usize].is_none() {
continue;
}
assert!(c.is_ascii_alphabetic());
let child = root_ref.children[c as usize].as_ref().unwrap();
let child_ref = child.borrow();
first_layer.push(first_layer_tally);
first_layer_tally += child_ref.number;
let mask = child_ref.get_ascii_alphabetic_bit_mask();
first_to_second_layer.push((mask, second_layer_offset));
second_layer_offset += child_ref.num_direct_children() as u16;
}
// One entry for each of A-Z and a-z.
assert_eq!(first_layer.len(), 52);
// BFS to build DAFSA node array.
// Following the C++ write_node_data three-phase approach:
type NodePtr = Rc<RefCell<Node>>;
let mut queue: Vec<NodePtr> = Vec::new();
// Keyed by node address; the value is the index at which that node's children start.
let mut child_indexes: HashMap<*const RefCell<Node>, u16> = HashMap::new();
// Phase 1: Queue root's children (first-layer nodes = 52 A-Z/a-z).
// This assigns temporary child_indexes for first-layer children.
queue_children(root, &mut queue, &mut child_indexes, 1);
// Phase 2: Clear indexes and re-process. For each first-layer child,
// queue ITS children (second-layer nodes) and assign their child_indexes.
child_indexes.clear();
let mut first_available_index: u16 = 1; // 0 is reserved (dummy node)
let first_layer_count = queue.len();
for i in 0..first_layer_count {
let node = Rc::clone(&queue[i]);
first_available_index = queue_children(&node, &mut queue, &mut child_indexes, first_available_index);
}
// Remove first-layer nodes from queue, keep only second-layer+ nodes.
let second_layer_nodes: Vec<NodePtr> = queue.drain(first_layer_count..).collect();
queue.clear();
queue.extend(second_layer_nodes);
// Phase 3: BFS remaining nodes, writing node data.
let mut node_data: Vec<NodeData> = Vec::new();
let mut qi = 0;
// The value assigned to `first_available_index` on the final iteration is not read afterwards.
#[allow(unused_assignments)]
while qi < queue.len() {
let node = Rc::clone(&queue[qi]);
qi += 1;
first_available_index = write_children_data(
&node,
&mut node_data,
&mut queue,
&mut child_indexes,
first_available_index,
);
}
// Build second_layer entries with child_indexes from phase 2.
let mut second_layer: Vec<SecondLayerEntry> = Vec::new();
for c in 0u8..128 {
if root_ref.children[c as usize].is_none() {
continue;
}
let first_child = root_ref.children[c as usize].as_ref().unwrap();
let first_child_ref = first_child.borrow();
// Running sum of `number` over the earlier children of this first-layer node.
let mut tally: u8 = 0;
for cc in 0u8..128 {
if first_child_ref.children[cc as usize].is_none() {
continue;
}
let second_child = first_child_ref.children[cc as usize].as_ref().unwrap();
let second_child_ref = second_child.borrow();
let key = Rc::as_ptr(second_child);
// Nodes without children were never given an index; they fall back to 0.
let ci = child_indexes.get(&key).copied().unwrap_or(0);
let children_len = second_child_ref.num_direct_children();
second_layer.push(SecondLayerEntry {
child_index: ci,
number: tally,
children_len,
end_of_word: second_child_ref.is_terminal,
});
// NOTE(review): this truncates to u8 and wraps silently; presumably every tally
// that is actually emitted fits in a u8 -- confirm against the entity data.
tally = tally.wrapping_add(second_child_ref.number as u8);
}
}
drop(root_ref);
// Generate output file.
// Re-reads OUT_DIR, shadowing the `PathBuf` of the same name from above.
let out_dir = env::var("OUT_DIR").unwrap();
let out_path = Path::new(&out_dir).join("named_character_references.rs");
let mut out = String::new();
out.push_str("// Auto-generated by build.rs -- do not edit!\n\n");
// Second codepoint enum.
// The variant names here must stay in sync with `second_codepoint_variant`.
out.push_str("#[derive(Clone, Copy, PartialEq, Eq)]\n");
out.push_str("#[repr(u8)]\n");
out.push_str("pub enum SecondCodepoint {\n");
out.push_str(" None = 0,\n");
out.push_str(" CombiningLongSolidusOverlay = 1,\n");
out.push_str(" CombiningLongVerticalLineOverlay = 2,\n");
out.push_str(" HairSpace = 3,\n");
out.push_str(" CombiningDoubleLowLine = 4,\n");
out.push_str(" CombiningReverseSolidusOverlay = 5,\n");
out.push_str(" VariationSelector1 = 6,\n");
out.push_str(" LatinSmallLetterJ = 7,\n");
out.push_str(" CombiningMacronBelow = 8,\n");
out.push_str("}\n\n");
out.push_str("impl SecondCodepoint {\n");
out.push_str(" pub fn value(self) -> u32 {\n");
out.push_str(" match self {\n");
out.push_str(" SecondCodepoint::None => 0,\n");
out.push_str(" SecondCodepoint::CombiningLongSolidusOverlay => 0x0338,\n");
out.push_str(" SecondCodepoint::CombiningLongVerticalLineOverlay => 0x20D2,\n");
out.push_str(" SecondCodepoint::HairSpace => 0x200A,\n");
out.push_str(" SecondCodepoint::CombiningDoubleLowLine => 0x0333,\n");
out.push_str(" SecondCodepoint::CombiningReverseSolidusOverlay => 0x20E5,\n");
out.push_str(" SecondCodepoint::VariationSelector1 => 0xFE00,\n");
out.push_str(" SecondCodepoint::LatinSmallLetterJ => 0x006A,\n");
out.push_str(" SecondCodepoint::CombiningMacronBelow => 0x0331,\n");
out.push_str(" }\n");
out.push_str(" }\n");
out.push_str("}\n\n");
// Struct definitions.
out.push_str("#[derive(Clone, Copy)]\n");
out.push_str("pub struct DafsaNode {\n");
out.push_str(" pub character: u8,\n");
out.push_str(" pub number: u8,\n");
out.push_str(" pub end_of_word: bool,\n");
out.push_str(" pub child_index: u16,\n");
out.push_str(" pub children_len: u8,\n");
out.push_str("}\n\n");
out.push_str("#[derive(Clone, Copy)]\n");
out.push_str("pub struct SecondLayerNode {\n");
out.push_str(" pub child_index: u16,\n");
out.push_str(" pub number: u8,\n");
out.push_str(" pub children_len: u8,\n");
out.push_str(" pub end_of_word: bool,\n");
out.push_str("}\n\n");
// Codepoints lookup table.
out.push_str(&format!(
"pub static CODEPOINTS_LOOKUP: [(u32, SecondCodepoint); {}] = [\n",
index_to_codepoints.len()
));
for (first, second) in &index_to_codepoints {
let variant = second_codepoint_variant(*second);
out.push_str(&format!(" ({first:#06X}, SecondCodepoint::{variant}),\n"));
}
out.push_str("];\n\n");
// DAFSA nodes array (with dummy node at index 0).
out.push_str(&format!(
"pub static DAFSA_NODES: [DafsaNode; {}] = [\n",
node_data.len() + 1
));
out.push_str(" DafsaNode { character: 0, number: 0, end_of_word: false, child_index: 0, children_len: 0 },\n");
for nd in &node_data {
out.push_str(&format!(
" DafsaNode {{ character: b'{}', number: {}, end_of_word: {}, child_index: {}, children_len: {} }},\n",
escape_byte(nd.character),
nd.number,
nd.end_of_word,
nd.child_index,
nd.children_len
));
}
out.push_str("];\n\n");
// First layer.
out.push_str(&format!("pub static FIRST_LAYER: [u16; {}] = [\n", first_layer.len()));
for n in &first_layer {
out.push_str(&format!(" {n},\n"));
}
out.push_str("];\n\n");
// First-to-second layer links.
out.push_str(&format!(
"pub static FIRST_TO_SECOND_LAYER: [(u64, u16); {}] = [\n",
first_to_second_layer.len()
));
for (mask, offset) in &first_to_second_layer {
out.push_str(&format!(" ({mask:#018X}, {offset}),\n"));
}
out.push_str("];\n\n");
// Second layer nodes.
out.push_str(&format!(
"pub static SECOND_LAYER: [SecondLayerNode; {}] = [\n",
second_layer.len()
));
for sl in &second_layer {
out.push_str(&format!(
" SecondLayerNode {{ child_index: {}, number: {}, children_len: {}, end_of_word: {} }},\n",
sl.child_index, sl.number, sl.children_len, sl.end_of_word
));
}
out.push_str("];\n\n");
// Total entity count.
out.push_str(&format!("pub const ENTITY_COUNT: usize = {};\n", entities.len()));
fs::write(&out_path, &out).expect("Failed to write generated file");
}
/// Collect the quoted string from every `macro_name(ident, "string")` invocation
/// found in a C++ header.
///
/// The header is scanned line by line. On each line that contains `macro_name(`,
/// the text between the first pair of double quotes following that opening
/// parenthesis is taken. Lines without the macro, or without a complete quoted
/// string after it, contribute nothing. Results keep the order of the header.
fn parse_enumerate_macro(source: &str, macro_name: &str) -> Vec<String> {
    let needle = format!("{macro_name}(");
    source
        .lines()
        .filter_map(|line| {
            // Everything after the first `macro_name(` on this line.
            let (_, arguments) = line.split_once(needle.as_str())?;
            // The string literal sits between the next two double quotes.
            let (_, after_open_quote) = arguments.split_once('"')?;
            let (literal, _) = after_open_quote.split_once('"')?;
            Some(literal.to_owned())
        })
        .collect()
}
/// Write the generated interned-name module to `out_path`.
///
/// The file contains two `&[&[u8]]` constants (`INTERNED_TAG_NAMES` and
/// `INTERNED_ATTR_NAMES`) listing the names in input order, followed by two
/// lookup functions (`lookup_tag_name_generated` and
/// `lookup_attr_name_generated`) that match first on length and then on the
/// exact bytes. The match-based form is used instead of a `HashMap` so the
/// small, fixed set of HTML names can be resolved without hashing.
///
/// # Panics
///
/// Panics if the file cannot be written.
fn emit_interned_names(out_path: &Path, tag_names: &[String], attr_names: &[String]) {
    // Appends one `pub const ...: &[&[u8]]` table, one byte-string literal per name.
    fn push_name_table(buffer: &mut String, declaration: &str, names: &[String]) {
        buffer.push_str(declaration);
        buffer.extend(names.iter().map(|name| format!(" b\"{}\",\n", name)));
        buffer.push_str("];\n\n");
    }

    let mut generated = String::new();
    generated.push_str("// Auto-generated by build.rs from TagNames.h / AttributeNames.h.\n");
    generated.push_str("// Do not edit by hand.\n\n");

    push_name_table(
        &mut generated,
        "pub const INTERNED_TAG_NAMES: &[&[u8]] = &[\n",
        tag_names,
    );
    push_name_table(
        &mut generated,
        "pub const INTERNED_ATTR_NAMES: &[&[u8]] = &[\n",
        attr_names,
    );

    emit_lookup_fn(&mut generated, "lookup_tag_name_generated", tag_names);
    emit_lookup_fn(&mut generated, "lookup_attr_name_generated", attr_names);

    fs::write(out_path, generated).expect("Failed to write interned_names_generated.rs");
}
/// Append the source of a `pub fn {fn_name}(bytes: &[u8]) -> u16` lookup to `out`.
///
/// The generated function returns the 1-based position of `bytes` within
/// `names`, or 0 when there is no match. Its body is an outer `match` on
/// `bytes.len()` (arms in ascending length order) with an inner `match` on the
/// exact byte string (arms in input order).
fn emit_lookup_fn(out: &mut String, fn_name: &str, names: &[String]) {
    use std::collections::BTreeMap;

    // Bucket (position, name) pairs by byte length; BTreeMap keeps lengths sorted.
    let mut buckets: BTreeMap<usize, Vec<(usize, &String)>> = BTreeMap::new();
    for (position, name) in names.iter().enumerate() {
        buckets.entry(name.len()).or_default().push((position, name));
    }

    let signature = format!("#[inline]\npub fn {fn_name}(bytes: &[u8]) -> u16 {{\n");
    out.push_str(&signature);
    out.push_str(" match bytes.len() {\n");

    for (length, bucket) in &buckets {
        let arm_header = format!(" {length} => match bytes {{\n");
        out.push_str(&arm_header);
        for &(position, name) in bucket {
            // Ids are 1-based so that 0 can mean "not interned".
            let id = position + 1;
            let arm = format!(" b\"{name}\" => {id},\n");
            out.push_str(&arm);
        }
        out.push_str(" _ => 0,\n");
        out.push_str(" },\n");
    }

    out.push_str(" _ => 0,\n");
    out.push_str(" }\n");
    out.push_str("}\n\n");
}
/// Render `b` as the contents of a Rust byte-character literal (the part
/// between the quotes of `b'…'`).
///
/// A single quote and a backslash are backslash-escaped, a space and any
/// other graphic ASCII byte are emitted verbatim, and everything else becomes
/// a two-digit uppercase `\xNN` escape.
fn escape_byte(b: u8) -> String {
    match b {
        b'\'' => String::from("\\'"),
        b'\\' => String::from("\\\\"),
        // Space plus the graphic ASCII range `!`..=`~`.
        b' ' | b'!'..=b'~' => char::from(b).to_string(),
        _ => format!("\\x{b:02X}"),
    }
}
/// Map the second code point of a named character reference to the name of
/// the matching `SecondCodepoint` variant in the generated source; 0 maps to
/// `None`.
///
/// # Panics
///
/// Panics on any code point that has no variant.
fn second_codepoint_variant(cp: u32) -> &'static str {
    const VARIANTS: [(u32, &str); 9] = [
        (0, "None"),
        (0x0338, "CombiningLongSolidusOverlay"),
        (0x20D2, "CombiningLongVerticalLineOverlay"),
        (0x200A, "HairSpace"),
        (0x0333, "CombiningDoubleLowLine"),
        (0x20E5, "CombiningReverseSolidusOverlay"),
        (0xFE00, "VariationSelector1"),
        (0x006A, "LatinSmallLetterJ"),
        (0x0331, "CombiningMacronBelow"),
    ];

    let hit = VARIANTS.iter().find(|(value, _)| *value == cp);
    match hit {
        Some((_, variant)) => variant,
        None => panic!("Unknown second codepoint: {cp:#X}"),
    }
}
/// Minimal, purpose-built parser for `Entities.json`.
///
/// The input is expected to be one object whose keys are entity names and
/// whose values are objects containing a `"codepoints"` array of decimal
/// integers; other string- or array-valued fields are skipped. Returns
/// `(name, first_codepoint, second_codepoint)` tuples sorted by name, where
/// the name has one leading `&` removed and a missing code point is 0.
///
/// # Panics
///
/// Panics on input that does not follow the expected shape.
fn parse_entities_json(json: &str) -> Vec<(String, u32, u32)> {
    /// Moves `at` forward over ASCII whitespace.
    fn skip_whitespace(bytes: &[u8], mut at: usize) -> usize {
        while at < bytes.len() && bytes[at].is_ascii_whitespace() {
            at += 1;
        }
        at
    }

    /// Moves `at` forward to the next `target` byte, or to the end of input.
    fn seek(bytes: &[u8], mut at: usize, target: u8) -> usize {
        while at < bytes.len() && bytes[at] != target {
            at += 1;
        }
        at
    }

    /// Starting just inside a string, moves `at` to its closing quote while
    /// stepping over backslash-escaped bytes.
    fn seek_closing_quote(bytes: &[u8], mut at: usize) -> usize {
        while at < bytes.len() && bytes[at] != b'"' {
            if bytes[at] == b'\\' {
                at += 1;
            }
            at += 1;
        }
        at
    }

    let bytes = json.as_bytes();
    let len = bytes.len();
    let mut entities = Vec::new();

    // Step inside the top-level object.
    let mut pos = seek(bytes, 0, b'{') + 1;

    loop {
        pos = skip_whitespace(bytes, pos);
        if pos >= len || bytes[pos] == b'}' {
            break;
        }
        if bytes[pos] == b',' {
            pos += 1;
            continue;
        }

        // Entity name.
        assert_eq!(bytes[pos], b'"');
        let key_start = pos + 1;
        pos = seek_closing_quote(bytes, key_start);
        let key = std::str::from_utf8(&bytes[key_start..pos]).unwrap().to_string();

        // The ':' separator, then the opening brace of the entity's object.
        pos = skip_whitespace(bytes, pos + 1);
        assert_eq!(bytes[pos], b':');
        pos = seek(bytes, pos + 1, b'{') + 1;

        // Walk the fields of the inner object, keeping only "codepoints".
        let mut codepoints: Vec<u32> = Vec::new();
        while pos < len && bytes[pos] != b'}' {
            if bytes[pos] != b'"' {
                pos += 1;
                continue;
            }

            let field_start = pos + 1;
            pos = seek(bytes, field_start, b'"');
            let field_name = std::str::from_utf8(&bytes[field_start..pos]).unwrap();

            pos = skip_whitespace(bytes, pos + 1);
            assert_eq!(bytes[pos], b':');
            pos = skip_whitespace(bytes, pos + 1);

            if field_name == "codepoints" {
                assert_eq!(bytes[pos], b'[');
                pos += 1;
                loop {
                    while pos < len && (bytes[pos].is_ascii_whitespace() || bytes[pos] == b',') {
                        pos += 1;
                    }
                    if pos >= len || bytes[pos] == b']' {
                        pos += 1;
                        break;
                    }
                    let number_start = pos;
                    while pos < len && bytes[pos].is_ascii_digit() {
                        pos += 1;
                    }
                    let digits = std::str::from_utf8(&bytes[number_start..pos]).unwrap();
                    codepoints.push(digits.parse().unwrap());
                }
            } else {
                // Skip a string or (possibly nested) array value; anything else is
                // left for the surrounding loop to step over byte by byte.
                match bytes[pos] {
                    b'"' => {
                        pos = seek_closing_quote(bytes, pos + 1) + 1;
                    }
                    b'[' => {
                        let mut depth = 1;
                        pos += 1;
                        while pos < len && depth > 0 {
                            match bytes[pos] {
                                b'[' => depth += 1,
                                b']' => depth -= 1,
                                _ => {}
                            }
                            pos += 1;
                        }
                    }
                    _ => {}
                }
            }
        }
        // Step over the inner object's closing brace.
        pos += 1;

        let name = key.strip_prefix('&').unwrap_or(&key).to_string();
        let mut values = codepoints.iter().copied();
        let first = values.next().unwrap_or(0);
        let second = values.next().unwrap_or(0);
        entities.push((name, first, second));
    }

    entities.sort_by(|(left, ..), (right, ..)| left.cmp(right));
    entities
}
// DAFSA builder using Rc<RefCell<Node>> for shared ownership.
/// Shared, interior-mutable handle to a [`Node`]. After minimization several
/// parents may hold a handle to the same node.
type NodeRc = Rc<RefCell<Node>>;
/// One state of the automaton while it is being built.
struct Node {
/// Outgoing edges, indexed by byte value. `Node::new_rc` allocates 128 slots.
children: Vec<Option<NodeRc>>, // 128 slots
/// Set on the node reached by the last byte of an inserted word.
is_terminal: bool,
/// Filled in by `calc_numbers`: 1 if this node is terminal, plus the
/// `number` of every child.
number: u16,
}
/// One row of the generated `SECOND_LAYER` table, describing a child of a
/// first-layer node. `main` fills these in and prints them as `SecondLayerNode`.
struct SecondLayerEntry {
/// Index recorded for this node's children in the `child_indexes` map, or 0
/// when the node has no recorded index.
child_index: u16,
/// Running sum (truncated to `u8`) of the `number` of the siblings that
/// precede this node under the same first-layer parent.
number: u8,
/// Count of direct children of this node.
children_len: u8,
/// Whether this node is terminal.
end_of_word: bool,
}
/// One row of the generated `DAFSA_NODES` table. `write_children_data`
/// produces these and `main` prints them as `DafsaNode`.
struct NodeData {
/// Byte labelling the edge from the parent to this node.
character: u8,
/// Running sum (truncated to `u8`) of the `number` of the siblings that
/// precede this node under the same parent.
number: u8,
/// Whether this node is terminal.
end_of_word: bool,
/// Index recorded for this node's children in the `child_indexes` map, or 0
/// when the node has no recorded index.
child_index: u16,
/// Count of direct children of this node.
children_len: u8,
}
impl Node {
    /// Allocate a fresh non-terminal node with 128 empty child slots.
    fn new_rc() -> NodeRc {
        let blank = Node {
            children: vec![None; 128],
            is_terminal: false,
            number: 0,
        };
        Rc::new(RefCell::new(blank))
    }

    /// Recompute `number` for this node and, recursively, for everything
    /// below it: 1 for a terminal node plus the `number` of each child.
    fn calc_numbers(&mut self) {
        self.number = u16::from(self.is_terminal);
        for child in self.children.iter().flatten() {
            let child_number = {
                let mut child_node = child.borrow_mut();
                child_node.calc_numbers();
                child_node.number
            };
            self.number += child_number;
        }
    }

    /// Count the occupied child slots.
    fn num_direct_children(&self) -> u8 {
        self.children.iter().flatten().fold(0u8, |count, _| count + 1)
    }

    /// Build a bit mask with one bit per child, positioned by
    /// `ascii_alphabetic_to_index` of the child's byte.
    fn get_ascii_alphabetic_bit_mask(&self) -> u64 {
        (0..128u8)
            .filter(|&byte| self.children[byte as usize].is_some())
            .fold(0u64, |mask, byte| mask | (1u64 << ascii_alphabetic_to_index(byte)))
    }

    /// Hash of the terminal flag and, for each child, its slot index and
    /// pointer identity.
    fn structure_hash(&self) -> u64 {
        let seed = u64::from(self.is_terminal);
        self.children
            .iter()
            .enumerate()
            .fold(seed, |hash, (slot, child)| match child {
                Some(target) => hash
                    .wrapping_mul(31)
                    .wrapping_add(slot as u64)
                    .wrapping_mul(31)
                    .wrapping_add(Rc::as_ptr(target) as u64),
                None => hash,
            })
    }

    /// Two nodes are structurally equal when their terminal flags agree and
    /// every slot is either empty in both or points at the very same child.
    fn structure_eq(&self, other: &Node) -> bool {
        if self.is_terminal != other.is_terminal {
            return false;
        }
        (0..128).all(|slot| match (&self.children[slot], &other.children[slot]) {
            (None, None) => true,
            (Some(mine), Some(theirs)) => Rc::ptr_eq(mine, theirs),
            _ => false,
        })
    }
}
/// Map an ASCII letter to a dense index: `A`-`Z` become 0-25 and `a`-`z`
/// become 26-51.
fn ascii_alphabetic_to_index(c: u8) -> u8 {
    // Anything above 'Z' is measured from 'a' and shifted past the uppercase block.
    let (base, shift) = if c > b'Z' { (b'a', 26) } else { (b'A', 0) };
    c - base + shift
}
/// An edge whose target node has been created by `DafsaBuilder::insert` but
/// not yet processed by `DafsaBuilder::minimize`.
struct UncheckedNode {
/// Node that owns the edge.
parent: NodeRc,
/// Byte labelling the edge; the target is `parent.children[character]`.
character: u8,
}
/// Incremental DAFSA builder. Words must be inserted in strictly ascending
/// order (`insert` asserts this).
struct DafsaBuilder {
/// Root of the automaton.
root: NodeRc,
/// Nodes already kept by `minimize`, bucketed by `Node::structure_hash`.
minimized_nodes: HashMap<u64, Vec<NodeRc>>,
/// Edges added by `insert` that `minimize` has not processed yet.
unchecked_nodes: Vec<UncheckedNode>,
/// The most recently inserted word, used for the ordering check and to
/// find the prefix shared with the next word.
previous_word: String,
}
impl DafsaBuilder {
    /// Create a builder holding only an empty root node.
    fn new() -> Self {
        Self {
            root: Node::new_rc(),
            minimized_nodes: HashMap::new(),
            unchecked_nodes: Vec::new(),
            previous_word: String::new(),
        }
    }

    /// Add `word` to the automaton.
    ///
    /// # Panics
    ///
    /// Panics unless `word` compares strictly greater than the previously
    /// inserted word, or if a byte of the new suffix already has an edge.
    fn insert(&mut self, word: &str) {
        assert!(
            word > self.previous_word.as_str(),
            "Words must be inserted in sorted order: '{word}' <= '{}'",
            self.previous_word
        );

        // Number of leading bytes shared with the previous word.
        let shared_len = word
            .bytes()
            .zip(self.previous_word.bytes())
            .take_while(|(new_byte, old_byte)| new_byte == old_byte)
            .count();

        // Everything below the shared prefix is final and can be minimized now.
        self.minimize(shared_len);

        // Resume from the node at the end of the shared prefix.
        let mut cursor: NodeRc = match self.unchecked_nodes.last() {
            None => Rc::clone(&self.root),
            Some(tail) => {
                let parent = tail.parent.borrow();
                let target = parent.children[tail.character as usize].as_ref().unwrap();
                Rc::clone(target)
            }
        };

        for byte in word[shared_len..].bytes() {
            let fresh = Node::new_rc();
            {
                let mut cursor_node = cursor.borrow_mut();
                let slot = &mut cursor_node.children[byte as usize];
                assert!(slot.is_none());
                *slot = Some(Rc::clone(&fresh));
            }
            self.unchecked_nodes.push(UncheckedNode {
                parent: cursor,
                character: byte,
            });
            cursor = fresh;
        }

        cursor.borrow_mut().is_terminal = true;
        self.previous_word = word.to_owned();
    }

    /// Process pending edges, newest first, until only `down_to` remain.
    /// Each edge's target is either redirected to a structurally equal node
    /// that was kept earlier, or kept itself for later reuse.
    fn minimize(&mut self, down_to: usize) {
        while self.unchecked_nodes.len() > down_to {
            let UncheckedNode { parent, character } = self.unchecked_nodes.pop().unwrap();
            let slot = character as usize;

            let candidate = Rc::clone(parent.borrow().children[slot].as_ref().unwrap());
            let hash = candidate.borrow().structure_hash();

            let replacement: Option<NodeRc> = self.minimized_nodes.get(&hash).and_then(|bucket| {
                bucket
                    .iter()
                    .find(|existing| candidate.borrow().structure_eq(&existing.borrow()))
                    .cloned()
            });

            match replacement {
                Some(shared) => {
                    parent.borrow_mut().children[slot] = Some(shared);
                }
                None => {
                    self.minimized_nodes.entry(hash).or_default().push(candidate);
                }
            }
        }
    }

    /// Compute `number` for every node reachable from the root.
    fn calc_numbers(&mut self) {
        self.root.borrow_mut().calc_numbers();
    }

    /// Walk `word` from the root and return its index: at each step the
    /// `number` of every sibling with a smaller byte is added, plus 1 for
    /// each terminal node entered. Returns `None` when an edge is missing.
    /// Relies on `calc_numbers` having been run.
    fn get_unique_index(&self, word: &str) -> Option<usize> {
        let mut index = 0usize;
        let mut cursor = Rc::clone(&self.root);

        for byte in word.bytes() {
            let next = {
                let node = cursor.borrow();
                let slot = usize::from(byte);
                let child = node.children[slot].as_ref()?;
                index += node.children[..slot]
                    .iter()
                    .flatten()
                    .map(|sibling| usize::from(sibling.borrow().number))
                    .sum::<usize>();
                Rc::clone(child)
            };
            index += usize::from(next.borrow().is_terminal);
            cursor = next;
        }

        Some(index)
    }
}
fn queue_children(
node: &NodeRc,
queue: &mut Vec<NodeRc>,
child_indexes: &mut HashMap<*const RefCell<Node>, u16>,
first_available_index: u16,
) -> u16 {
let mut current = first_available_index;
let node_ref = node.borrow();
for c in 0..128u8 {
if let Some(child) = &node_ref.children[c as usize] {
let key = Rc::as_ptr(child);
if let std::collections::hash_map::Entry::Vacant(entry) = child_indexes.entry(key) {
let num_children = child.borrow().num_direct_children();
if num_children > 0 {
entry.insert(current);
current += num_children as u16;
}
queue.push(Rc::clone(child));
}
}
}
current
}
/// Append one `NodeData` row per child of `node` to `node_data`, in ascending
/// byte order, queueing and indexing children that have no entry in
/// `child_indexes` yet (same rules as `queue_children`).
///
/// Each row's `number` is the running sum of the `number` fields of the
/// preceding siblings, and its `child_index` is the index recorded for the
/// child (0 when none is recorded). Returns the next free index.
fn write_children_data(
    node: &NodeRc,
    node_data: &mut Vec<NodeData>,
    queue: &mut Vec<NodeRc>,
    child_indexes: &mut HashMap<*const RefCell<Node>, u16>,
    first_available_index: u16,
) -> u16 {
    use std::collections::hash_map::Entry;

    let parent = node.borrow();
    let mut next_index = first_available_index;
    let mut preceding_total: u8 = 0;

    for (character, slot) in (0u8..128).zip(parent.children[..128].iter()) {
        let Some(child) = slot else {
            continue;
        };
        let key = Rc::as_ptr(child);
        let child_node = child.borrow();
        let fan_out = child_node.num_direct_children();

        if let Entry::Vacant(vacancy) = child_indexes.entry(key) {
            if fan_out > 0 {
                vacancy.insert(next_index);
                next_index += u16::from(fan_out);
            }
            queue.push(Rc::clone(child));
        }

        let child_index = child_indexes.get(&key).map_or(0, |&recorded| recorded);
        node_data.push(NodeData {
            character,
            number: preceding_total,
            end_of_word: child_node.is_terminal,
            child_index,
            children_len: fan_out,
        });

        // NOTE(review): truncating, wrapping arithmetic; presumably every total that
        // is actually written fits in a u8 -- confirm against the entity data.
        preceding_total = preceding_total.wrapping_add(child_node.number as u8);
    }

    next_index
}