ladybird/Libraries/LibWeb/HTML/Parser/Rust/build.rs

821 lines
28 KiB
Rust

/*
* Copyright (c) 2026-present, the Ladybird developers.
*
* SPDX-License-Identifier: BSD-2-Clause
*/
//! Build script that generates the cbindgen FFI header, the interned tag/attribute
//! name tables, and a DAFSA (Deterministic Acyclic Finite State Automaton) for named
//! character reference matching.
use std::cell::RefCell;
use std::collections::HashMap;
use std::env;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::rc::Rc;
/// File name of the C header that `main` writes from the cbindgen bindings.
const FFI_HEADER: &str = "HTMLTokenizerRustFFI.h";
/// Build-script entry point.
///
/// In order, this:
/// 1. runs cbindgen and writes `FFI_HEADER` into `OUT_DIR` (and `FFI_OUTPUT_DIR` if set and different),
/// 2. generates `interned_names_generated.rs` from `TagNames.h` / `AttributeNames.h`,
/// 3. parses `Entities.json`, builds a minimized DAFSA over the entity names, and
/// 4. writes the lookup tables to `named_character_references.rs` in `OUT_DIR`.
///
/// Panics if a required environment variable or input file is missing, or if any of
/// the internal consistency assertions fail.
fn main() {
let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed=cbindgen.toml");
println!("cargo:rerun-if-changed=src");
println!("cargo:rerun-if-env-changed=FFI_OUTPUT_DIR");
// FFI_OUTPUT_DIR is optional; without it the header is only written to OUT_DIR.
let ffi_out_dir = env::var("FFI_OUTPUT_DIR")
.map(PathBuf::from)
.unwrap_or_else(|_| out_dir.clone());
// A cbindgen ParseSyntaxError is deliberately ignored (no header is written in that
// case); any other cbindgen error aborts the build.
cbindgen::generate(&manifest_dir).map_or_else(
|error| match error {
cbindgen::Error::ParseSyntaxError { .. } => {}
e => panic!("{e:?}"),
},
|bindings| {
bindings.write_to_file(out_dir.join(FFI_HEADER));
if ffi_out_dir != out_dir {
bindings.write_to_file(ffi_out_dir.join(FFI_HEADER));
}
},
);
// Generate interned name tables from the existing C++ headers.
let tag_names_header = Path::new(&manifest_dir).join("../../TagNames.h");
let attr_names_header = Path::new(&manifest_dir).join("../../AttributeNames.h");
println!("cargo:rerun-if-changed={}", tag_names_header.display());
println!("cargo:rerun-if-changed={}", attr_names_header.display());
let tag_names = parse_enumerate_macro(
&fs::read_to_string(&tag_names_header).expect("Failed to read TagNames.h"),
"__ENUMERATE_HTML_TAG",
);
let attr_names = parse_enumerate_macro(
&fs::read_to_string(&attr_names_header).expect("Failed to read AttributeNames.h"),
"__ENUMERATE_HTML_ATTRIBUTE",
);
emit_interned_names(&out_dir.join("interned_names_generated.rs"), &tag_names, &attr_names);
let json_path = Path::new(&manifest_dir).join("../Entities.json");
println!("cargo:rerun-if-changed={}", json_path.display());
let json_str = fs::read_to_string(&json_path).expect("Failed to read Entities.json");
// `entities` is sorted by name, which `DafsaBuilder::insert` requires.
let entities = parse_entities_json(&json_str);
// Build DAFSA.
let mut builder = DafsaBuilder::new();
for (name, _, _) in &entities {
builder.insert(name);
}
// Minimize everything that is still pending after the last insertion.
builder.minimize(0);
builder.calc_numbers();
// Verify minimal perfect hashing (no collisions).
// One extra slot because unique indices are 1-based (see the `idx - 1` below).
let mut seen: Vec<bool> = vec![false; entities.len() + 1];
for (name, _, _) in &entities {
let idx = builder.get_unique_index(name).unwrap();
assert!(!seen[idx], "Hash collision at index {idx} for '{name}'");
seen[idx] = true;
}
// Build codepoints lookup table indexed by unique_index.
let mut index_to_codepoints = vec![(0u32, 0u32); entities.len()];
for (name, first, second) in &entities {
let idx = builder.get_unique_index(name).unwrap();
index_to_codepoints[idx - 1] = (*first, *second);
}
// Extract DAFSA layers.
let root = &builder.root;
let root_ref = root.borrow();
// first_layer[k]: sum of `number` over all root children that precede the k-th one.
let mut first_layer: Vec<u16> = Vec::new();
// first_to_second_layer[k]: (bit mask of the k-th root child's children, offset of
// its first child in the flattened second layer).
let mut first_to_second_layer: Vec<(u64, u16)> = Vec::new();
let mut first_layer_tally: u16 = 0;
let mut second_layer_offset: u16 = 0;
for c in 0u8..128 {
if root_ref.children[c as usize].is_none() {
continue;
}
// Every entity name is expected to start with an ASCII letter.
assert!(c.is_ascii_alphabetic());
let child = root_ref.children[c as usize].as_ref().unwrap();
let child_ref = child.borrow();
first_layer.push(first_layer_tally);
first_layer_tally += child_ref.number;
let mask = child_ref.get_ascii_alphabetic_bit_mask();
first_to_second_layer.push((mask, second_layer_offset));
second_layer_offset += child_ref.num_direct_children() as u16;
}
// 52 = 26 uppercase + 26 lowercase first letters.
assert_eq!(first_layer.len(), 52);
// BFS to build DAFSA node array.
// Following the C++ write_node_data three-phase approach:
type NodePtr = Rc<RefCell<Node>>;
let mut queue: Vec<NodePtr> = Vec::new();
// Maps a node (by pointer identity) to the index of its first child in the node array.
let mut child_indexes: HashMap<*const RefCell<Node>, u16> = HashMap::new();
// Phase 1: Queue root's children (first-layer nodes = 52 A-Z/a-z).
// This assigns temporary child_indexes for first-layer children.
queue_children(root, &mut queue, &mut child_indexes, 1);
// Phase 2: Clear indexes and re-process. For each first-layer child,
// queue ITS children (second-layer nodes) and assign their child_indexes.
child_indexes.clear();
let mut first_available_index: u16 = 1; // 0 is reserved (dummy node)
let first_layer_count = queue.len();
for i in 0..first_layer_count {
let node = Rc::clone(&queue[i]);
first_available_index = queue_children(&node, &mut queue, &mut child_indexes, first_available_index);
}
// Remove first-layer nodes from queue, keep only second-layer+ nodes.
let second_layer_nodes: Vec<NodePtr> = queue.drain(first_layer_count..).collect();
queue.clear();
queue.extend(second_layer_nodes);
// Phase 3: BFS remaining nodes, writing node data.
let mut node_data: Vec<NodeData> = Vec::new();
let mut qi = 0;
// The value assigned to `first_available_index` on the final iteration is never
// read, hence the lint suppression.
#[allow(unused_assignments)]
while qi < queue.len() {
let node = Rc::clone(&queue[qi]);
qi += 1;
first_available_index = write_children_data(
&node,
&mut node_data,
&mut queue,
&mut child_indexes,
first_available_index,
);
}
// Build second_layer entries with child_indexes from phase 2.
let mut second_layer: Vec<SecondLayerEntry> = Vec::new();
for c in 0u8..128 {
if root_ref.children[c as usize].is_none() {
continue;
}
let first_child = root_ref.children[c as usize].as_ref().unwrap();
let first_child_ref = first_child.borrow();
// Running sum of `number` over the preceding siblings under this first-layer node.
let mut tally: u8 = 0;
for cc in 0u8..128 {
if first_child_ref.children[cc as usize].is_none() {
continue;
}
let second_child = first_child_ref.children[cc as usize].as_ref().unwrap();
let second_child_ref = second_child.borrow();
let key = Rc::as_ptr(second_child);
// Nodes without children never get a child_indexes entry; they fall back to 0,
// the reserved dummy index.
let ci = child_indexes.get(&key).copied().unwrap_or(0);
let children_len = second_child_ref.num_direct_children();
second_layer.push(SecondLayerEntry {
child_index: ci,
number: tally,
children_len,
end_of_word: second_child_ref.is_terminal,
});
// NOTE(review): `number` is a u16 truncated to u8 and added with wrapping; this
// silently assumes the per-parent tallies fit in a u8 -- confirm.
tally = tally.wrapping_add(second_child_ref.number as u8);
}
}
drop(root_ref);
// Generate output file.
// Re-reads OUT_DIR as a String, shadowing the PathBuf bound at the top of the function.
let out_dir = env::var("OUT_DIR").unwrap();
let out_path = Path::new(&out_dir).join("named_character_references.rs");
let mut out = String::new();
out.push_str("// Auto-generated by build.rs -- do not edit!\n\n");
// Second codepoint enum.
// The variants emitted here must stay in sync with `second_codepoint_variant`.
out.push_str("#[derive(Clone, Copy, PartialEq, Eq)]\n");
out.push_str("#[repr(u8)]\n");
out.push_str("pub enum SecondCodepoint {\n");
out.push_str(" None = 0,\n");
out.push_str(" CombiningLongSolidusOverlay = 1,\n");
out.push_str(" CombiningLongVerticalLineOverlay = 2,\n");
out.push_str(" HairSpace = 3,\n");
out.push_str(" CombiningDoubleLowLine = 4,\n");
out.push_str(" CombiningReverseSolidusOverlay = 5,\n");
out.push_str(" VariationSelector1 = 6,\n");
out.push_str(" LatinSmallLetterJ = 7,\n");
out.push_str(" CombiningMacronBelow = 8,\n");
out.push_str("}\n\n");
out.push_str("impl SecondCodepoint {\n");
out.push_str(" pub fn value(self) -> u32 {\n");
out.push_str(" match self {\n");
out.push_str(" SecondCodepoint::None => 0,\n");
out.push_str(" SecondCodepoint::CombiningLongSolidusOverlay => 0x0338,\n");
out.push_str(" SecondCodepoint::CombiningLongVerticalLineOverlay => 0x20D2,\n");
out.push_str(" SecondCodepoint::HairSpace => 0x200A,\n");
out.push_str(" SecondCodepoint::CombiningDoubleLowLine => 0x0333,\n");
out.push_str(" SecondCodepoint::CombiningReverseSolidusOverlay => 0x20E5,\n");
out.push_str(" SecondCodepoint::VariationSelector1 => 0xFE00,\n");
out.push_str(" SecondCodepoint::LatinSmallLetterJ => 0x006A,\n");
out.push_str(" SecondCodepoint::CombiningMacronBelow => 0x0331,\n");
out.push_str(" }\n");
out.push_str(" }\n");
out.push_str("}\n\n");
// Struct definitions.
out.push_str("#[derive(Clone, Copy)]\n");
out.push_str("pub struct DafsaNode {\n");
out.push_str(" pub character: u8,\n");
out.push_str(" pub number: u8,\n");
out.push_str(" pub end_of_word: bool,\n");
out.push_str(" pub child_index: u16,\n");
out.push_str(" pub children_len: u8,\n");
out.push_str("}\n\n");
out.push_str("#[derive(Clone, Copy)]\n");
out.push_str("pub struct SecondLayerNode {\n");
out.push_str(" pub child_index: u16,\n");
out.push_str(" pub number: u8,\n");
out.push_str(" pub children_len: u8,\n");
out.push_str(" pub end_of_word: bool,\n");
out.push_str("}\n\n");
// Codepoints lookup table.
out.push_str(&format!(
"pub static CODEPOINTS_LOOKUP: [(u32, SecondCodepoint); {}] = [\n",
index_to_codepoints.len()
));
for (first, second) in &index_to_codepoints {
let variant = second_codepoint_variant(*second);
out.push_str(&format!(" ({first:#06X}, SecondCodepoint::{variant}),\n"));
}
out.push_str("];\n\n");
// DAFSA nodes array (with dummy node at index 0).
out.push_str(&format!(
"pub static DAFSA_NODES: [DafsaNode; {}] = [\n",
node_data.len() + 1
));
out.push_str(" DafsaNode { character: 0, number: 0, end_of_word: false, child_index: 0, children_len: 0 },\n");
for nd in &node_data {
out.push_str(&format!(
" DafsaNode {{ character: b'{}', number: {}, end_of_word: {}, child_index: {}, children_len: {} }},\n",
escape_byte(nd.character),
nd.number,
nd.end_of_word,
nd.child_index,
nd.children_len
));
}
out.push_str("];\n\n");
// First layer.
out.push_str(&format!("pub static FIRST_LAYER: [u16; {}] = [\n", first_layer.len()));
for n in &first_layer {
out.push_str(&format!(" {n},\n"));
}
out.push_str("];\n\n");
// First-to-second layer links.
out.push_str(&format!(
"pub static FIRST_TO_SECOND_LAYER: [(u64, u16); {}] = [\n",
first_to_second_layer.len()
));
for (mask, offset) in &first_to_second_layer {
out.push_str(&format!(" ({mask:#018X}, {offset}),\n"));
}
out.push_str("];\n\n");
// Second layer nodes.
out.push_str(&format!(
"pub static SECOND_LAYER: [SecondLayerNode; {}] = [\n",
second_layer.len()
));
for sl in &second_layer {
out.push_str(&format!(
" SecondLayerNode {{ child_index: {}, number: {}, children_len: {}, end_of_word: {} }},\n",
sl.child_index, sl.number, sl.children_len, sl.end_of_word
));
}
out.push_str("];\n\n");
// Total entity count.
out.push_str(&format!("pub const ENTITY_COUNT: usize = {};\n", entities.len()));
fs::write(&out_path, &out).expect("Failed to write generated file");
}
/// Collect the quoted string literal from each `macro_name(ident, "string")`
/// invocation found in a C++ header.
///
/// The header is scanned line by line. Only the first invocation on a line is
/// considered, and the literal is whatever sits between the first pair of double
/// quotes following the opening parenthesis. Lines that do not contain the macro,
/// or that lack a complete quoted literal, contribute nothing.
fn parse_enumerate_macro(source: &str, macro_name: &str) -> Vec<String> {
    let needle = format!("{macro_name}(");
    source
        .lines()
        .filter_map(|line| {
            let (_, arguments) = line.split_once(needle.as_str())?;
            // Pieces are: text before the opening quote, the literal itself, and
            // whatever follows the closing quote.
            let mut pieces = arguments.splitn(3, '"');
            pieces.next()?;
            let literal = pieces.next()?;
            // A missing third piece means the closing quote was never found.
            pieces.next()?;
            Some(literal.to_string())
        })
        .collect()
}
/// Emit a Rust source file with two const byte-slice arrays and two lookup
/// functions that dispatch on length and then on the exact bytes. rustc
/// compiles this pattern to a jump table + direct memcmp, which beats a
/// HashMap lookup with a cryptographic default hasher by a wide margin for
/// the small, fixed set of HTML names.
fn emit_interned_names(out_path: &Path, tag_names: &[String], attr_names: &[String]) {
let mut out = String::new();
out.push_str("// Auto-generated by build.rs from TagNames.h / AttributeNames.h.\n");
out.push_str("// Do not edit by hand.\n\n");
out.push_str("pub const INTERNED_TAG_NAMES: &[&[u8]] = &[\n");
for name in tag_names {
out.push_str(&format!(" b\"{}\",\n", name));
}
out.push_str("];\n\n");
out.push_str("pub const INTERNED_ATTR_NAMES: &[&[u8]] = &[\n");
for name in attr_names {
out.push_str(&format!(" b\"{}\",\n", name));
}
out.push_str("];\n\n");
emit_lookup_fn(&mut out, "lookup_tag_name_generated", tag_names);
emit_lookup_fn(&mut out, "lookup_attr_name_generated", attr_names);
fs::write(out_path, out).expect("Failed to write interned_names_generated.rs");
}
/// Append to `out` the source of a function `fn_name(bytes: &[u8]) -> u16` that
/// maps a name to its 1-based position in `names`, or to 0 when it is unknown.
///
/// The generated function matches on `bytes.len()` first and then on the exact
/// byte string, with length groups emitted in ascending order and names inside
/// a group kept in their original order.
fn emit_lookup_fn(out: &mut String, fn_name: &str, names: &[String]) {
    // Bucket (id, name) pairs by byte length; BTreeMap keeps the lengths sorted.
    let mut buckets: std::collections::BTreeMap<usize, Vec<(usize, &String)>> =
        std::collections::BTreeMap::new();
    for (position, name) in names.iter().enumerate() {
        // Ids are 1-based so that 0 can mean "not found".
        buckets.entry(name.len()).or_default().push((position + 1, name));
    }

    out.push_str(&format!("#[inline]\npub fn {fn_name}(bytes: &[u8]) -> u16 {{\n"));
    out.push_str(" match bytes.len() {\n");
    for (length, members) in &buckets {
        out.push_str(&format!(" {length} => match bytes {{\n"));
        out.extend(
            members
                .iter()
                .map(|(id, name)| format!(" b\"{name}\" => {id},\n")),
        );
        out.push_str(" _ => 0,\n");
        out.push_str(" },\n");
    }
    out.push_str(" _ => 0,\n");
    out.push_str(" }\n");
    out.push_str("}\n\n");
}
/// Render `b` as the contents of a Rust byte literal (the text between the
/// quotes of `b'…'`).
///
/// A single quote and a backslash are backslash-escaped, a space and every
/// graphic ASCII byte are emitted verbatim, and anything else becomes an
/// uppercase `\xNN` escape.
fn escape_byte(b: u8) -> String {
    match b {
        b'\'' => "\\'".to_string(),
        b'\\' => "\\\\".to_string(),
        // 0x20 (space) plus the graphic ASCII range 0x21..=0x7E.
        b' ' | b'!'..=b'~' => char::from(b).to_string(),
        _ => format!("\\x{b:02X}"),
    }
}
/// Map the second codepoint of an entity to the name of the matching
/// `SecondCodepoint` variant in the generated code; 0 means "no second
/// codepoint" and maps to `None`.
///
/// Panics on any codepoint that has no variant.
fn second_codepoint_variant(cp: u32) -> &'static str {
    const VARIANTS: &[(u32, &str)] = &[
        (0, "None"),
        (0x0338, "CombiningLongSolidusOverlay"),
        (0x20D2, "CombiningLongVerticalLineOverlay"),
        (0x200A, "HairSpace"),
        (0x0333, "CombiningDoubleLowLine"),
        (0x20E5, "CombiningReverseSolidusOverlay"),
        (0xFE00, "VariationSelector1"),
        (0x006A, "LatinSmallLetterJ"),
        (0x0331, "CombiningMacronBelow"),
    ];
    VARIANTS
        .iter()
        .find(|(value, _)| *value == cp)
        .map(|(_, name)| *name)
        .unwrap_or_else(|| panic!("Unknown second codepoint: {cp:#X}"))
}
// Minimal JSON parser for Entities.json.
/// Parses a JSON object of the shape `{ "<key>": { "codepoints": [n, ...], ... }, ... }`
/// and returns one `(name, first_codepoint, second_codepoint)` tuple per key,
/// sorted by name.
///
/// `name` is the key with a leading `&` removed if present. Missing codepoints are
/// reported as 0, and any codepoints beyond the second are dropped.
///
/// This is not a general JSON parser: it byte-scans for the structure above, keeps
/// escape sequences in keys as raw text, and panics (failed assertion, out-of-bounds
/// index, or failed `unwrap`) on input that deviates from that shape.
fn parse_entities_json(json: &str) -> Vec<(String, u32, u32)> {
let mut entities = Vec::new();
let bytes = json.as_bytes();
let len = bytes.len();
let mut i = 0;
// Skip everything up to and including the opening '{' of the outer object.
while i < len && bytes[i] != b'{' {
i += 1;
}
i += 1;
loop {
while i < len && bytes[i].is_ascii_whitespace() {
i += 1;
}
// End of input or the closing '}' of the outer object terminates the scan.
if i >= len || bytes[i] == b'}' {
break;
}
if bytes[i] == b',' {
i += 1;
continue;
}
// Parse key.
assert_eq!(bytes[i], b'"');
i += 1;
let key_start = i;
while i < len && bytes[i] != b'"' {
// Step over the escaped character so an escaped quote does not end the key.
if bytes[i] == b'\\' {
i += 1;
}
i += 1;
}
let key = std::str::from_utf8(&bytes[key_start..i]).unwrap().to_string();
i += 1;
// Skip ':'.
while i < len && bytes[i].is_ascii_whitespace() {
i += 1;
}
assert_eq!(bytes[i], b':');
i += 1;
// Skip to inner '{'.
while i < len && bytes[i] != b'{' {
i += 1;
}
i += 1;
// Parse inner object for "codepoints".
let mut codepoints: Vec<u32> = Vec::new();
while i < len && bytes[i] != b'}' {
if bytes[i] == b'"' {
i += 1;
let field_start = i;
// Unlike the key loop above, field names are scanned without escape handling.
while i < len && bytes[i] != b'"' {
i += 1;
}
let field_name = std::str::from_utf8(&bytes[field_start..i]).unwrap();
i += 1;
while i < len && bytes[i].is_ascii_whitespace() {
i += 1;
}
assert_eq!(bytes[i], b':');
i += 1;
while i < len && bytes[i].is_ascii_whitespace() {
i += 1;
}
if field_name == "codepoints" {
assert_eq!(bytes[i], b'[');
i += 1;
loop {
while i < len && (bytes[i].is_ascii_whitespace() || bytes[i] == b',') {
i += 1;
}
if i >= len || bytes[i] == b']' {
i += 1;
break;
}
let num_start = i;
while i < len && bytes[i].is_ascii_digit() {
i += 1;
}
// A non-digit here leaves `num_str` empty and the parse below panics.
let num_str = std::str::from_utf8(&bytes[num_start..i]).unwrap();
codepoints.push(num_str.parse().unwrap());
}
} else {
// Skip value.
// Only string and array values are skipped as a unit; any other value is
// stepped over byte by byte by the enclosing loop.
if bytes[i] == b'"' {
i += 1;
while i < len && bytes[i] != b'"' {
if bytes[i] == b'\\' {
i += 1;
}
i += 1;
}
i += 1;
} else if bytes[i] == b'[' {
let mut depth = 1;
i += 1;
while i < len && depth > 0 {
if bytes[i] == b'[' {
depth += 1;
} else if bytes[i] == b']' {
depth -= 1;
}
i += 1;
}
}
}
} else {
i += 1;
}
}
// Step past the inner object's closing '}'.
i += 1;
let name = key.strip_prefix('&').unwrap_or(&key).to_string();
let first = codepoints.first().copied().unwrap_or(0);
let second = if codepoints.len() > 1 { codepoints[1] } else { 0 };
entities.push((name, first, second));
}
// Sorted output is what `DafsaBuilder::insert` asserts on.
entities.sort_by(|a, b| a.0.cmp(&b.0));
entities
}
// DAFSA builder using Rc<RefCell<Node>> for shared ownership.
/// Shared, mutable handle to a `Node`; minimization makes several parents point at
/// the same node.
type NodeRc = Rc<RefCell<Node>>;
/// One state of the automaton under construction.
struct Node {
/// Outgoing edges, indexed by the byte value of the edge character.
children: Vec<Option<NodeRc>>, // 128 slots
/// True when a word ends at this node.
is_terminal: bool,
/// Filled in by `calc_numbers`: 1 if this node is terminal, plus the sum of the
/// `number` of every child.
number: u16,
}
/// Data for one second-layer node; `main` emits each entry as a `SecondLayerNode`
/// in the generated `SECOND_LAYER` table.
struct SecondLayerEntry {
/// Index assigned to this node's first child, or 0 when it has no children.
child_index: u16,
/// Running (wrapping) sum of `Node::number` over the preceding siblings.
number: u8,
/// Number of direct children of this node.
children_len: u8,
/// True when a word ends at this node.
end_of_word: bool,
}
/// Data for one node past the second layer; `main` emits each entry as a `DafsaNode`
/// in the generated `DAFSA_NODES` table.
struct NodeData {
/// Byte value of the edge leading to this node.
character: u8,
/// Running (wrapping) sum of `Node::number` over the preceding siblings.
number: u8,
/// True when a word ends at this node.
end_of_word: bool,
/// Index assigned to this node's first child, or 0 when it has no children.
child_index: u16,
/// Number of direct children of this node.
children_len: u8,
}
impl Node {
    /// Allocate a fresh, non-terminal node with 128 empty child slots and a
    /// `number` of zero.
    fn new_rc() -> NodeRc {
        let node = Node {
            children: vec![None; 128],
            is_terminal: false,
            number: 0,
        };
        Rc::new(RefCell::new(node))
    }

    /// Recompute `number` for this node and everything below it: 1 if the node
    /// is terminal, plus the `number` of each child. A child shared by several
    /// parents is recomputed once per parent.
    fn calc_numbers(&mut self) {
        let mut total = u16::from(self.is_terminal);
        for child in self.children.iter().flatten() {
            let mut child_node = child.borrow_mut();
            child_node.calc_numbers();
            total += child_node.number;
        }
        self.number = total;
    }

    /// Count the occupied child slots.
    fn num_direct_children(&self) -> u8 {
        // There are at most 128 slots, so the count always fits in a u8.
        self.children.iter().filter(|slot| slot.is_some()).count() as u8
    }

    /// Build a bit mask with one bit per occupied child slot, at the position
    /// given by `ascii_alphabetic_to_index` for that slot's byte.
    fn get_ascii_alphabetic_bit_mask(&self) -> u64 {
        (0..128u8)
            .filter(|&byte| self.children[usize::from(byte)].is_some())
            .fold(0u64, |mask, byte| {
                mask | (1u64 << ascii_alphabetic_to_index(byte))
            })
    }

    /// Hash based on terminal status and on each child's slot and identity
    /// (the address behind its `Rc`).
    fn structure_hash(&self) -> u64 {
        self.children
            .iter()
            .enumerate()
            .filter_map(|(slot, child)| child.as_ref().map(|rc| (slot, rc)))
            .fold(u64::from(self.is_terminal), |hash, (slot, rc)| {
                hash.wrapping_mul(31)
                    .wrapping_add(slot as u64)
                    .wrapping_mul(31)
                    .wrapping_add(Rc::as_ptr(rc) as u64)
            })
    }

    /// Structural equality: same terminal status and, slot by slot, either both
    /// empty or both pointing at the very same child allocation.
    fn structure_eq(&self, other: &Node) -> bool {
        self.is_terminal == other.is_terminal
            && self
                .children
                .iter()
                .zip(&other.children)
                .all(|pair| match pair {
                    (None, None) => true,
                    (Some(mine), Some(theirs)) => Rc::ptr_eq(mine, theirs),
                    _ => false,
                })
    }
}
/// Map an ASCII letter to a dense index: `A`-`Z` become 0-25 and `a`-`z`
/// become 26-51. The result is not meaningful for non-alphabetic bytes.
fn ascii_alphabetic_to_index(c: u8) -> u8 {
    let (base, offset) = if c > b'Z' { (b'a', 26) } else { (b'A', 0) };
    c - base + offset
}
/// An edge created by `DafsaBuilder::insert` whose target node has not yet been
/// through `DafsaBuilder::minimize`.
struct UncheckedNode {
/// Node the edge starts from.
parent: NodeRc,
/// Byte labelling the edge; the target is `parent.children[character]`.
character: u8,
}
/// Incremental DAFSA builder; words must be inserted in strictly ascending order.
struct DafsaBuilder {
/// Start state of the automaton.
root: NodeRc,
/// Nodes already kept by `minimize`, bucketed by `Node::structure_hash`.
minimized_nodes: HashMap<u64, Vec<NodeRc>>,
/// Edges added by `insert` that `minimize` has not processed yet, in insertion order.
unchecked_nodes: Vec<UncheckedNode>,
/// The most recently inserted word, used for the ordering check and the
/// common-prefix computation in `insert`.
previous_word: String,
}
impl DafsaBuilder {
    /// Create a builder holding only an empty root node.
    fn new() -> Self {
        Self {
            root: Node::new_rc(),
            minimized_nodes: HashMap::new(),
            unchecked_nodes: Vec::new(),
            previous_word: String::new(),
        }
    }

    /// Add `word` to the automaton.
    ///
    /// Panics unless `word` is strictly greater than the previously inserted
    /// word. Nodes past the prefix shared with the previous word are minimized
    /// first, then a fresh chain of nodes is appended for the remaining bytes.
    fn insert(&mut self, word: &str) {
        assert!(
            word > self.previous_word.as_str(),
            "Words must be inserted in sorted order: '{word}' <= '{}'",
            self.previous_word
        );

        let shared_len = word
            .bytes()
            .zip(self.previous_word.bytes())
            .take_while(|(new_byte, old_byte)| new_byte == old_byte)
            .count();
        self.minimize(shared_len);

        // Resume from the deepest node still pending, or from the root when
        // nothing is pending.
        let mut tail: NodeRc = match self.unchecked_nodes.last() {
            None => Rc::clone(&self.root),
            Some(edge) => {
                let parent = edge.parent.borrow();
                let target = parent.children[usize::from(edge.character)]
                    .as_ref()
                    .unwrap();
                Rc::clone(target)
            }
        };

        for byte in word[shared_len..].bytes() {
            let fresh = Node::new_rc();
            {
                let mut tail_node = tail.borrow_mut();
                let slot = &mut tail_node.children[usize::from(byte)];
                assert!(slot.is_none());
                *slot = Some(Rc::clone(&fresh));
            }
            self.unchecked_nodes.push(UncheckedNode {
                parent: Rc::clone(&tail),
                character: byte,
            });
            tail = fresh;
        }

        tail.borrow_mut().is_terminal = true;
        self.previous_word = word.to_owned();
    }

    /// Process pending edges from the most recent backwards until only
    /// `down_to` remain. Each edge's target is either replaced by a
    /// structurally identical node that was kept earlier, or is itself recorded
    /// as a kept node.
    fn minimize(&mut self, down_to: usize) {
        while self.unchecked_nodes.len() > down_to {
            let UncheckedNode { parent, character } = self.unchecked_nodes.pop().unwrap();
            let slot = usize::from(character);
            let candidate = Rc::clone(parent.borrow().children[slot].as_ref().unwrap());
            let hash = candidate.borrow().structure_hash();

            let replacement = self.minimized_nodes.get(&hash).and_then(|bucket| {
                bucket
                    .iter()
                    .find(|existing| candidate.borrow().structure_eq(&existing.borrow()))
                    .cloned()
            });

            match replacement {
                Some(existing) => parent.borrow_mut().children[slot] = Some(existing),
                None => self.minimized_nodes.entry(hash).or_default().push(candidate),
            }
        }
    }

    /// Compute `number` for every node reachable from the root.
    fn calc_numbers(&mut self) {
        self.root.borrow_mut().calc_numbers();
    }

    /// Walk `word` from the root and return its index accumulated from the
    /// `number` fields, or `None` if some byte has no matching edge. Requires
    /// `calc_numbers` to have run. Words in the automaton get indices starting
    /// at 1.
    fn get_unique_index(&self, word: &str) -> Option<usize> {
        let mut index = 0usize;
        let mut cursor = Rc::clone(&self.root);
        for byte in word.bytes() {
            let slot = usize::from(byte);
            let next = {
                let node = cursor.borrow();
                let child = node.children[slot].as_ref()?;
                // Every word reachable through a smaller sibling sorts before this one.
                index += node.children[..slot]
                    .iter()
                    .flatten()
                    .map(|sibling| usize::from(sibling.borrow().number))
                    .sum::<usize>();
                Rc::clone(child)
            };
            // A terminal node on the path is itself a word that sorts no later than `word`.
            index += usize::from(next.borrow().is_terminal);
            cursor = next;
        }
        Some(index)
    }
}
/// Append the children of `node` to `queue` and reserve index ranges for their
/// own children.
///
/// A child that already has an entry in `child_indexes` is skipped. Otherwise,
/// if it has at least one child of its own, it is assigned the next free index
/// and that index advances by its child count; in either case it is pushed onto
/// `queue`. Returns the next free index after processing.
fn queue_children(
    node: &NodeRc,
    queue: &mut Vec<NodeRc>,
    child_indexes: &mut HashMap<*const RefCell<Node>, u16>,
    first_available_index: u16,
) -> u16 {
    let mut next_index = first_available_index;
    let parent = node.borrow();
    for child in parent.children.iter().flatten() {
        let key = Rc::as_ptr(child);
        if child_indexes.contains_key(&key) {
            continue;
        }
        let fan_out = child.borrow().num_direct_children();
        // Childless nodes get no entry, so they are queued again on every encounter.
        if fan_out > 0 {
            child_indexes.insert(key, next_index);
            next_index += u16::from(fan_out);
        }
        queue.push(Rc::clone(child));
    }
    next_index
}
/// Append one `NodeData` record per child of `node` to `node_data`, queueing
/// and indexing children that have not been seen before.
///
/// Index assignment follows the same rule as `queue_children`. Each record
/// carries the edge byte, the running (wrapping) sum of the preceding siblings'
/// `number`, the terminal flag, the child's assigned index (0 when it has none)
/// and its child count. Returns the next free index after processing.
fn write_children_data(
    node: &NodeRc,
    node_data: &mut Vec<NodeData>,
    queue: &mut Vec<NodeRc>,
    child_indexes: &mut HashMap<*const RefCell<Node>, u16>,
    first_available_index: u16,
) -> u16 {
    let mut next_index = first_available_index;
    let mut preceding_total: u8 = 0;
    let parent = node.borrow();
    for (byte, slot) in (0..128u8).zip(parent.children.iter()) {
        let Some(child) = slot else {
            continue;
        };
        let key = Rc::as_ptr(child);
        let child_node = child.borrow();
        let fan_out = child_node.num_direct_children();

        if !child_indexes.contains_key(&key) {
            if fan_out > 0 {
                child_indexes.insert(key, next_index);
                next_index += u16::from(fan_out);
            }
            queue.push(Rc::clone(child));
        }

        let child_index = child_indexes.get(&key).copied().unwrap_or(0);
        node_data.push(NodeData {
            character: byte,
            number: preceding_total,
            end_of_word: child_node.is_terminal,
            child_index,
            children_len: fan_out,
        });
        // `number` is a u16; it is deliberately truncated to u8 here, as in the
        // emitted table's field type.
        preceding_total = preceding_total.wrapping_add(child_node.number as u8);
    }
    next_index
}