mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2026-06-24 18:30:23 +00:00
821 lines
28 KiB
Rust
821 lines
28 KiB
Rust
/*
|
|
* Copyright (c) 2026-present, the Ladybird developers.
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
//! Build script that generates the C FFI header (via cbindgen), the interned
//! tag/attribute name tables, and a DAFSA (Deterministic Acyclic Finite State
//! Automaton) for named character reference matching.
|
|
|
|
use std::cell::RefCell;
|
|
use std::collections::HashMap;
|
|
use std::env;
|
|
use std::fs;
|
|
use std::path::Path;
|
|
use std::path::PathBuf;
|
|
use std::rc::Rc;
|
|
|
|
const FFI_HEADER: &str = "HTMLTokenizerRustFFI.h";
|
|
|
|
fn main() {
|
|
let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
|
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
|
|
|
|
println!("cargo:rerun-if-changed=build.rs");
|
|
println!("cargo:rerun-if-changed=cbindgen.toml");
|
|
println!("cargo:rerun-if-changed=src");
|
|
println!("cargo:rerun-if-env-changed=FFI_OUTPUT_DIR");
|
|
|
|
let ffi_out_dir = env::var("FFI_OUTPUT_DIR")
|
|
.map(PathBuf::from)
|
|
.unwrap_or_else(|_| out_dir.clone());
|
|
|
|
cbindgen::generate(&manifest_dir).map_or_else(
|
|
|error| match error {
|
|
cbindgen::Error::ParseSyntaxError { .. } => {}
|
|
e => panic!("{e:?}"),
|
|
},
|
|
|bindings| {
|
|
bindings.write_to_file(out_dir.join(FFI_HEADER));
|
|
if ffi_out_dir != out_dir {
|
|
bindings.write_to_file(ffi_out_dir.join(FFI_HEADER));
|
|
}
|
|
},
|
|
);
|
|
|
|
// Generate interned name tables from the existing C++ headers.
|
|
let tag_names_header = Path::new(&manifest_dir).join("../../TagNames.h");
|
|
let attr_names_header = Path::new(&manifest_dir).join("../../AttributeNames.h");
|
|
println!("cargo:rerun-if-changed={}", tag_names_header.display());
|
|
println!("cargo:rerun-if-changed={}", attr_names_header.display());
|
|
let tag_names = parse_enumerate_macro(
|
|
&fs::read_to_string(&tag_names_header).expect("Failed to read TagNames.h"),
|
|
"__ENUMERATE_HTML_TAG",
|
|
);
|
|
let attr_names = parse_enumerate_macro(
|
|
&fs::read_to_string(&attr_names_header).expect("Failed to read AttributeNames.h"),
|
|
"__ENUMERATE_HTML_ATTRIBUTE",
|
|
);
|
|
emit_interned_names(&out_dir.join("interned_names_generated.rs"), &tag_names, &attr_names);
|
|
|
|
let json_path = Path::new(&manifest_dir).join("../Entities.json");
|
|
println!("cargo:rerun-if-changed={}", json_path.display());
|
|
|
|
let json_str = fs::read_to_string(&json_path).expect("Failed to read Entities.json");
|
|
let entities = parse_entities_json(&json_str);
|
|
|
|
// Build DAFSA.
|
|
let mut builder = DafsaBuilder::new();
|
|
for (name, _, _) in &entities {
|
|
builder.insert(name);
|
|
}
|
|
builder.minimize(0);
|
|
builder.calc_numbers();
|
|
|
|
// Verify minimal perfect hashing (no collisions).
|
|
let mut seen: Vec<bool> = vec![false; entities.len() + 1];
|
|
for (name, _, _) in &entities {
|
|
let idx = builder.get_unique_index(name).unwrap();
|
|
assert!(!seen[idx], "Hash collision at index {idx} for '{name}'");
|
|
seen[idx] = true;
|
|
}
|
|
|
|
// Build codepoints lookup table indexed by unique_index.
|
|
let mut index_to_codepoints = vec![(0u32, 0u32); entities.len()];
|
|
for (name, first, second) in &entities {
|
|
let idx = builder.get_unique_index(name).unwrap();
|
|
index_to_codepoints[idx - 1] = (*first, *second);
|
|
}
|
|
|
|
// Extract DAFSA layers.
|
|
let root = &builder.root;
|
|
let root_ref = root.borrow();
|
|
|
|
let mut first_layer: Vec<u16> = Vec::new();
|
|
let mut first_to_second_layer: Vec<(u64, u16)> = Vec::new();
|
|
|
|
let mut first_layer_tally: u16 = 0;
|
|
let mut second_layer_offset: u16 = 0;
|
|
|
|
for c in 0u8..128 {
|
|
if root_ref.children[c as usize].is_none() {
|
|
continue;
|
|
}
|
|
assert!(c.is_ascii_alphabetic());
|
|
let child = root_ref.children[c as usize].as_ref().unwrap();
|
|
let child_ref = child.borrow();
|
|
|
|
first_layer.push(first_layer_tally);
|
|
first_layer_tally += child_ref.number;
|
|
|
|
let mask = child_ref.get_ascii_alphabetic_bit_mask();
|
|
first_to_second_layer.push((mask, second_layer_offset));
|
|
second_layer_offset += child_ref.num_direct_children() as u16;
|
|
}
|
|
assert_eq!(first_layer.len(), 52);
|
|
|
|
// BFS to build DAFSA node array.
|
|
// Following the C++ write_node_data three-phase approach:
|
|
type NodePtr = Rc<RefCell<Node>>;
|
|
let mut queue: Vec<NodePtr> = Vec::new();
|
|
let mut child_indexes: HashMap<*const RefCell<Node>, u16> = HashMap::new();
|
|
|
|
// Phase 1: Queue root's children (first-layer nodes = 52 A-Z/a-z).
|
|
// This assigns temporary child_indexes for first-layer children.
|
|
queue_children(root, &mut queue, &mut child_indexes, 1);
|
|
|
|
// Phase 2: Clear indexes and re-process. For each first-layer child,
|
|
// queue ITS children (second-layer nodes) and assign their child_indexes.
|
|
child_indexes.clear();
|
|
let mut first_available_index: u16 = 1; // 0 is reserved (dummy node)
|
|
let first_layer_count = queue.len();
|
|
for i in 0..first_layer_count {
|
|
let node = Rc::clone(&queue[i]);
|
|
first_available_index = queue_children(&node, &mut queue, &mut child_indexes, first_available_index);
|
|
}
|
|
// Remove first-layer nodes from queue, keep only second-layer+ nodes.
|
|
let second_layer_nodes: Vec<NodePtr> = queue.drain(first_layer_count..).collect();
|
|
queue.clear();
|
|
queue.extend(second_layer_nodes);
|
|
|
|
// Phase 3: BFS remaining nodes, writing node data.
|
|
let mut node_data: Vec<NodeData> = Vec::new();
|
|
let mut qi = 0;
|
|
#[allow(unused_assignments)]
|
|
while qi < queue.len() {
|
|
let node = Rc::clone(&queue[qi]);
|
|
qi += 1;
|
|
first_available_index = write_children_data(
|
|
&node,
|
|
&mut node_data,
|
|
&mut queue,
|
|
&mut child_indexes,
|
|
first_available_index,
|
|
);
|
|
}
|
|
|
|
// Build second_layer entries with child_indexes from phase 2.
|
|
let mut second_layer: Vec<SecondLayerEntry> = Vec::new();
|
|
for c in 0u8..128 {
|
|
if root_ref.children[c as usize].is_none() {
|
|
continue;
|
|
}
|
|
let first_child = root_ref.children[c as usize].as_ref().unwrap();
|
|
let first_child_ref = first_child.borrow();
|
|
let mut tally: u8 = 0;
|
|
for cc in 0u8..128 {
|
|
if first_child_ref.children[cc as usize].is_none() {
|
|
continue;
|
|
}
|
|
let second_child = first_child_ref.children[cc as usize].as_ref().unwrap();
|
|
let second_child_ref = second_child.borrow();
|
|
let key = Rc::as_ptr(second_child);
|
|
let ci = child_indexes.get(&key).copied().unwrap_or(0);
|
|
let children_len = second_child_ref.num_direct_children();
|
|
second_layer.push(SecondLayerEntry {
|
|
child_index: ci,
|
|
number: tally,
|
|
children_len,
|
|
end_of_word: second_child_ref.is_terminal,
|
|
});
|
|
tally = tally.wrapping_add(second_child_ref.number as u8);
|
|
}
|
|
}
|
|
drop(root_ref);
|
|
|
|
// Generate output file.
|
|
let out_dir = env::var("OUT_DIR").unwrap();
|
|
let out_path = Path::new(&out_dir).join("named_character_references.rs");
|
|
let mut out = String::new();
|
|
|
|
out.push_str("// Auto-generated by build.rs -- do not edit!\n\n");
|
|
|
|
// Second codepoint enum.
|
|
out.push_str("#[derive(Clone, Copy, PartialEq, Eq)]\n");
|
|
out.push_str("#[repr(u8)]\n");
|
|
out.push_str("pub enum SecondCodepoint {\n");
|
|
out.push_str(" None = 0,\n");
|
|
out.push_str(" CombiningLongSolidusOverlay = 1,\n");
|
|
out.push_str(" CombiningLongVerticalLineOverlay = 2,\n");
|
|
out.push_str(" HairSpace = 3,\n");
|
|
out.push_str(" CombiningDoubleLowLine = 4,\n");
|
|
out.push_str(" CombiningReverseSolidusOverlay = 5,\n");
|
|
out.push_str(" VariationSelector1 = 6,\n");
|
|
out.push_str(" LatinSmallLetterJ = 7,\n");
|
|
out.push_str(" CombiningMacronBelow = 8,\n");
|
|
out.push_str("}\n\n");
|
|
|
|
out.push_str("impl SecondCodepoint {\n");
|
|
out.push_str(" pub fn value(self) -> u32 {\n");
|
|
out.push_str(" match self {\n");
|
|
out.push_str(" SecondCodepoint::None => 0,\n");
|
|
out.push_str(" SecondCodepoint::CombiningLongSolidusOverlay => 0x0338,\n");
|
|
out.push_str(" SecondCodepoint::CombiningLongVerticalLineOverlay => 0x20D2,\n");
|
|
out.push_str(" SecondCodepoint::HairSpace => 0x200A,\n");
|
|
out.push_str(" SecondCodepoint::CombiningDoubleLowLine => 0x0333,\n");
|
|
out.push_str(" SecondCodepoint::CombiningReverseSolidusOverlay => 0x20E5,\n");
|
|
out.push_str(" SecondCodepoint::VariationSelector1 => 0xFE00,\n");
|
|
out.push_str(" SecondCodepoint::LatinSmallLetterJ => 0x006A,\n");
|
|
out.push_str(" SecondCodepoint::CombiningMacronBelow => 0x0331,\n");
|
|
out.push_str(" }\n");
|
|
out.push_str(" }\n");
|
|
out.push_str("}\n\n");
|
|
|
|
// Struct definitions.
|
|
out.push_str("#[derive(Clone, Copy)]\n");
|
|
out.push_str("pub struct DafsaNode {\n");
|
|
out.push_str(" pub character: u8,\n");
|
|
out.push_str(" pub number: u8,\n");
|
|
out.push_str(" pub end_of_word: bool,\n");
|
|
out.push_str(" pub child_index: u16,\n");
|
|
out.push_str(" pub children_len: u8,\n");
|
|
out.push_str("}\n\n");
|
|
|
|
out.push_str("#[derive(Clone, Copy)]\n");
|
|
out.push_str("pub struct SecondLayerNode {\n");
|
|
out.push_str(" pub child_index: u16,\n");
|
|
out.push_str(" pub number: u8,\n");
|
|
out.push_str(" pub children_len: u8,\n");
|
|
out.push_str(" pub end_of_word: bool,\n");
|
|
out.push_str("}\n\n");
|
|
|
|
// Codepoints lookup table.
|
|
out.push_str(&format!(
|
|
"pub static CODEPOINTS_LOOKUP: [(u32, SecondCodepoint); {}] = [\n",
|
|
index_to_codepoints.len()
|
|
));
|
|
for (first, second) in &index_to_codepoints {
|
|
let variant = second_codepoint_variant(*second);
|
|
out.push_str(&format!(" ({first:#06X}, SecondCodepoint::{variant}),\n"));
|
|
}
|
|
out.push_str("];\n\n");
|
|
|
|
// DAFSA nodes array (with dummy node at index 0).
|
|
out.push_str(&format!(
|
|
"pub static DAFSA_NODES: [DafsaNode; {}] = [\n",
|
|
node_data.len() + 1
|
|
));
|
|
out.push_str(" DafsaNode { character: 0, number: 0, end_of_word: false, child_index: 0, children_len: 0 },\n");
|
|
for nd in &node_data {
|
|
out.push_str(&format!(
|
|
" DafsaNode {{ character: b'{}', number: {}, end_of_word: {}, child_index: {}, children_len: {} }},\n",
|
|
escape_byte(nd.character),
|
|
nd.number,
|
|
nd.end_of_word,
|
|
nd.child_index,
|
|
nd.children_len
|
|
));
|
|
}
|
|
out.push_str("];\n\n");
|
|
|
|
// First layer.
|
|
out.push_str(&format!("pub static FIRST_LAYER: [u16; {}] = [\n", first_layer.len()));
|
|
for n in &first_layer {
|
|
out.push_str(&format!(" {n},\n"));
|
|
}
|
|
out.push_str("];\n\n");
|
|
|
|
// First-to-second layer links.
|
|
out.push_str(&format!(
|
|
"pub static FIRST_TO_SECOND_LAYER: [(u64, u16); {}] = [\n",
|
|
first_to_second_layer.len()
|
|
));
|
|
for (mask, offset) in &first_to_second_layer {
|
|
out.push_str(&format!(" ({mask:#018X}, {offset}),\n"));
|
|
}
|
|
out.push_str("];\n\n");
|
|
|
|
// Second layer nodes.
|
|
out.push_str(&format!(
|
|
"pub static SECOND_LAYER: [SecondLayerNode; {}] = [\n",
|
|
second_layer.len()
|
|
));
|
|
for sl in &second_layer {
|
|
out.push_str(&format!(
|
|
" SecondLayerNode {{ child_index: {}, number: {}, children_len: {}, end_of_word: {} }},\n",
|
|
sl.child_index, sl.number, sl.children_len, sl.end_of_word
|
|
));
|
|
}
|
|
out.push_str("];\n\n");
|
|
|
|
// Total entity count.
|
|
out.push_str(&format!("pub const ENTITY_COUNT: usize = {};\n", entities.len()));
|
|
|
|
fs::write(&out_path, &out).expect("Failed to write generated file");
|
|
}
|
|
|
|
/// Collect the string literal from every `macro_name(ident, "string")`
/// invocation in a C++ header.
///
/// The header is scanned line by line. On each line only the first occurrence
/// of `macro_name(` is considered, and the text between the first pair of
/// double quotes after it is taken. Lines without the macro, or without a
/// complete quoted literal after it, are skipped. Results keep source order.
fn parse_enumerate_macro(source: &str, macro_name: &str) -> Vec<String> {
    let needle = format!("{macro_name}(");
    source
        .lines()
        .filter_map(|line| {
            let (_, arguments) = line.split_once(needle.as_str())?;
            // The quoted literal is the macro's second argument; the first
            // argument is a bare identifier, so the first quote starts it.
            let (_, after_open_quote) = arguments.split_once('"')?;
            let (literal, _) = after_open_quote.split_once('"')?;
            Some(literal.to_string())
        })
        .collect()
}
|
|
|
|
/// Emit a Rust source file with two const byte-slice arrays and two lookup
|
|
/// functions that dispatch on length and then on the exact bytes. rustc
|
|
/// compiles this pattern to a jump table + direct memcmp, which beats a
|
|
/// HashMap lookup with a cryptographic default hasher by a wide margin for
|
|
/// the small, fixed set of HTML names.
|
|
fn emit_interned_names(out_path: &Path, tag_names: &[String], attr_names: &[String]) {
|
|
let mut out = String::new();
|
|
out.push_str("// Auto-generated by build.rs from TagNames.h / AttributeNames.h.\n");
|
|
out.push_str("// Do not edit by hand.\n\n");
|
|
|
|
out.push_str("pub const INTERNED_TAG_NAMES: &[&[u8]] = &[\n");
|
|
for name in tag_names {
|
|
out.push_str(&format!(" b\"{}\",\n", name));
|
|
}
|
|
out.push_str("];\n\n");
|
|
|
|
out.push_str("pub const INTERNED_ATTR_NAMES: &[&[u8]] = &[\n");
|
|
for name in attr_names {
|
|
out.push_str(&format!(" b\"{}\",\n", name));
|
|
}
|
|
out.push_str("];\n\n");
|
|
|
|
emit_lookup_fn(&mut out, "lookup_tag_name_generated", tag_names);
|
|
emit_lookup_fn(&mut out, "lookup_attr_name_generated", attr_names);
|
|
|
|
fs::write(out_path, out).expect("Failed to write interned_names_generated.rs");
|
|
}
|
|
|
|
/// Append to `out` the source of `pub fn {fn_name}(bytes: &[u8]) -> u16`.
///
/// The generated function matches first on `bytes.len()` and then on the exact
/// byte string, returning the 1-based position of the name within `names`, or
/// `0` when nothing matches. Length groups are emitted in ascending order;
/// within a group names keep their order from `names`.
fn emit_lookup_fn(out: &mut String, fn_name: &str, names: &[String]) {
    // Bucket (position, name) pairs by byte length so the outer dispatch can
    // be a single match.
    let mut buckets: std::collections::BTreeMap<usize, Vec<(usize, &String)>> = std::collections::BTreeMap::new();
    for indexed in names.iter().enumerate() {
        buckets.entry(indexed.1.len()).or_default().push(indexed);
    }

    out.push_str(&format!("#[inline]\npub fn {fn_name}(bytes: &[u8]) -> u16 {{\n"));
    out.push_str(" match bytes.len() {\n");
    for (length, entries) in &buckets {
        out.push_str(&format!(" {length} => match bytes {{\n"));
        let arms: String = entries
            .iter()
            .map(|&(index, name)| {
                // Ids are 1-based; 0 is reserved for "not found".
                let id = index + 1;
                format!(" b\"{name}\" => {id},\n")
            })
            .collect();
        out.push_str(&arms);
        out.push_str(" _ => 0,\n");
        out.push_str(" },\n");
    }
    out.push_str(" _ => 0,\n");
    out.push_str(" }\n");
    out.push_str("}\n\n");
}
|
|
|
|
/// Render `b` as the body of a Rust byte literal (the text between `b'` and `'`).
///
/// A single quote and a backslash are backslash-escaped, printable ASCII and the
/// space character are emitted verbatim, and every other byte becomes a
/// two-digit uppercase `\xNN` escape.
fn escape_byte(b: u8) -> String {
    match b {
        b'\'' => "\\'".to_string(),
        b'\\' => "\\\\".to_string(),
        b' ' => String::from(b as char),
        _ if b.is_ascii_graphic() => String::from(b as char),
        _ => format!("\\x{b:02X}"),
    }
}
|
|
|
|
/// Map an entity's second codepoint to the name of the matching
/// `SecondCodepoint` variant in the generated code.
///
/// `0` means "no second codepoint" and maps to `"None"`.
///
/// # Panics
///
/// Panics if `cp` is not one of the known second codepoints.
fn second_codepoint_variant(cp: u32) -> &'static str {
    const VARIANTS: [(u32, &str); 9] = [
        (0, "None"),
        (0x0338, "CombiningLongSolidusOverlay"),
        (0x20D2, "CombiningLongVerticalLineOverlay"),
        (0x200A, "HairSpace"),
        (0x0333, "CombiningDoubleLowLine"),
        (0x20E5, "CombiningReverseSolidusOverlay"),
        (0xFE00, "VariationSelector1"),
        (0x006A, "LatinSmallLetterJ"),
        (0x0331, "CombiningMacronBelow"),
    ];

    let Some((_, variant)) = VARIANTS.iter().find(|(value, _)| *value == cp) else {
        panic!("Unknown second codepoint: {cp:#X}");
    };
    variant
}
|
|
|
|
/// Minimal, purpose-built JSON scanner for `Entities.json`.
///
/// Expects a top-level object whose keys are entity names (optionally starting
/// with `&`) and whose values are objects with a `"codepoints"` array of
/// non-negative decimal integers. Other fields are skipped.
///
/// Returns `(name, first_codepoint, second_codepoint)` tuples sorted by name,
/// where the leading `&` has been stripped from the name and a missing
/// codepoint is reported as `0`.
///
/// This is not a general JSON parser: it panics on input that does not have
/// the expected shape (failed assertions, out-of-bounds indexing, or a
/// codepoint that does not parse as `u32`).
fn parse_entities_json(json: &str) -> Vec<(String, u32, u32)> {
    /// Advance `pos` for as long as it is in bounds and `keep_going` accepts the byte.
    fn advance_while(bytes: &[u8], mut pos: usize, keep_going: impl Fn(u8) -> bool) -> usize {
        while pos < bytes.len() && keep_going(bytes[pos]) {
            pos += 1;
        }
        pos
    }

    /// Advance `pos` past any ASCII whitespace.
    fn skip_whitespace(bytes: &[u8], pos: usize) -> usize {
        advance_while(bytes, pos, |b| b.is_ascii_whitespace())
    }

    /// Starting just after an opening quote, return the position of the closing
    /// quote, stepping over backslash-escaped bytes.
    fn find_closing_quote(bytes: &[u8], mut pos: usize) -> usize {
        while pos < bytes.len() && bytes[pos] != b'"' {
            if bytes[pos] == b'\\' {
                pos += 1;
            }
            pos += 1;
        }
        pos
    }

    let bytes = json.as_bytes();
    let mut entities = Vec::new();

    // Step inside the outer object.
    let mut pos = advance_while(bytes, 0, |b| b != b'{') + 1;

    loop {
        pos = skip_whitespace(bytes, pos);
        match bytes.get(pos).copied() {
            None | Some(b'}') => break,
            Some(b',') => {
                pos += 1;
                continue;
            }
            Some(_) => {}
        }

        // Parse key.
        assert_eq!(bytes[pos], b'"');
        let key_start = pos + 1;
        pos = find_closing_quote(bytes, key_start);
        let key = std::str::from_utf8(&bytes[key_start..pos]).unwrap().to_string();

        // Skip ':'.
        pos = skip_whitespace(bytes, pos + 1);
        assert_eq!(bytes[pos], b':');

        // Skip to just past the inner '{'.
        pos = advance_while(bytes, pos + 1, |b| b != b'{') + 1;

        // Parse inner object for "codepoints".
        let mut codepoints: Vec<u32> = Vec::new();
        while pos < bytes.len() && bytes[pos] != b'}' {
            if bytes[pos] != b'"' {
                // Separators, whitespace and unquoted values are stepped over
                // one byte at a time.
                pos += 1;
                continue;
            }

            // Field names are scanned without escape handling.
            let field_start = pos + 1;
            pos = advance_while(bytes, field_start, |b| b != b'"');
            let field_name = std::str::from_utf8(&bytes[field_start..pos]).unwrap();

            pos = skip_whitespace(bytes, pos + 1);
            assert_eq!(bytes[pos], b':');
            pos = skip_whitespace(bytes, pos + 1);

            if field_name == "codepoints" {
                assert_eq!(bytes[pos], b'[');
                pos += 1;
                loop {
                    pos = advance_while(bytes, pos, |b| b.is_ascii_whitespace() || b == b',');
                    if pos >= bytes.len() || bytes[pos] == b']' {
                        pos += 1;
                        break;
                    }
                    let num_start = pos;
                    pos = advance_while(bytes, pos, |b| b.is_ascii_digit());
                    let num_str = std::str::from_utf8(&bytes[num_start..pos]).unwrap();
                    codepoints.push(num_str.parse().unwrap());
                }
            } else {
                // Skip value: only strings and (nested) arrays are skipped as a unit.
                match bytes[pos] {
                    b'"' => {
                        pos = find_closing_quote(bytes, pos + 1) + 1;
                    }
                    b'[' => {
                        let mut depth = 1;
                        pos += 1;
                        while pos < bytes.len() && depth > 0 {
                            match bytes[pos] {
                                b'[' => depth += 1,
                                b']' => depth -= 1,
                                _ => {}
                            }
                            pos += 1;
                        }
                    }
                    _ => {}
                }
            }
        }
        // Step past the inner '}'.
        pos += 1;

        let name = key.strip_prefix('&').unwrap_or(&key).to_string();
        let first = codepoints.first().copied().unwrap_or(0);
        let second = codepoints.get(1).copied().unwrap_or(0);
        entities.push((name, first, second));
    }

    entities.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
    entities
}
|
|
|
|
// DAFSA builder using Rc<RefCell<Node>> for shared ownership.
|
|
|
|
/// Shared, mutable handle to a DAFSA node. Minimization makes several parents
/// point at the same node, hence `Rc`.
type NodeRc = Rc<RefCell<Node>>;

/// One state of the automaton while it is being built.
struct Node {
    /// Outgoing edges indexed by ASCII byte value; always 128 slots.
    children: Vec<Option<NodeRc>>,
    /// Whether a word ends at this node.
    is_terminal: bool,
    /// Number of words reachable from this node (including one ending here);
    /// only valid after `calc_numbers`.
    number: u16,
}

/// Row of the generated `SECOND_LAYER` table.
struct SecondLayerEntry {
    /// Index assigned to this node's children, or 0 when none was assigned.
    child_index: u16,
    /// Running tally of the `number`s of the preceding siblings.
    number: u8,
    /// Count of direct children of this node.
    children_len: u8,
    /// Whether a word ends at this node.
    end_of_word: bool,
}

/// Row of the generated `DAFSA_NODES` table.
struct NodeData {
    /// Byte labelling the edge that leads to this node.
    character: u8,
    /// Running tally of the `number`s of the preceding siblings.
    number: u8,
    /// Whether a word ends at this node.
    end_of_word: bool,
    /// Index assigned to this node's children, or 0 when none was assigned.
    child_index: u16,
    /// Count of direct children of this node.
    children_len: u8,
}

impl Node {
    /// Create a fresh non-terminal node with 128 empty child slots.
    fn new_rc() -> NodeRc {
        Rc::new(RefCell::new(Node {
            children: vec![None; 128],
            is_terminal: false,
            number: 0,
        }))
    }

    /// Recursively set `number` on this node and everything below it.
    /// Shared nodes are simply recomputed each time they are reached.
    fn calc_numbers(&mut self) {
        let mut total: u16 = u16::from(self.is_terminal);
        for child in self.children.iter().flatten() {
            let mut child_ref = child.borrow_mut();
            child_ref.calc_numbers();
            total += child_ref.number;
        }
        self.number = total;
    }

    /// Count the occupied child slots.
    fn num_direct_children(&self) -> u8 {
        self.children.iter().flatten().fold(0u8, |count, _| count + 1)
    }

    /// Bit mask with bit `ascii_alphabetic_to_index(c)` set for every child
    /// byte `c`. All children are expected to be ASCII letters.
    fn get_ascii_alphabetic_bit_mask(&self) -> u64 {
        (0..128u8)
            .filter(|&c| self.children[c as usize].is_some())
            .fold(0u64, |mask, c| mask | (1u64 << ascii_alphabetic_to_index(c)))
    }

    /// Hash based on child identities (Rc pointer) and terminal status.
    fn structure_hash(&self) -> u64 {
        self.children
            .iter()
            .enumerate()
            .filter_map(|(slot, child)| child.as_ref().map(|child| (slot, child)))
            .fold(u64::from(self.is_terminal), |hash, (slot, child)| {
                hash.wrapping_mul(31)
                    .wrapping_add(slot as u64)
                    .wrapping_mul(31)
                    .wrapping_add(Rc::as_ptr(child) as u64)
            })
    }

    /// Check structural equality via Rc pointer identity: same terminal flag
    /// and, slot by slot, either both empty or both pointing at the same node.
    fn structure_eq(&self, other: &Node) -> bool {
        self.is_terminal == other.is_terminal
            && self
                .children
                .iter()
                .zip(&other.children)
                .all(|slots| match slots {
                    (None, None) => true,
                    (Some(mine), Some(theirs)) => Rc::ptr_eq(mine, theirs),
                    _ => false,
                })
    }
}

/// Map `A..=Z` to `0..=25` and `a..=z` to `26..=51`.
/// The input must be an ASCII letter; this is not checked here.
fn ascii_alphabetic_to_index(c: u8) -> u8 {
    match c {
        ..=b'Z' => c - b'A',
        _ => c - b'a' + 26,
    }
}
|
|
|
|
struct UncheckedNode {
|
|
parent: NodeRc,
|
|
character: u8,
|
|
}
|
|
|
|
struct DafsaBuilder {
|
|
root: NodeRc,
|
|
minimized_nodes: HashMap<u64, Vec<NodeRc>>,
|
|
unchecked_nodes: Vec<UncheckedNode>,
|
|
previous_word: String,
|
|
}
|
|
|
|
impl DafsaBuilder {
|
|
fn new() -> Self {
|
|
DafsaBuilder {
|
|
root: Node::new_rc(),
|
|
minimized_nodes: HashMap::new(),
|
|
unchecked_nodes: Vec::new(),
|
|
previous_word: String::new(),
|
|
}
|
|
}
|
|
|
|
fn insert(&mut self, word: &str) {
|
|
assert!(
|
|
word > self.previous_word.as_str(),
|
|
"Words must be inserted in sorted order: '{word}' <= '{}'",
|
|
self.previous_word
|
|
);
|
|
|
|
let common_prefix_len = word
|
|
.bytes()
|
|
.zip(self.previous_word.bytes())
|
|
.take_while(|(a, b)| a == b)
|
|
.count();
|
|
|
|
self.minimize(common_prefix_len);
|
|
|
|
let node: NodeRc = if self.unchecked_nodes.is_empty() {
|
|
Rc::clone(&self.root)
|
|
} else {
|
|
let last = &self.unchecked_nodes[self.unchecked_nodes.len() - 1];
|
|
let parent = last.parent.borrow();
|
|
Rc::clone(parent.children[last.character as usize].as_ref().unwrap())
|
|
};
|
|
|
|
let remaining = &word[common_prefix_len..];
|
|
let mut current = node;
|
|
for c in remaining.bytes() {
|
|
let new_child = Node::new_rc();
|
|
{
|
|
let mut current_ref = current.borrow_mut();
|
|
assert!(current_ref.children[c as usize].is_none());
|
|
current_ref.children[c as usize] = Some(Rc::clone(&new_child));
|
|
}
|
|
self.unchecked_nodes.push(UncheckedNode {
|
|
parent: Rc::clone(¤t),
|
|
character: c,
|
|
});
|
|
current = new_child;
|
|
}
|
|
current.borrow_mut().is_terminal = true;
|
|
|
|
self.previous_word = word.to_string();
|
|
}
|
|
|
|
fn minimize(&mut self, down_to: usize) {
|
|
while self.unchecked_nodes.len() > down_to {
|
|
let unchecked = self.unchecked_nodes.pop().unwrap();
|
|
let parent = &unchecked.parent;
|
|
let child = {
|
|
let parent_ref = parent.borrow();
|
|
Rc::clone(parent_ref.children[unchecked.character as usize].as_ref().unwrap())
|
|
};
|
|
|
|
let hash = child.borrow().structure_hash();
|
|
let mut found_replacement: Option<NodeRc> = Option::None;
|
|
|
|
if let Some(bucket) = self.minimized_nodes.get(&hash) {
|
|
for existing in bucket {
|
|
if child.borrow().structure_eq(&existing.borrow()) {
|
|
found_replacement = Some(Rc::clone(existing));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(replacement) = found_replacement {
|
|
parent.borrow_mut().children[unchecked.character as usize] = Some(replacement);
|
|
} else {
|
|
self.minimized_nodes.entry(hash).or_default().push(Rc::clone(&child));
|
|
}
|
|
}
|
|
}
|
|
|
|
fn calc_numbers(&mut self) {
|
|
self.root.borrow_mut().calc_numbers();
|
|
}
|
|
|
|
fn get_unique_index(&self, word: &str) -> Option<usize> {
|
|
let mut index: usize = 0;
|
|
let mut current = Rc::clone(&self.root);
|
|
|
|
for c in word.bytes() {
|
|
let next = {
|
|
let node = current.borrow();
|
|
let child = node.children[c as usize].as_ref()?;
|
|
for sibling_c in 0u8..128 {
|
|
if let Some(sibling) = &node.children[sibling_c as usize]
|
|
&& sibling_c < c
|
|
{
|
|
index += sibling.borrow().number as usize;
|
|
}
|
|
}
|
|
Rc::clone(child)
|
|
};
|
|
if next.borrow().is_terminal {
|
|
index += 1;
|
|
}
|
|
current = next;
|
|
}
|
|
|
|
Some(index)
|
|
}
|
|
}
|
|
|
|
fn queue_children(
|
|
node: &NodeRc,
|
|
queue: &mut Vec<NodeRc>,
|
|
child_indexes: &mut HashMap<*const RefCell<Node>, u16>,
|
|
first_available_index: u16,
|
|
) -> u16 {
|
|
let mut current = first_available_index;
|
|
let node_ref = node.borrow();
|
|
for c in 0..128u8 {
|
|
if let Some(child) = &node_ref.children[c as usize] {
|
|
let key = Rc::as_ptr(child);
|
|
if let std::collections::hash_map::Entry::Vacant(entry) = child_indexes.entry(key) {
|
|
let num_children = child.borrow().num_direct_children();
|
|
if num_children > 0 {
|
|
entry.insert(current);
|
|
current += num_children as u16;
|
|
}
|
|
queue.push(Rc::clone(child));
|
|
}
|
|
}
|
|
}
|
|
current
|
|
}
|
|
|
|
fn write_children_data(
|
|
node: &NodeRc,
|
|
node_data: &mut Vec<NodeData>,
|
|
queue: &mut Vec<NodeRc>,
|
|
child_indexes: &mut HashMap<*const RefCell<Node>, u16>,
|
|
first_available_index: u16,
|
|
) -> u16 {
|
|
let mut current = first_available_index;
|
|
let mut unique_index_tally: u8 = 0;
|
|
let node_ref = node.borrow();
|
|
for c in 0..128u8 {
|
|
if let Some(child) = &node_ref.children[c as usize] {
|
|
let key = Rc::as_ptr(child);
|
|
let child_ref = child.borrow();
|
|
let num_children = child_ref.num_direct_children();
|
|
|
|
if let std::collections::hash_map::Entry::Vacant(entry) = child_indexes.entry(key) {
|
|
if num_children > 0 {
|
|
entry.insert(current);
|
|
current += num_children as u16;
|
|
}
|
|
queue.push(Rc::clone(child));
|
|
}
|
|
|
|
node_data.push(NodeData {
|
|
character: c,
|
|
number: unique_index_tally,
|
|
end_of_word: child_ref.is_terminal,
|
|
child_index: child_indexes.get(&key).copied().unwrap_or(0),
|
|
children_len: num_children,
|
|
});
|
|
|
|
unique_index_tally = unique_index_tally.wrapping_add(child_ref.number as u8);
|
|
}
|
|
}
|
|
current
|
|
}
|