dictgen-0.5.3/.cargo_vcs_info.json0000644000000001541046102023000125000ustar { "git": { "sha1": "b859c0df7f391deba73030f79b957e62b4d81dc6" }, "path_in_vcs": "crates/dictgen" }dictgen-0.5.3/Cargo.lock0000644000000100301046102023000104450ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 4 [[package]] name = "aho-corasick" version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] [[package]] name = "dictgen" version = "0.5.3" dependencies = [ "aho-corasick", "phf", "phf_codegen", "phf_shared", "unicase", ] [[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "memchr" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "phf" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ "phf_macros", "phf_shared", "serde", ] [[package]] name = "phf_codegen" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" dependencies = [ "phf_generator", "phf_shared", ] [[package]] name = "phf_generator" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ "fastrand", "phf_shared", ] [[package]] name = "phf_macros" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" dependencies = [ "phf_generator", "phf_shared", "proc-macro2", "quote", "syn", "unicase", ] [[package]] name = "phf_shared" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" dependencies = [ "siphasher", "unicase", ] [[package]] name = "proc-macro2" version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] [[package]] name = "serde" version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ "serde_core", ] [[package]] name = "serde_core" version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "siphasher" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "syn" version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] 
[[package]] name = "unicase" version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" dictgen-0.5.3/Cargo.toml0000644000000070271046102023000105040ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2024" rust-version = "1.91" name = "dictgen" version = "0.5.3" build = false include = [ "build.rs", "src/**/*", "Cargo.toml", "Cargo.lock", "LICENSE*", "README.md", "examples/**/*", ] autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "Compile-time case-insensitive map" readme = false keywords = [ "development", "spelling", "no_std", ] categories = [ "development-tools", "text-processing", ] license = "MIT OR Apache-2.0" repository = "https://github.com/crate-ci/typos" [package.metadata.docs.rs] all-features = true rustdoc-args = ["--generate-link-to-definition"] [features] aho-corasick = ["dep:aho-corasick"] codegen = [ "std", "dep:phf_codegen", ] default = ["std"] map = [ "dep:phf", "dep:phf_shared", ] std = [] [lib] name = "dictgen" path = "src/lib.rs" [dependencies.aho-corasick] version = "1.1.4" optional = true [dependencies.phf] version = "0.13" features = ["unicase"] optional = true [dependencies.phf_codegen] version = "0.13" optional = true [dependencies.phf_shared] version = 
"0.13" optional = true [dependencies.unicase] version = "2.9.0" [lints.clippy] bool_assert_comparison = "allow" branches_sharing_code = "allow" checked_conversions = "warn" collapsible_else_if = "allow" collapsible_if = "allow" create_dir = "warn" dbg_macro = "warn" debug_assert_with_mut_call = "warn" doc_markdown = "warn" empty_enums = "warn" enum_glob_use = "warn" expl_impl_clone_on_copy = "warn" explicit_deref_methods = "warn" explicit_into_iter_loop = "warn" fallible_impl_from = "warn" filter_map_next = "warn" flat_map_option = "warn" float_cmp_const = "warn" fn_params_excessive_bools = "warn" from_iter_instead_of_collect = "warn" if_same_then_else = "allow" implicit_clone = "warn" imprecise_flops = "warn" inconsistent_struct_constructor = "warn" inefficient_to_string = "warn" infinite_loop = "warn" invalid_upcast_comparisons = "warn" large_digit_groups = "warn" large_stack_arrays = "warn" large_types_passed_by_value = "warn" let_and_return = "allow" linkedlist = "warn" lossy_float_literal = "warn" macro_use_imports = "warn" mem_forget = "warn" mutex_integer = "warn" needless_continue = "allow" needless_for_each = "warn" negative_feature_names = "warn" path_buf_push_overwrite = "warn" ptr_as_ptr = "warn" rc_mutex = "warn" redundant_feature_names = "warn" ref_option_ref = "warn" rest_pat_in_fully_bound_structs = "warn" result_large_err = "allow" same_functions_in_if_condition = "warn" self_named_module_files = "warn" semicolon_if_nothing_returned = "warn" str_to_string = "warn" string_add = "warn" string_add_assign = "warn" string_lit_as_bytes = "warn" todo = "warn" trait_duplication_in_bounds = "warn" uninlined_format_args = "warn" verbose_file_reads = "warn" wildcard_imports = "warn" zero_sized_map_values = "warn" [lints.rust] unnameable_types = "warn" unreachable_pub = "warn" unsafe_op_in_unsafe_fn = "warn" unused_lifetimes = "warn" unused_macro_rules = "warn" unused_qualifications = "warn" [lints.rust.rust_2018_idioms] level = "warn" priority = -1 
dictgen-0.5.3/Cargo.toml.orig000064400000000000000000000015351046102023000141410ustar 00000000000000[package] name = "dictgen" version = "0.5.3" description = "Compile-time case-insensitive map" categories = ["development-tools", "text-processing"] keywords = ["development", "spelling", "no_std"] repository.workspace = true license.workspace = true edition.workspace = true rust-version.workspace = true include.workspace = true [package.metadata.docs.rs] all-features = true rustdoc-args = ["--generate-link-to-definition"] [features] default = ["std"] std = [] codegen = ["std", "dep:phf_codegen"] map = ["dep:phf", "dep:phf_shared"] aho-corasick = ["dep:aho-corasick"] [dependencies] unicase = "2.9.0" phf = { version = "0.13", features = ["unicase"], optional = true } phf_shared = { version = "0.13", optional = true } phf_codegen = { version = "0.13", optional = true } aho-corasick = { version = "1.1.4", optional = true } [lints] workspace = true dictgen-0.5.3/src/aho_corasick.rs000064400000000000000000000102161046102023000150300ustar 00000000000000pub use ::aho_corasick::Anchored; pub use ::aho_corasick::Input; pub use ::aho_corasick::MatchKind; pub use ::aho_corasick::StartKind; pub use ::aho_corasick::automaton::Automaton; pub use ::aho_corasick::dfa::Builder; pub use ::aho_corasick::dfa::DFA; #[cfg(feature = "codegen")] pub struct AhoCorasickGen<'g> { pub(crate) r#gen: crate::DictGen<'g>, } #[cfg(feature = "codegen")] impl AhoCorasickGen<'_> { pub fn write( &self, file: &mut W, data: impl Iterator, V)>, ) -> Result<(), std::io::Error> { let mut data: Vec<_> = data.collect(); data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned())); let name = self.r#gen.name; let value_type = self.r#gen.value_type; writeln!(file, "pub struct {name} {{")?; writeln!(file, " dfa: dictgen::aho_corasick::DFA,")?; writeln!( file, " unicode: &'static dictgen::OrderedMap, {value_type}>," )?; writeln!(file, "}}")?; writeln!(file)?; writeln!(file, "impl {name} {{")?; 
// ---------------------------------------------------------------------------
// Remainder of `AhoCorasickGen::write`.
//
// At this point the generated struct definition and the opening line of its
// `impl` block have been written. What follows emits the two generated methods:
//
// * `new()`  - builds the DFA from the ASCII keys and wires up the fallback
//              ordered map that holds the non-ASCII keys.
// * `find()` - looks a word up, via the DFA for ASCII words and via the
//              ordered map otherwise.
// ---------------------------------------------------------------------------

// --- generated `new()` ------------------------------------------------------
writeln!(file, " pub fn new() -> Self {{")?;
writeln!(
    file,
    " static NEEDLES: &'static [&'static [u8]] = &["
)?;
// Only ASCII keys go into the automaton. `PATTERNID_MAP` further down is
// emitted with the very same filter over the very same `data`, so the n-th
// needle and the n-th value belong to the same entry.
for (key, _value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
    let key = key.as_ref();
    writeln!(file, " b{key:?},")?;
}
writeln!(file, " ];")?;
writeln!(
    file,
    " let dfa = dictgen::aho_corasick::Builder::new()"
)?;
writeln!(
    file,
    " .match_kind(dictgen::aho_corasick::MatchKind::LeftmostLongest)"
)?;
writeln!(
    file,
    " .start_kind(dictgen::aho_corasick::StartKind::Anchored)"
)?;
writeln!(file, " .ascii_case_insensitive(true)")?;
writeln!(file, " .build(NEEDLES)")?;
writeln!(file, " .unwrap();")?;
// Non-ASCII keys cannot be handled by the ASCII-case-insensitive DFA, so they
// are emitted as a separate ordered map named `UNICODE_TABLE`.
crate::DictGen::new()
    .name("UNICODE_TABLE")
    .value_type(value_type)
    .ordered_map()
    .write(
        file,
        data.iter()
            .filter(|(k, _)| !k.as_ref().is_ascii())
            .map(|(k, v)| (k.as_ref(), v)),
    )?;
writeln!(file)?;
writeln!(file, " Self {{")?;
writeln!(file, " dfa,")?;
writeln!(file, " unicode: &UNICODE_TABLE,")?;
writeln!(file, " }}")?;
writeln!(file, " }}")?;
writeln!(file)?;

// --- generated `find()` -----------------------------------------------------
writeln!(
    file,
    " pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static {value_type}> {{"
)?;
writeln!(
    file,
    " static PATTERNID_MAP: &'static [{value_type}] = &["
)?;
// Values in needle order (same filter as `NEEDLES` above); indexed by the
// pattern id of a match.
for (_key, value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
    writeln!(file, " {value},")?;
}
writeln!(file, " ];")?;
writeln!(file, " if word.is_ascii() {{")?;
writeln!(
    file,
    " use dictgen::aho_corasick::Automaton as _;"
)?;
writeln!(
    file,
    " let input = dictgen::aho_corasick::Input::new(word.into_inner().as_bytes()).anchored(dictgen::aho_corasick::Anchored::Yes);"
)?;
writeln!(
    file,
    " let mat = self.dfa.try_find(&input).unwrap()?;"
)?;
// FIX: the search is anchored at the start of `word`, so a match is a
// whole-word hit only when it also ends at the end of `word`. The emitted
// guard must therefore reject matches whose end differs from the word length
// (`!=`). The previous `==` rejected exact hits and accepted words that only
// start with a dictionary entry.
writeln!(
    file,
    " if mat.end() != word.into_inner().len() {{"
)?;
writeln!(file, " return None;")?;
writeln!(file, " }}")?;
writeln!(file, " Some(&PATTERNID_MAP[mat.pattern()])")?;
writeln!(file, " }} else {{")?;
writeln!(file, " self.unicode.find(word)")?;
writeln!(file, " }}")?;
writeln!(file, " }}")?;
writeln!(file, "}}")?;

Ok(())
} // end of `AhoCorasickGen::write`
} // end of `impl AhoCorasickGen<'_>`
dictgen-0.5.3/src/gen.rs000064400000000000000000000027611046102023000131620ustar 00000000000000#[cfg(feature = "codegen")] pub struct DictGen<'g> { pub(crate) name: &'g str, pub(crate) value_type: &'g str, } impl DictGen<'static> { pub fn new() -> Self { Self { name: "DICT", value_type: "&'static str", } } } impl<'g> DictGen<'g> { pub fn name<'n>(self, name: &'n str) -> DictGen<'n> where 'g: 'n, { DictGen { name, value_type: self.value_type, } } pub fn value_type<'t>(self, value_type: &'t str) -> DictGen<'t> where 'g: 't, { DictGen { name: self.name, value_type, } } #[cfg(feature = "map")] pub fn map(self) -> crate::MapGen<'g> { crate::MapGen { r#gen: self, unicode: true, unicase: true, } } pub fn ordered_map(self) -> crate::OrderedMapGen<'g> { crate::OrderedMapGen { r#gen: self, unicode: true, unicase: true, } } pub fn trie(self) -> crate::TrieGen<'g> { crate::TrieGen { r#gen: self, limit: 64, } } pub fn r#match(self) -> crate::MatchGen<'g> { crate::MatchGen { r#gen: self } } #[cfg(feature = "aho-corasick")] pub fn aho_corasick(self) -> crate::AhoCorasickGen<'g> { crate::AhoCorasickGen { r#gen: self } } } impl Default for DictGen<'static> { fn default() -> Self { Self::new() } } dictgen-0.5.3/src/insensitive.rs000064400000000000000000000126541046102023000147530ustar 00000000000000/// `UniCase` look-alike that avoids const-fn so large tables don't OOM #[derive(Copy, Clone)] pub enum InsensitiveStr<'s> { Unicode(&'s str), Ascii(&'s str), } impl<'s> InsensitiveStr<'s> { pub fn convert(self) -> unicase::UniCase<&'s str> { match self { InsensitiveStr::Unicode(s) => unicase::UniCase::unicode(s), InsensitiveStr::Ascii(s) => unicase::UniCase::ascii(s), } } pub fn into_inner(self) -> &'s str { match self { InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s, } } pub fn is_empty(self) -> bool { match self { InsensitiveStr::Unicode(s) | InsensitiveStr::Ascii(s) => s.is_empty(), } } pub fn len(self) -> usize { match self { InsensitiveStr::Unicode(s) | 
InsensitiveStr::Ascii(s) => s.len(), } } } impl<'s> From> for InsensitiveStr<'s> { fn from(other: unicase::UniCase<&'s str>) -> Self { if other.is_ascii() { InsensitiveStr::Ascii(other.into_inner()) } else { InsensitiveStr::Unicode(other.into_inner()) } } } impl<'s2> PartialEq> for InsensitiveStr<'_> { #[inline] fn eq(&self, other: &InsensitiveStr<'s2>) -> bool { self.convert() == other.convert() } } impl Eq for InsensitiveStr<'_> {} impl PartialOrd for InsensitiveStr<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for InsensitiveStr<'_> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.convert().cmp(&other.convert()) } } impl core::hash::Hash for InsensitiveStr<'_> { #[inline] fn hash(&self, hasher: &mut H) { self.convert().hash(hasher); } } impl core::fmt::Debug for InsensitiveStr<'_> { #[inline] fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { core::fmt::Debug::fmt(self.into_inner(), fmt) } } impl core::fmt::Display for InsensitiveStr<'_> { #[inline] fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { core::fmt::Display::fmt(self.into_inner(), fmt) } } #[cfg(feature = "map")] impl phf_shared::PhfHash for InsensitiveStr<'_> { #[inline] fn phf_hash(&self, state: &mut H) { core::hash::Hash::hash(self, state); } } #[cfg(feature = "map")] impl phf_shared::FmtConst for InsensitiveStr<'_> { fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { InsensitiveStr::Ascii(_) => f.write_str("dictgen::InsensitiveStr::Ascii(")?, InsensitiveStr::Unicode(_) => { f.write_str("dictgen::InsensitiveStr::Unicode(")?; } } self.into_inner().fmt_const(f)?; f.write_str(")") } } #[cfg(feature = "map")] impl<'b, 'a: 'b> phf_shared::PhfBorrow> for InsensitiveStr<'a> { fn borrow(&self) -> &InsensitiveStr<'b> { self } } /// `UniCase` look-alike that avoids const-fn so large tables don't OOM #[derive(Copy, Clone)] pub struct InsensitiveAscii<'s>(pub &'s str); 
impl<'s> InsensitiveAscii<'s> { pub fn convert(self) -> unicase::Ascii<&'s str> { unicase::Ascii::new(self.0) } pub fn into_inner(self) -> &'s str { self.0 } pub fn is_empty(self) -> bool { self.0.is_empty() } pub fn len(self) -> usize { self.0.len() } } impl<'s> From> for InsensitiveAscii<'s> { fn from(other: unicase::Ascii<&'s str>) -> Self { Self(other.into_inner()) } } impl<'s2> PartialEq> for InsensitiveAscii<'_> { #[inline] fn eq(&self, other: &InsensitiveAscii<'s2>) -> bool { self.convert() == other.convert() } } impl Eq for InsensitiveAscii<'_> {} impl PartialOrd for InsensitiveAscii<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for InsensitiveAscii<'_> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.convert().cmp(&other.convert()) } } impl core::hash::Hash for InsensitiveAscii<'_> { #[inline] fn hash(&self, hasher: &mut H) { self.convert().hash(hasher); } } impl core::fmt::Debug for InsensitiveAscii<'_> { #[inline] fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { core::fmt::Debug::fmt(self.into_inner(), fmt) } } impl core::fmt::Display for InsensitiveAscii<'_> { #[inline] fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { core::fmt::Display::fmt(self.into_inner(), fmt) } } #[cfg(feature = "map")] impl phf_shared::PhfHash for InsensitiveAscii<'_> { #[inline] fn phf_hash(&self, state: &mut H) { core::hash::Hash::hash(self, state); } } #[cfg(feature = "map")] impl phf_shared::FmtConst for InsensitiveAscii<'_> { fn fmt_const(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.write_str("dictgen::InsensitiveAscii(")?; self.into_inner().fmt_const(f)?; f.write_str(")") } } #[cfg(feature = "map")] impl<'b, 'a: 'b> phf_shared::PhfBorrow> for InsensitiveAscii<'a> { fn borrow(&self) -> &InsensitiveAscii<'b> { self } } dictgen-0.5.3/src/lib.rs000064400000000000000000000011451046102023000131520ustar 00000000000000#![cfg_attr(docsrs, feature(doc_cfg))] 
#![warn(clippy::print_stderr)] #![warn(clippy::print_stdout)] #[cfg(feature = "aho-corasick")] pub mod aho_corasick; #[cfg(feature = "codegen")] mod r#gen; mod insensitive; #[cfg(feature = "map")] mod map; #[cfg(feature = "codegen")] mod r#match; mod ordered_map; mod trie; #[cfg(feature = "aho-corasick")] #[cfg(feature = "codegen")] pub use aho_corasick::AhoCorasickGen; #[cfg(feature = "codegen")] pub use r#gen::*; pub use insensitive::*; #[cfg(feature = "map")] pub use map::*; #[cfg(feature = "codegen")] pub use r#match::*; pub use ordered_map::*; pub use trie::*; dictgen-0.5.3/src/map.rs000064400000000000000000000107701046102023000131650ustar 00000000000000#[cfg(feature = "codegen")] pub struct MapGen<'g> { pub(crate) r#gen: crate::DictGen<'g>, pub(crate) unicase: bool, pub(crate) unicode: bool, } #[cfg(feature = "codegen")] impl MapGen<'_> { pub fn unicase(mut self, yes: bool) -> Self { self.unicase = yes; self } pub fn unicode(mut self, yes: bool) -> Self { self.unicode = yes; self } pub fn write( &self, file: &mut W, data: impl Iterator, V)>, ) -> Result<(), std::io::Error> { let mut data: Vec<_> = data.collect(); data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned())); let name = self.r#gen.name; let key_type = self.key_type(); let value_type = self.r#gen.value_type; let mut smallest = usize::MAX; let mut largest = usize::MIN; for (key, _) in data.iter() { let key = key.as_ref(); smallest = std::cmp::min(smallest, key.len()); largest = std::cmp::max(largest, key.len()); } if largest == 0 { smallest = 0; } writeln!( file, "pub static {name}: dictgen::Map<{key_type}, {value_type}> = dictgen::Map {{" )?; match (self.unicase, self.unicode) { (true, true) => { let mut builder = phf_codegen::Map::new(); let data = data .iter() .map(|(key, value)| { let key = key.as_ref(); ( if key.is_ascii() { crate::InsensitiveStr::Ascii(key) } else { crate::InsensitiveStr::Unicode(key) }, value.to_string(), ) }) .collect::>(); for (key, value) in data.iter() 
{ builder.entry(key, value.as_str()); } let builder = builder.build(); writeln!(file, " map: {builder},")?; } (true, false) => { let mut builder = phf_codegen::Map::new(); let data = data .iter() .map(|(key, value)| (crate::InsensitiveAscii(key.as_ref()), value.to_string())) .collect::>(); for (key, value) in data.iter() { builder.entry(key, value.as_str()); } let builder = builder.build(); writeln!(file, " map: {builder},")?; } (false, _) => { let mut builder = phf_codegen::Map::new(); let data = data .iter() .map(|(key, value)| (key, value.to_string())) .collect::>(); for (key, value) in data.iter() { builder.entry(key.as_ref(), value.as_str()); } let builder = builder.build(); writeln!(file, " map: {builder},")?; } } writeln!(file, " range: {smallest}..={largest},")?; writeln!(file, "}};")?; Ok(()) } fn key_type(&self) -> &'static str { match (self.unicase, self.unicode) { (true, true) => "dictgen::InsensitiveStr<'static>", (true, false) => "dictgen::InsensitiveAscii<'static>", (false, _) => "&'static str", } } } pub struct Map { pub map: phf::Map, pub range: std::ops::RangeInclusive, } impl Map, V> { #[inline] pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&V> { if self.range.contains(&word.len()) { self.map.get(&(*word).into()) } else { None } } } impl Map, V> { #[inline] pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&V> { if self.range.contains(&word.len()) { self.map.get(&(*word).into()) } else { None } } } impl Map<&str, V> { #[inline] pub fn find(&self, word: &'_ &str) -> Option<&V> { if self.range.contains(&word.len()) { self.map.get(word) } else { None } } } dictgen-0.5.3/src/match.rs000064400000000000000000000022401046102023000134750ustar 00000000000000#[cfg(feature = "codegen")] pub struct MatchGen<'g> { pub(crate) r#gen: crate::DictGen<'g>, } #[cfg(feature = "codegen")] impl MatchGen<'_> { pub fn write( &self, file: &mut W, data: impl Iterator, V)>, ) -> Result<(), std::io::Error> { let mut data: Vec<_> = data.collect(); 
data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned())); let name = self.r#gen.name; let value_type = self.r#gen.value_type; writeln!(file, "pub struct {name};")?; writeln!(file, "impl {name} {{")?; writeln!( file, " pub fn find(&self, word: &&str) -> Option<&'static {value_type}> {{" )?; writeln!(file, " match *word {{")?; for (key, value) in data.iter() { let key = key.as_ref(); writeln!(file, " {key:?} => Some(&{value}.as_slice()),")?; } writeln!(file, " _ => None,")?; writeln!(file, " }}")?; writeln!(file, " }}")?; writeln!(file, "}}")?; Ok(()) } } dictgen-0.5.3/src/ordered_map.rs000064400000000000000000000073571046102023000147000ustar 00000000000000#[cfg(feature = "codegen")] pub struct OrderedMapGen<'g> { pub(crate) r#gen: crate::DictGen<'g>, pub(crate) unicase: bool, pub(crate) unicode: bool, } #[cfg(feature = "codegen")] impl OrderedMapGen<'_> { pub fn unicase(mut self, yes: bool) -> Self { self.unicase = yes; self } pub fn unicode(mut self, yes: bool) -> Self { self.unicode = yes; self } pub fn write( &self, file: &mut W, data: impl Iterator, V)>, ) -> Result<(), std::io::Error> { let mut data: Vec<_> = data.collect(); data.sort_unstable_by_key(|v| unicase::UniCase::new(v.0.as_ref().to_owned())); let name = self.r#gen.name; let key_type = self.key_type(); let value_type = self.r#gen.value_type; let mut smallest = usize::MAX; let mut largest = usize::MIN; writeln!( file, "pub static {name}: dictgen::OrderedMap<{key_type}, {value_type}> = dictgen::OrderedMap {{" )?; writeln!(file, " keys: &[")?; for (key, _value) in data.iter() { let key = key.as_ref(); smallest = std::cmp::min(smallest, key.len()); largest = std::cmp::max(largest, key.len()); let key = self.key_new(key); writeln!(file, " {key},")?; } if largest == 0 { smallest = 0; } writeln!(file, " ],")?; writeln!(file, " values: &[")?; for (_key, value) in data.iter() { writeln!(file, " {value},")?; } writeln!(file, " ],")?; writeln!(file, " range: {smallest}..={largest},")?; 
writeln!(file, "}};")?; Ok(()) } fn key_type(&self) -> &'static str { match (self.unicase, self.unicode) { (true, true) => "dictgen::InsensitiveStr<'static>", (true, false) => "dictgen::InsensitiveAscii<'static>", (false, _) => "&'static str", } } fn key_new(&self, key: &str) -> String { match (self.unicase, self.unicode) { (true, true) => { if key.is_ascii() { format!("dictgen::InsensitiveStr::Ascii({key:?})") } else { format!("dictgen::InsensitiveStr::Unicode({key:?})") } } (true, false) => format!("dictgen::InsensitiveAscii({key:?})"), (false, _) => format!("{key:?}"), } } } pub struct OrderedMap { pub keys: &'static [K], pub values: &'static [V], pub range: core::ops::RangeInclusive, } impl OrderedMap, V> { #[inline] pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> { if self.range.contains(&word.len()) { self.keys .binary_search_by_key(word, |key| key.convert()) .map(|i| &self.values[i]) .ok() } else { None } } } impl OrderedMap, V> { #[inline] pub fn find(&self, word: &'_ unicase::Ascii<&str>) -> Option<&'static V> { if self.range.contains(&word.len()) { self.keys .binary_search_by_key(word, |key| key.convert()) .map(|i| &self.values[i]) .ok() } else { None } } } impl OrderedMap<&str, V> { #[inline] pub fn find(&self, word: &'_ &str) -> Option<&'static V> { if self.range.contains(&word.len()) { self.keys.binary_search(word).map(|i| &self.values[i]).ok() } else { None } } } dictgen-0.5.3/src/trie.rs000064400000000000000000000253051046102023000133530ustar 00000000000000#[cfg(feature = "codegen")] pub struct TrieGen<'g> { pub(crate) r#gen: crate::DictGen<'g>, pub(crate) limit: usize, } #[cfg(feature = "codegen")] impl TrieGen<'_> { pub fn limit(mut self, limit: usize) -> Self { self.limit = limit; self } /// # Panics /// /// - On duplicate entry pub fn write<'d, W: std::io::Write, V: std::fmt::Display>( &self, file: &mut W, data: impl Iterator, ) -> Result<(), std::io::Error> { let name = self.r#gen.name; let value_type = 
self.r#gen.value_type; codegen::generate_trie(file, name, value_type, data, self.limit) } } pub struct Trie { pub root: &'static TrieNode, pub unicode: &'static crate::OrderedMap, V>, pub range: core::ops::RangeInclusive, } impl Trie { #[inline] pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static V> { if word .into_inner() .as_bytes() .iter() .all(|b| b.is_ascii_alphabetic()) { if self.range.contains(&word.len()) { self.find_ascii(word.as_bytes()) } else { None } } else { self.unicode.find(word) } } fn find_ascii(&self, word: &[u8]) -> Option<&'static V> { let mut child = &self.root; for i in 0..word.len() { match child.children { TrieChild::Nested(n) => { let byte = word[i]; let index = if byte.is_ascii_lowercase() { byte - b'a' } else if byte.is_ascii_uppercase() { byte - b'A' } else { return None; }; debug_assert!(index < 26); if let Some(next) = n[index as usize].as_ref() { child = next; } else { return None; } } TrieChild::Flat(t) => { let remaining = &word[i..word.len()]; // Unsafe: Everything before has been proven to be ASCII, so this should be // safe. 
let remaining = unsafe { core::str::from_utf8_unchecked(remaining) }; let remaining = unicase::Ascii::new(remaining); return t.find(&remaining); } } } child.value.as_ref() } } pub struct TrieNode { pub children: TrieChild, pub value: Option, } pub enum TrieChild { Nested(&'static [Option<&'static TrieNode>; 26]), Flat(&'static crate::OrderedMap, V>), } #[cfg(feature = "codegen")] mod codegen { pub(super) fn generate_trie<'d, W: std::io::Write, V: std::fmt::Display>( file: &mut W, name: &str, value_type: &str, data: impl Iterator, limit: usize, ) -> Result<(), std::io::Error> { let mut root = DynRoot::new(data); root.burst(limit); let unicode_table_name = format!("{name}_UNICODE_TABLE"); writeln!( file, "pub static {name}: dictgen::Trie<{value_type}> = dictgen::Trie {{" )?; writeln!(file, " root: &{},", gen_node_name(name, ""))?; writeln!(file, " unicode: &{},", &unicode_table_name)?; writeln!( file, " range: {}..={},", root.range.start(), root.range.end() )?; writeln!(file, "}};")?; writeln!(file)?; crate::DictGen::new() .name(&unicode_table_name) .value_type(value_type) .ordered_map() .write(file, root.unicode.into_iter())?; writeln!(file)?; let mut nodes = vec![("".to_owned(), &root.root)]; while let Some((start, node)) = nodes.pop() { let node_name = gen_node_name(name, &start); let children_name = gen_children_name(name, &start); writeln!( file, "static {node_name}: dictgen::TrieNode<{value_type}> = dictgen::TrieNode {{" )?; writeln!( file, " children: {}(&{}),", gen_type_name(&node.children), children_name )?; if let Some(value) = node.value.as_ref() { writeln!(file, " value: Some({value}),")?; } else { writeln!(file, " value: None,")?; } writeln!(file, "}};")?; writeln!(file)?; match &node.children { DynChild::Nested(n) => { writeln!( file, "static {children_name}: [Option<&dictgen::TrieNode<{value_type}>>; 26] = [", )?; for b in b'a'..=b'z' { if let Some(child) = n.get(&b) { let c = b as char; let next_start = format!("{start}{c}"); writeln!(file, " 
Some(&{}),", gen_node_name(name, &next_start))?; nodes.push((next_start, child)); } else { writeln!(file, " None,")?; } } writeln!(file, "];")?; } DynChild::Flat(v) => { let table_input = v.iter().map(|(k, v)| { let k = std::str::from_utf8(k).expect("this was originally a `str`"); (k, v) }); crate::DictGen::new() .name(&children_name) .value_type(value_type) .ordered_map() .unicode(false) .write(file, table_input)?; } } writeln!(file)?; writeln!(file)?; } Ok(()) } fn gen_node_name(prefix: &str, start: &str) -> String { if start.is_empty() { format!("{prefix}_NODE") } else { let mut start = start.to_owned(); start.make_ascii_uppercase(); format!("{prefix}_{start}_NODE") } } fn gen_children_name(prefix: &str, start: &str) -> String { if start.is_empty() { format!("{prefix}_CHILDREN") } else { let mut start = start.to_owned(); start.make_ascii_uppercase(); format!("{prefix}_{start}_CHILDREN") } } fn gen_type_name(leaf: &DynChild<'_, V>) -> &'static str { match leaf { DynChild::Nested(_) => "dictgen::TrieChild::Nested", DynChild::Flat(_) => "dictgen::TrieChild::Flat", } } struct DynRoot<'s, V> { root: DynNode<'s, V>, unicode: Vec<(&'s str, V)>, range: std::ops::RangeInclusive, } impl<'s, V> DynRoot<'s, V> { fn new(data: impl Iterator) -> Self { let mut overflow = Vec::new(); let mut unicode = Vec::default(); let mut smallest = usize::MAX; let mut largest = usize::MIN; let mut existing = std::collections::HashSet::new(); let mut empty = None; for (key, value) in data { if existing.contains(key) { panic!("Duplicate present: {key}"); } existing.insert(key); if key.is_empty() { empty = Some(value); } else { smallest = std::cmp::min(smallest, key.len()); largest = std::cmp::max(largest, key.len()); if key.bytes().all(|b| b.is_ascii_alphabetic()) { overflow.push((key.as_bytes(), value)); } else { unicode.push((key, value)); } } } Self { root: DynNode { children: DynChild::Flat(overflow), value: empty, }, unicode, range: smallest..=largest, } } fn burst(&mut self, limit: 
usize) { self.root.burst(limit); } } struct DynNode<'s, V> { children: DynChild<'s, V>, value: Option, } impl DynNode<'_, V> { fn burst(&mut self, limit: usize) { self.children.burst(limit); } } enum DynChild<'s, V> { Nested(std::collections::BTreeMap>), Flat(Vec<(&'s [u8], V)>), } impl DynChild<'_, V> { fn burst(&mut self, limit: usize) { match self { DynChild::Nested(children) => { for child in children.values_mut() { child.burst(limit); } } DynChild::Flat(v) if v.len() < limit => (), DynChild::Flat(v) => { let mut old_v = Vec::new(); std::mem::swap(&mut old_v, v); let mut nodes = std::collections::BTreeMap::new(); for (key, value) in old_v { assert!(!key.is_empty()); let start = key[0].to_ascii_lowercase(); assert!(start.is_ascii_alphabetic()); let node = nodes.entry(start).or_insert_with(|| DynNode { children: DynChild::Flat(Vec::new()), value: None, }); let remaining = &key[1..]; if remaining.is_empty() { assert!(node.value.is_none()); node.value = Some(value); } else { match &mut node.children { DynChild::Nested(_) => { unreachable!("Only overflow at this point") } DynChild::Flat(v) => { v.push((remaining, value)); } } } } *self = DynChild::Nested(nodes); self.burst(limit); } } } } }