diff --git a/generator/src/lib.rs b/generator/src/lib.rs
index 98e601e..2c017f4 100644
--- a/generator/src/lib.rs
+++ b/generator/src/lib.rs
@@ -156,7 +156,7 @@ fn create_lexicon_and_offsets(
             // insert the suffixes of this word which saves about
             // 10KB (we could theoretically insert all substrings,
             // upto a certain length, but this only saves ~300
-            // bytes or so and is noticably slower).
+            // bytes or so and is noticeably slower).
             for i in 1..n.len() {
                 if t.insert(n[i..].bytes(), Some(offset + i), true).0 {
                     // once we've found a string that's already
@@ -307,9 +307,9 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
     // currently huge, but it has a lot of 0's, so we compress it
     // using the binning, below.
     let mut phrasebook_offsets = repeat(0).take(0x10FFFF + 1).collect::<Vec<_>>();
-    let mut longest_name = 0;
+    let mut longest_name = String::new();
     for &(cp, name) in codepoint_names.iter() {
-        longest_name = cmp::max(name.len(), longest_name);
+        longest_name = cmp::max_by_key(normalise_name(name, cp), longest_name, |s| s.len());
 
         let start = phrasebook.len() as u32;
         phrasebook_offsets[cp as usize] = start;
@@ -337,8 +337,8 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
 
     w!(
         ctxt,
-        "pub const MAX_NAME_LENGTH: usize = {};\n",
-        longest_name
+        "pub const LONGEST_NAME: &str = {longest_name:?};\n\
+         pub const LONGEST_NAME_LEN: usize = LONGEST_NAME.len();\n"
     );
     ctxt.write_plain_string("LEXICON", &lexicon_string);
     ctxt.write_debugs("LEXICON_OFFSETS", "u32", &lexicon_offsets);
@@ -407,6 +407,11 @@ pub fn generate_phf(
 ) {
     let (codepoint_names, _) = get_truncated_table_data(unicode_data, truncate);
 
+    let codepoint_names: Vec<_> = codepoint_names
+        .into_iter()
+        .map(|(c, s)| (c, normalise_name(s, c)))
+        .collect();
+
     let mut ctxt = make_context(path);
     let (n, disps, data) = phf::create_phf(&codepoint_names, lambda, tries);
 
@@ -420,6 +425,45 @@ pub fn generate_phf(
     }
 }
 
+/// Convert a Unicode name to a form that can be used for loose matching, as per
+/// [UAX#44](https://www.unicode.org/reports/tr44/tr44-34.html#Matching_Names)
+///
+/// This function matches `unicode_names2::normalise_name` in implementation, thus the result of one
+/// can be used to query a PHF generated from the other.
+///
+/// # Panics
+/// Panics if `s` contains a byte other than an ASCII alphanumeric, ASCII whitespace, `_` or `-`.
+fn normalise_name(s: &str, codepoint: char) -> String {
+    let mut normalised = String::new();
+    let bytes = s.as_bytes();
+    for (i, c) in bytes.iter().map(u8::to_ascii_uppercase).enumerate() {
+        // "Ignore case, whitespace, underscore ('_'), [...]"
+        if c.is_ascii_whitespace() || c == b'_' {
+            continue;
+        }
+
+        // "[...] and all medial hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E."
+        // `checked_sub` keeps a leading hyphen (i == 0) from underflowing the index.
+        if codepoint != '\u{1180}' // HANGUL JUNGSEONG O-E
+            && c == b'-'
+            && i.checked_sub(1).and_then(|j| bytes.get(j)).map_or(false, u8::is_ascii_alphanumeric)
+            && bytes.get(i + 1).map_or(false, u8::is_ascii_alphanumeric)
+        {
+            continue;
+        }
+        assert!(
+            c.is_ascii_alphanumeric() || c == b'-',
+            "U+{:04X} contains an invalid character for a Unicode name: {:?}",
+            codepoint as u32,
+            s
+        );
+
+        normalised.push(c as char);
+    }
+
+    normalised
+}
+
 pub fn generate(unicode_data: &'static str, path: Option<&Path>, truncate: Option<usize>) {
     let (codepoint_names, cjk) = get_truncated_table_data(unicode_data, truncate);
     let mut ctxt = make_context(path);
diff --git a/generator/src/phf.rs b/generator/src/phf.rs
index 7b2b514..6c19128 100644
--- a/generator/src/phf.rs
+++ b/generator/src/phf.rs
@@ -40,14 +40,14 @@ struct Hash {
 
 #[allow(clippy::type_complexity)]
 fn try_phf_table(
-    values: &[(char, &str)],
+    values: &[(char, String)],
     lambda: usize,
     seed: u64,
     rng: &mut StdRng,
 ) -> Option<(Vec<(u32, u32)>, Vec<char>)> {
     let hashes: Vec<_> = values
         .iter()
-        .map(|&(n, s)| (split(hash(s, seed)), n))
+        .map(|(n, s)| (split(hash(s, seed)), *n))
         .collect();
 
     let table_len = hashes.len();
@@ -140,7 +140,7 @@ fn try_phf_table(
 }
 
 pub fn create_phf(
-    data: &[(char, &str)],
+    data: &[(char, String)],
     lambda: usize,
     max_tries: usize,
 ) -> (u64, Vec<(u32, u32)>, Vec<char>) {
diff --git a/src/lib.rs b/src/lib.rs
index ad7dbf5..13d51fb 100755
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -47,6 +47,26 @@
 //! }
 //! ```
 //!
+//! # Loose Matching
+//! For name->char retrieval (the `character` function and macros) this crate uses loose matching,
+//! as defined in Unicode Standard Annex #44[^1].
+//! In general, this means case, whitespace and underscore characters are ignored, as well as
+//! _medial hyphens_, which are hyphens (`-`) that come between two alphanumeric characters[^1].
+//!
+//! Under this scheme, the query `Low_Line` will find `U+005F LOW LINE`, as well as `l o w L-I-N-E`,
+//! `lowline`, and `low\nL-I-N-E`, but not `low- line`.
+//! Similarly, `tibetan letter -a` will find `U+0F60 TIBETAN LETTER -A`, as well as
+//! `tibetanletter - a` and `TIBETAN L_ETTE_R- __a__`, but not `tibetan letter-a` or
+//! `TIBETAN LETTER A`.
+//!
+//! In the implementation of this crate, 'whitespace' is determined by the [`is_ascii_whitespace`]
+//! method on `u8` and `char`. See its documentation for more info.
+//!
+//! [^1]: See [UAX44-LM2] for precise details.
+//!
+//! [UAX44-LM2]: https://www.unicode.org/reports/tr44/tr44-34.html#UAX44-LM2
+//! [`is_ascii_whitespace`]: char::is_ascii_whitespace
+//!
 //! # Cargo-enabled
 //!
 //! This package is on crates.io, so add either (or both!) of the
@@ -68,7 +88,7 @@ extern crate std;
 
 use core::{char, fmt};
 use generated::{
-    MAX_NAME_LENGTH, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
+    LONGEST_NAME_LEN, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
 };
 
 #[allow(dead_code)]
@@ -95,7 +115,9 @@ static ALIASES: phf::Map<&'static [u8], char> =
 mod iter_str;
 
 static HANGUL_SYLLABLE_PREFIX: &str = "HANGUL SYLLABLE ";
+static NORMALISED_HANGUL_SYLLABLE_PREFIX: &str = "HANGULSYLLABLE";
 static CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJK UNIFIED IDEOGRAPH-";
+static NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJKUNIFIEDIDEOGRAPH";
 
 fn is_cjk_unified_ideograph(ch: char) -> bool {
     generated::CJK_IDEOGRAPH_RANGES
@@ -103,11 +125,17 @@ fn is_cjk_unified_ideograph(ch: char) -> bool {
         .any(|&(lo, hi)| lo <= ch && ch <= hi)
 }
 
-/// An iterator over the components of a code point's name, it also
-/// implements `Show`.
+/// An iterator over the components of a code point's name. Notably implements `Display`.
+///
+/// To reconstruct the full Unicode name from this iterator, you can concatenate every string slice
+/// yielded from it. Each such slice is either a word matching `[A-Z0-9]*`, a space `" "`, or a
+/// hyphen `"-"`. (In particular, words can be the empty string `""`).
 ///
-/// The size hint is exact for the number of pieces, but iterates
-/// (although iteration is cheap and all names are short).
+/// The [size hint] returns an exact size, by cloning the iterator and iterating it fully.
+/// Cloning and iteration are cheap, and all names are relatively short, so this should not have a
+/// high impact.
+///
+/// [size hint]: std::iter::Iterator::size_hint
 #[derive(Clone)]
 pub struct Name {
     data: Name_,
@@ -161,6 +189,7 @@ impl Name {
 
 impl Iterator for Name {
     type Item = &'static str;
+
     fn next(&mut self) -> Option<&'static str> {
         match self.data {
             Name_::Plain(ref mut s) => s.next(),
@@ -225,20 +254,16 @@ impl fmt::Display for Name {
 
 /// Find the name of `c`, or `None` if `c` has no name.
 ///
-/// The return value is an iterator that yields `&str` components of
-/// the name successively (including spaces and hyphens). It
-/// implements `Show`, and thus can be used naturally to build
-/// `String`s, or be printed, etc.
+/// The return value is an iterator that yields `&'static str` components of the name successively
+/// (including spaces and hyphens). It implements `Display`, so can be used naturally to build
+/// `String`s or be printed. See also the [type-level docs][Name].
 ///
 /// # Example
 ///
 /// ```rust
-/// assert_eq!(unicode_names2::name('a').map(|n| n.to_string()),
-///            Some("LATIN SMALL LETTER A".to_string()));
-/// assert_eq!(unicode_names2::name('\u{2605}').map(|n| n.to_string()),
-///            Some("BLACK STAR".to_string()));
-/// assert_eq!(unicode_names2::name('☃').map(|n| n.to_string()),
-///            Some("SNOWMAN".to_string()));
+/// assert_eq!(unicode_names2::name('a').unwrap().to_string(), "LATIN SMALL LETTER A");
+/// assert_eq!(unicode_names2::name('\u{2605}').unwrap().to_string(), "BLACK STAR");
+/// assert_eq!(unicode_names2::name('☃').unwrap().to_string(), "SNOWMAN");
 ///
 /// // control code
 /// assert!(unicode_names2::name('\x00').is_none());
@@ -321,32 +346,31 @@ fn character_by_alias(name: &[u8]) -> Option<char> {
 /// Find the character called `name`, or `None` if no such character
 /// exists.
 ///
-/// This searches case-insensitively.
+/// This function uses the [UAX44-LM2] loose matching scheme for lookup. For more information, see
+/// the [crate-level docs][self].
+///
+/// [UAX44-LM2]: https://www.unicode.org/reports/tr44/tr44-34.html#UAX44-LM2
 ///
 /// # Example
 ///
 /// ```rust
 /// assert_eq!(unicode_names2::character("LATIN SMALL LETTER A"), Some('a'));
-/// assert_eq!(unicode_names2::character("latin SMALL letter A"), Some('a'));
-/// assert_eq!(unicode_names2::character("latin small letter a"), Some('a'));
-/// assert_eq!(unicode_names2::character("BLACK STAR"), Some('★'));
+/// assert_eq!(unicode_names2::character("latinsmalllettera"), Some('a'));
+/// assert_eq!(unicode_names2::character("Black_Star"), Some('★'));
 /// assert_eq!(unicode_names2::character("SNOWMAN"), Some('☃'));
 /// assert_eq!(unicode_names2::character("BACKSPACE"), Some('\x08'));
 ///
 /// assert_eq!(unicode_names2::character("nonsense"), None);
 /// ```
 pub fn character(search_name: &str) -> Option<char> {
-    // + 1 so that we properly handle the case when `name` has a
-    // prefix of the longest name, but isn't exactly equal.
-    let mut buf = [0; MAX_NAME_LENGTH + 1];
-    for (place, byte) in buf.iter_mut().zip(search_name.bytes()) {
-        *place = byte.to_ascii_uppercase();
-    }
-    let search_name = buf.get(..search_name.len())?;
+    let original_name = search_name;
+    let mut buf = [0; LONGEST_NAME_LEN];
+    let len = normalise_name(search_name, &mut buf);
+    let search_name = &buf[..len];
 
     // try `HANGUL SYLLABLE <choseong><jungseong><jongseong>`
-    if search_name.starts_with(HANGUL_SYLLABLE_PREFIX.as_bytes()) {
-        let remaining = &search_name[HANGUL_SYLLABLE_PREFIX.len()..];
+    if search_name.starts_with(NORMALISED_HANGUL_SYLLABLE_PREFIX.as_bytes()) {
+        let remaining = &search_name[NORMALISED_HANGUL_SYLLABLE_PREFIX.len()..];
         let (choseong, remaining) = jamo::slice_shift_choseong(remaining);
         let (jungseong, remaining) = jamo::slice_shift_jungseong(remaining);
         let (jongseong, remaining) = jamo::slice_shift_jongseong(remaining);
@@ -364,8 +388,8 @@ pub fn character(search_name: &str) -> Option<char> {
     }
 
     // try `CJK UNIFIED IDEOGRAPH-<digits>`
-    if search_name.starts_with(CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
-        let remaining = &search_name[CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
+    if search_name.starts_with(NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
+        let remaining = &search_name[NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
         if remaining.len() > 5 {
             return None;
         } // avoid overflow
@@ -378,10 +402,7 @@ pub fn character(search_name: &str) -> Option<char> {
                 _ => return None,
             }
         }
-        let ch = match char::from_u32(v) {
-            Some(ch) => ch,
-            None => return None,
-        };
+        let ch = char::from_u32(v)?;
 
         // check if the resulting code is indeed in the known ranges
         if is_cjk_unified_ideograph(ch) {
@@ -411,28 +432,96 @@ pub fn character(search_name: &str) -> Option<char> {
     let maybe_name = match name(codepoint) {
         None => {
             if true {
-                debug_assert!(false)
+                debug_assert!(false) // what?
             }
             return character_by_alias(search_name);
         }
         Some(name) => name,
     };
 
-    // run through the parts of the name, matching them against the
-    // parts of the input.
-    let mut passed_name = search_name;
+    // `name(codepoint)` returns an iterator yielding words separated by spaces or hyphens.
+    // That means whenever a name contains a non-medial hyphen, it must be emulated by inserting an
+    // artificial empty word (`""`) between the space and the hyphen.
+    let mut cmp_name = search_name;
     for part in maybe_name {
-        let part = part.as_bytes();
-        let part_l = part.len();
-        if passed_name.len() < part_l || &passed_name[..part_l] != part {
+        let part = match part {
+            "" => "-", // Non-medial hyphens are preserved by `normalise_name`, check them.
+            " " => continue, // Spaces and medial hyphens are removed, ignore them.
+            "-" if codepoint != '\u{1180}' => continue, // But the hyphen in U+1180 is preserved.
+            word => word,
+        };
+
+        if let Some(rest) = cmp_name.strip_prefix(part.as_bytes()) {
+            cmp_name = rest;
+        } else {
             return character_by_alias(search_name);
         }
-        passed_name = &passed_name[part_l..]
+    }
+
+    // "HANGUL JUNGSEONG O-E" is ambiguous, returning U+116C HANGUL JUNGSEONG OE instead.
+    // All other ways of spelling U+1180 will get properly detected, so it's enough to just check
+    // if the hyphen is in the right place.
+    if codepoint == '\u{116C}'
+        && original_name
+            .trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '_')
+            .bytes()
+            .nth_back(1)
+            == Some(b'-')
+    {
+        return Some('\u{1180}');
     }
 
     Some(codepoint)
 }
 
+/// Convert a Unicode name to a form that can be used for loose matching, as per
+/// [UAX#44](https://www.unicode.org/reports/tr44/tr44-34.html#Matching_Names).
+///
+/// This function matches `unicode_names2_generator::normalise_name` in implementation, except that
+/// the special case of U+1180 HANGUL JUNGSEONG O-E isn't handled here, because we don't yet know
+/// which character is being queried, and comparing every query against that one name would be
+/// needlessly expensive given it only matches a single character. Thus the case of U+1180 is
+/// handled at the end of [`character`].
+///
+/// Returns the number of normalised bytes written to the start of `buf`, or `0` when `search_name`
+/// contains a byte that cannot occur in a Unicode name or does not fit in `buf`.
+fn normalise_name(search_name: &str, buf: &mut [u8; LONGEST_NAME_LEN]) -> usize {
+    let mut cursor = 0;
+    let bytes = search_name.as_bytes();
+
+    for (i, c) in bytes.iter().map(u8::to_ascii_uppercase).enumerate() {
+        // "Ignore case, whitespace, underscore ('_'), [...]"
+        if c.is_ascii_whitespace() || c == b'_' {
+            continue;
+        }
+
+        // "[...] and all medial hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E."
+        // See doc comment for why U+1180 isn't handled
+        // `checked_sub` keeps a leading hyphen (i == 0) from underflowing the index.
+        if c == b'-'
+            && i.checked_sub(1).and_then(|j| bytes.get(j)).map_or(false, u8::is_ascii_alphanumeric)
+            && bytes.get(i + 1).map_or(false, u8::is_ascii_alphanumeric)
+        {
+            continue;
+        }
+
+        if !c.is_ascii_alphanumeric() && c != b'-' {
+            // All Unicode names consist only of alphanumeric characters and hyphens after
+            // stripping spaces and underscores. Returning 0 effectively serves as returning `None`.
+            return 0;
+        }
+
+        if cursor >= buf.len() {
+            // No Unicode character has this long a name.
+            return 0;
+        }
+        buf[cursor] = c;
+        cursor += 1;
+    }
+
+    cursor
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -515,9 +604,9 @@ mod tests {
 
     #[test]
     fn character_negative() {
-        let long_name = "x".repeat(100);
-        assert!(long_name.len() > MAX_NAME_LENGTH); // Otherwise this test is pointless
-        let names = ["", "x", "öäå", "SPAACE", &long_name];
+        let long_name = "x".repeat(generated::LONGEST_NAME_LEN + 1);
+        let prefix = format!("{}x", generated::LONGEST_NAME); // This name would appear valid if truncated
+        let names = ["", "x", "öäå", "SPAACE", "-x", &long_name, &prefix];
         for &n in names.iter() {
             assert_eq!(character(n), None);
         }
@@ -614,6 +703,31 @@ mod tests {
         assert_eq!(super::character_by_alias(b"NOT AN ALIAS"), None);
     }
 
+    #[test]
+    fn test_uax44() {
+        assert_eq!(character(" L_O_W l_i_n_e"), Some('_'));
+        assert_eq!(character("space \x09\x0a\x0c\x0d"), Some(' '));
+        assert_eq!(character("FULL S-T-O-P"), Some('.'));
+        assert_eq!(character("tibetan letter -a"), Some('\u{F60}'));
+        assert_eq!(character("tibetan letter- a"), Some('\u{F60}'));
+        assert_eq!(character("tibetan letter - a"), Some('\u{F60}'));
+        assert_eq!(character("tibetan letter_-_a"), Some('\u{F60}'));
+        assert_eq!(character("latinSMALLletterA"), Some('a'));
+
+        // Test exceptions related to U+1180
+        let jungseong_oe = Some('\u{116C}');
+        let jungseong_o_e = Some('\u{1180}');
+        assert_eq!(character("HANGUL JUNGSEONG OE"), jungseong_oe);
+        assert_eq!(character("HANGUL JUNGSEONG O_E"), jungseong_oe);
+        assert_eq!(character("HANGUL JUNGSEONG O E"), jungseong_oe);
+        assert_eq!(character("HANGUL JUNGSEONG O-E"), jungseong_o_e);
+        assert_eq!(character("HANGUL JUNGSEONG O-E\n"), jungseong_o_e);
+        assert_eq!(character("HANGUL JUNGSEONG O-E__"), jungseong_o_e);
+        assert_eq!(character("HANGUL JUNGSEONG O- E"), jungseong_o_e);
+        assert_eq!(character("HANGUL JUNGSEONG O -E"), jungseong_o_e);
+        assert_eq!(character("HANGUL JUNGSEONG O_-_E"), jungseong_o_e);
+    }
+
     #[bench]
     fn name_basic(b: &mut Bencher) {
         b.iter(|| {