Skip to content

Commit d6b2923

Browse files
authored
Implement UAX#44-LM2 for name normalization (#46)
This follows UAX's example procedure: > An implementation of this loose matching rule can obtain the correct results when comparing two strings by doing the following three operations, in order: > * remove all medial hyphens (except the medial hyphen in the name for U+1180) > * remove all whitespace and underscore characters > * apply toLowercase() to both strings > After applying these three operations, if the two strings compare binary equal, then they are considered to match. — and generating the name->codepoint PHF with the mapped ("normalised") names as keys. This isn't a breaking change; in particular, any name that mapped to a character before will continue mapping to that same character.
1 parent 6757ad2 commit d6b2923

File tree

3 files changed

+203
-53
lines changed

3 files changed

+203
-53
lines changed

generator/src/lib.rs

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ fn create_lexicon_and_offsets(
156156
// insert the suffixes of this word which saves about
157157
// 10KB (we could theoretically insert all substrings,
158158
// upto a certain length, but this only saves ~300
159-
// bytes or so and is noticably slower).
159+
// bytes or so and is noticeably slower).
160160
for i in 1..n.len() {
161161
if t.insert(n[i..].bytes(), Some(offset + i), true).0 {
162162
// once we've found a string that's already
@@ -307,9 +307,9 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
307307
// currently huge, but it has a lot of 0's, so we compress it
308308
// using the binning, below.
309309
let mut phrasebook_offsets = repeat(0).take(0x10FFFF + 1).collect::<Vec<_>>();
310-
let mut longest_name = 0;
310+
let mut longest_name = String::new();
311311
for &(cp, name) in codepoint_names.iter() {
312-
longest_name = cmp::max(name.len(), longest_name);
312+
longest_name = cmp::max_by_key(normalise_name(name, cp), longest_name, |s| s.len());
313313

314314
let start = phrasebook.len() as u32;
315315
phrasebook_offsets[cp as usize] = start;
@@ -337,8 +337,8 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
337337

338338
w!(
339339
ctxt,
340-
"pub const MAX_NAME_LENGTH: usize = {};\n",
341-
longest_name
340+
"pub const LONGEST_NAME: &str = {longest_name:?};\n\
341+
pub const LONGEST_NAME_LEN: usize = LONGEST_NAME.len();\n"
342342
);
343343
ctxt.write_plain_string("LEXICON", &lexicon_string);
344344
ctxt.write_debugs("LEXICON_OFFSETS", "u32", &lexicon_offsets);
@@ -407,6 +407,11 @@ pub fn generate_phf(
407407
) {
408408
let (codepoint_names, _) = get_truncated_table_data(unicode_data, truncate);
409409

410+
let codepoint_names: Vec<_> = codepoint_names
411+
.into_iter()
412+
.map(|(c, s)| (c, normalise_name(s, c)))
413+
.collect();
414+
410415
let mut ctxt = make_context(path);
411416
let (n, disps, data) = phf::create_phf(&codepoint_names, lambda, tries);
412417

@@ -420,6 +425,41 @@ pub fn generate_phf(
420425
}
421426
}
422427

428+
/// Convert a Unicode name to a form that can be used for loose matching, as per
/// [UAX#44](https://www.unicode.org/reports/tr44/tr44-34.html#Matching_Names)
///
/// This function matches `unicode_names2::normalise_name` in implementation, thus the result of one
/// can be used to query a PHF generated from the other.
///
/// Normalisation drops whitespace and underscores, drops *medial* hyphens (a hyphen with an
/// alphanumeric on both sides), and folds the remaining ASCII characters to upper case (case
/// folding to either case satisfies UAX44-LM2; upper case matches the lexicon's storage form).
///
/// # Panics
///
/// Panics if `s` contains a byte that is not ASCII alphanumeric, whitespace, `_`, or `-` —
/// such a string cannot be a Unicode character name, so this indicates corrupt input data.
fn normalise_name(s: &str, codepoint: char) -> String {
    let mut normalised = String::new();
    let bytes = s.as_bytes();
    for (i, c) in bytes.iter().map(u8::to_ascii_uppercase).enumerate() {
        // "Ignore case, whitespace, underscore ('_'), [...]"
        if c.is_ascii_whitespace() || c == b'_' {
            continue;
        }

        // "[...] and all medial hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E."
        // `checked_sub` avoids the usize underflow (debug-build panic) that `i - 1` would
        // hit if a (malformed) name started with '-': such a hyphen is not medial and is
        // kept, subject to the assert below.
        if codepoint != '\u{1180}' // HANGUL JUNGSEONG O-E
            && c == b'-'
            && i.checked_sub(1)
                .and_then(|prev| bytes.get(prev))
                .map_or(false, u8::is_ascii_alphanumeric)
            && bytes.get(i + 1).map_or(false, u8::is_ascii_alphanumeric)
        {
            continue;
        }

        // Everything that survives filtering must be a legal Unicode-name character.
        assert!(
            c.is_ascii_alphanumeric() || c == b'-',
            "U+{:04X} contains an invalid character for a Unicode name: {:?}",
            codepoint as u32,
            s
        );

        normalised.push(c as char);
    }

    normalised
}
462+
423463
pub fn generate(unicode_data: &'static str, path: Option<&Path>, truncate: Option<usize>) {
424464
let (codepoint_names, cjk) = get_truncated_table_data(unicode_data, truncate);
425465
let mut ctxt = make_context(path);

generator/src/phf.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,14 @@ struct Hash {
4040

4141
#[allow(clippy::type_complexity)]
4242
fn try_phf_table(
43-
values: &[(char, &str)],
43+
values: &[(char, String)],
4444
lambda: usize,
4545
seed: u64,
4646
rng: &mut StdRng,
4747
) -> Option<(Vec<(u32, u32)>, Vec<char>)> {
4848
let hashes: Vec<_> = values
4949
.iter()
50-
.map(|&(n, s)| (split(hash(s, seed)), n))
50+
.map(|(n, s)| (split(hash(s, seed)), *n))
5151
.collect();
5252

5353
let table_len = hashes.len();
@@ -140,7 +140,7 @@ fn try_phf_table(
140140
}
141141

142142
pub fn create_phf(
143-
data: &[(char, &str)],
143+
data: &[(char, String)],
144144
lambda: usize,
145145
max_tries: usize,
146146
) -> (u64, Vec<(u32, u32)>, Vec<char>) {

0 commit comments

Comments (0)