Implement UAX#44-LM2

emilyyyylime · emilyyyylime · commit 9865ad488267 · 2025-06-07T11:21:19.000+03:00
diff --git a/generator/src/lib.rs b/generator/src/lib.rs
@@ -156,7 +156,7 @@ fn create_lexicon_and_offsets(
                 // insert the suffixes of this word which saves about
                 // 10KB (we could theoretically insert all substrings,
                 // upto a certain length, but this only saves ~300
-                // bytes or so and is noticably slower).
+                // bytes or so and is noticeably slower).
                 for i in 1..n.len() {
                     if t.insert(n[i..].bytes(), Some(offset + i), true).0 {
                         // once we've found a string that's already
@@ -308,8 +308,10 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
     // using the binning, below.
     let mut phrasebook_offsets = repeat(0).take(0x10FFFF + 1).collect::<Vec<_>>();
     let mut longest_name = 0;
+    let mut longest_normalised_name = 0;
     for &(cp, name) in codepoint_names.iter() {
         longest_name = cmp::max(name.len(), longest_name);
+        longest_normalised_name = cmp::max(normalise_name(name, cp).len(), longest_normalised_name);
 
         let start = phrasebook.len() as u32;
         phrasebook_offsets[cp as usize] = start;
@@ -337,8 +339,8 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
 
     w!(
         ctxt,
-        "pub const MAX_NAME_LENGTH: usize = {};\n",
-        longest_name
+        "pub const MAX_NAME_LENGTH: usize = {longest_name};\n\
+        pub const MAX_NORMALISED_NAME_LENGTH: usize = {longest_normalised_name};\n",
     );
     ctxt.write_plain_string("LEXICON", &lexicon_string);
     ctxt.write_debugs("LEXICON_OFFSETS", "u32", &lexicon_offsets);
@@ -407,6 +409,11 @@ pub fn generate_phf(
 ) {
     let (codepoint_names, _) = get_truncated_table_data(unicode_data, truncate);
 
+    let codepoint_names: Vec<_> = codepoint_names
+        .into_iter()
+        .map(|(c, s)| (c, normalise_name(s, c)))
+        .collect();
+
     let mut ctxt = make_context(path);
     let (n, disps, data) = phf::create_phf(&codepoint_names, lambda, tries);
 
@@ -420,6 +427,31 @@ pub fn generate_phf(
     }
 }
 
+/// Builds the UAX44-LM2 loose-matching key for `s`, the name of `codepoint`.
+///
+/// ASCII whitespace, `_` and medial hyphens (a `-` directly between two ASCII alphanumerics)
+/// are dropped and the rest is uppercased; U+1180 HANGUL JUNGSEONG O-E keeps its hyphen.
+///
+/// Panics if `s` contains a byte that is not ASCII alphanumeric, ASCII whitespace, `_` or `-`.
+fn normalise_name(s: &str, codepoint: char) -> String {
+    let bytes = s.as_bytes();
+    // Neighbours are fetched with checked arithmetic: `i - 1` would underflow on a leading `-`.
+    let alnum_at = |i: Option<usize>| {
+        i.and_then(|i| bytes.get(i)).is_some_and(u8::is_ascii_alphanumeric)
+    };
+    let mut normalised = String::with_capacity(bytes.len());
+    for (i, &c) in bytes.iter().enumerate() {
+        let medial_hyphen = c == b'-' && alnum_at(i.checked_sub(1)) && alnum_at(Some(i + 1));
+        if c.is_ascii_whitespace() || c == b'_' || (medial_hyphen && codepoint != '\u{1180}') {
+            continue;
+        }
+        let valid = c.is_ascii_alphanumeric() || c == b'-';
+        assert!(valid, "{:?} isn't a valid character for a Unicode name", c as char);
+        normalised.push(c.to_ascii_uppercase() as char);
+    }
+    normalised
+}
+
 pub fn generate(unicode_data: &'static str, path: Option<&Path>, truncate: Option<usize>) {
     let (codepoint_names, cjk) = get_truncated_table_data(unicode_data, truncate);
     let mut ctxt = make_context(path);
diff --git a/generator/src/phf.rs b/generator/src/phf.rs
@@ -40,14 +40,14 @@ struct Hash {
 
 #[allow(clippy::type_complexity)]
 fn try_phf_table(
-    values: &[(char, &str)],
+    values: &[(char, String)],
     lambda: usize,
     seed: u64,
     rng: &mut StdRng,
 ) -> Option<(Vec<(u32, u32)>, Vec<char>)> {
     let hashes: Vec<_> = values
         .iter()
-        .map(|&(n, s)| (split(hash(s, seed)), n))
+        .map(|(n, s)| (split(hash(&s, seed)), *n))
         .collect();
 
     let table_len = hashes.len();
@@ -140,7 +140,7 @@ fn try_phf_table(
 }
 
 pub fn create_phf(
-    data: &[(char, &str)],
+    data: &[(char, String)],
     lambda: usize,
     max_tries: usize,
 ) -> (u64, Vec<(u32, u32)>, Vec<char>) {
diff --git a/src/lib.rs b/src/lib.rs
@@ -68,7 +68,7 @@ extern crate std;
 
 use core::{char, fmt};
 use generated::{
-    MAX_NAME_LENGTH, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
+    MAX_NORMALISED_NAME_LENGTH, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
 };
 
 #[allow(dead_code)]
@@ -95,7 +95,9 @@ static ALIASES: phf::Map<&'static [u8], char> =
 mod iter_str;
 
 static HANGUL_SYLLABLE_PREFIX: &str = "HANGUL SYLLABLE ";
+static NORMALISED_HANGUL_SYLLABLE_PREFIX: &str = "HANGULSYLLABLE";
 static CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJK UNIFIED IDEOGRAPH-";
+static NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJKUNIFIEDIDEOGRAPH";
 
 fn is_cjk_unified_ideograph(ch: char) -> bool {
     generated::CJK_IDEOGRAPH_RANGES
@@ -336,17 +338,16 @@ fn character_by_alias(name: &[u8]) -> Option<char> {
 /// assert_eq!(unicode_names2::character("nonsense"), None);
 /// ```
 pub fn character(search_name: &str) -> Option<char> {
+    let original_name = search_name;
     // + 1 so that we properly handle the case when `name` has a
     // prefix of the longest name, but isn't exactly equal.
-    let mut buf = [0; MAX_NAME_LENGTH + 1];
-    for (place, byte) in buf.iter_mut().zip(search_name.bytes()) {
-        *place = byte.to_ascii_uppercase();
-    }
-    let search_name = buf.get(..search_name.len())?;
+    let mut buf = [0; MAX_NORMALISED_NAME_LENGTH + 1];
+    let len = normalise(search_name, &mut buf[..]);
+    let search_name = &buf[..len];
 
     // try `HANGUL SYLLABLE <choseong><jungseong><jongseong>`
-    if search_name.starts_with(HANGUL_SYLLABLE_PREFIX.as_bytes()) {
-        let remaining = &search_name[HANGUL_SYLLABLE_PREFIX.len()..];
+    if search_name.starts_with(NORMALISED_HANGUL_SYLLABLE_PREFIX.as_bytes()) {
+        let remaining = &search_name[NORMALISED_HANGUL_SYLLABLE_PREFIX.len()..];
         let (choseong, remaining) = jamo::slice_shift_choseong(remaining);
         let (jungseong, remaining) = jamo::slice_shift_jungseong(remaining);
         let (jongseong, remaining) = jamo::slice_shift_jongseong(remaining);
@@ -364,8 +365,8 @@ pub fn character(search_name: &str) -> Option<char> {
     }
 
     // try `CJK UNIFIED IDEOGRAPH-<digits>`
-    if search_name.starts_with(CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
-        let remaining = &search_name[CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
+    if search_name.starts_with(NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
+        let remaining = &search_name[NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
         if remaining.len() > 5 {
             return None;
         } // avoid overflow
@@ -411,7 +412,7 @@ pub fn character(search_name: &str) -> Option<char> {
     let maybe_name = match name(codepoint) {
         None => {
             if true {
-                debug_assert!(false)
+                debug_assert!(false) // what?
             }
             return character_by_alias(search_name);
         }
@@ -420,19 +421,60 @@ pub fn character(search_name: &str) -> Option<char> {
 
     // run through the parts of the name, matching them against the
     // parts of the input.
-    let mut passed_name = search_name;
+    let mut cmp_name = search_name;
     for part in maybe_name {
-        let part = part.as_bytes();
-        let part_l = part.len();
-        if passed_name.len() < part_l || &passed_name[..part_l] != part {
+        let part = match part {
+            "" => "-", // An empty word only appears before or after a non-medial hyphen
+            " " => continue,
+            "-" if codepoint != '\u{1180}' => continue,
+            part => part,
+        };
+
+        if let Some(rest) = cmp_name.strip_prefix(part.as_bytes()) {
+            cmp_name = rest;
+        } else {
             return character_by_alias(search_name);
         }
-        passed_name = &passed_name[part_l..]
+    }
+
+    // HANGUL JUNGSEONG OE is ambiguous with HANGUL JUNGSEONG O-E
+    if codepoint == '\u{116C}' && {
+        let tmp = original_name.trim_ascii_end();
+        tmp[tmp.len() - 3..].eq_ignore_ascii_case("O-E")
+    } {
+        return Some('\u{1180}');
     }
 
     Some(codepoint)
 }
 
+/// Writes the UAX44-LM2 loose-matching form of `search_name` into `buf`, returning its length.
+///
+/// ASCII whitespace, `_` and medial hyphens (a `-` directly between two ASCII alphanumerics)
+/// are skipped and the rest is uppercased. Returns 0 if a byte is neither ASCII alphanumeric
+/// nor `-`, or if the result does not fit in `buf`.
+fn normalise(search_name: &str, buf: &mut [u8]) -> usize {
+    let bytes = search_name.as_bytes();
+    // Neighbours are fetched with checked arithmetic: `i - 1` would underflow on a leading `-`.
+    let alnum_at = |i: Option<usize>| {
+        i.and_then(|i| bytes.get(i)).is_some_and(u8::is_ascii_alphanumeric)
+    };
+    let mut cursor = 0;
+    for (i, &c) in bytes.iter().enumerate() {
+        let medial_hyphen = c == b'-' && alnum_at(i.checked_sub(1)) && alnum_at(Some(i + 1));
+        if c.is_ascii_whitespace() || c == b'_' || medial_hyphen {
+            continue;
+        }
+        let valid = c.is_ascii_alphanumeric() || c == b'-';
+        match buf.get_mut(cursor) {
+            Some(slot) if valid => *slot = c.to_ascii_uppercase(),
+            _ => return 0, // invalid byte, or `buf` is already full
+        }
+        cursor += 1;
+    }
+    cursor
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -516,7 +558,7 @@ mod tests {
     #[test]
     fn character_negative() {
         let long_name = "x".repeat(100);
-        assert!(long_name.len() > MAX_NAME_LENGTH); // Otherwise this test is pointless
+        assert!(long_name.len() > generated::MAX_NAME_LENGTH); // Otherwise this test is pointless
         let names = ["", "x", "öäå", "SPAACE", &long_name];
         for &n in names.iter() {
             assert_eq!(character(n), None);
@@ -614,6 +656,27 @@ mod tests {
         assert_eq!(super::character_by_alias(b"NOT AN ALIAS"), None);
     }
 
+    #[test]
+    fn test_uax44() {
+        for (name, expected) in [
+            (" L_O_W l_i_n_e", '_'),
+            ("space \x09\x0a\x0c\x0d", ' '),
+            ("FULL S-T-O-P", '.'),
+            ("tibetan letter -a", '\u{F60}'),
+            ("tibetan letter- a", '\u{F60}'),
+            ("tibetan letter  -   a", '\u{F60}'),
+            ("tibetan letter_-_a", '\u{F60}'),
+            ("HANGUL JUNGSEONG OE", '\u{116C}'), // U+116C is the hyphen-less name
+            ("HANGUL JUNGSEONG O-E", '\u{1180}'), // U+1180 is the one hyphen exception
+            ("HANGUL JUNGSEONG O E", '\u{116C}'),
+            ("HANGUL JUNGSEONG O- E", '\u{1180}'),
+            ("HANGUL JUNGSEONG O -E", '\u{1180}'),
+            ("HANGUL JUNGSEONG O_-_E", '\u{1180}'),
+        ] {
+            assert_eq!(character(name), Some(expected), "{name:?}");
+        }
+    }
+
     #[bench]
     fn name_basic(b: &mut Bencher) {
         b.iter(|| {