Skip to content

Commit d6b2923

Browse files
authored
Implement UAX#44-LM2 for name normalization (#46)
This follows UAX's example procedure: > An implementation of this loose matching rule can obtain the correct results when comparing two strings by doing the following three operations, in order: > * remove all medial hyphens (except the medial hyphen in the name for U+1180) > * remove all whitespace and underscore characters > * apply toLowercase() to both strings > After applying these three operations, if the two strings compare binary equal, then they are considered to match. — and generating the name->codepoint PHF with the mapped ("normalised") names as keys. This isn't a breaking change; in particular, any name that mapped to a character before will continue mapping to that same character.
1 parent 6757ad2 commit d6b2923

File tree

3 files changed

+203
-53
lines changed

3 files changed

+203
-53
lines changed

generator/src/lib.rs

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ fn create_lexicon_and_offsets(
156156
// insert the suffixes of this word which saves about
157157
// 10KB (we could theoretically insert all substrings,
158158
// upto a certain length, but this only saves ~300
159-
// bytes or so and is noticably slower).
159+
// bytes or so and is noticeably slower).
160160
for i in 1..n.len() {
161161
if t.insert(n[i..].bytes(), Some(offset + i), true).0 {
162162
// once we've found a string that's already
@@ -307,9 +307,9 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
307307
// currently huge, but it has a lot of 0's, so we compress it
308308
// using the binning, below.
309309
let mut phrasebook_offsets = repeat(0).take(0x10FFFF + 1).collect::<Vec<_>>();
310-
let mut longest_name = 0;
310+
let mut longest_name = String::new();
311311
for &(cp, name) in codepoint_names.iter() {
312-
longest_name = cmp::max(name.len(), longest_name);
312+
longest_name = cmp::max_by_key(normalise_name(name, cp), longest_name, |s| s.len());
313313

314314
let start = phrasebook.len() as u32;
315315
phrasebook_offsets[cp as usize] = start;
@@ -337,8 +337,8 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
337337

338338
w!(
339339
ctxt,
340-
"pub const MAX_NAME_LENGTH: usize = {};\n",
341-
longest_name
340+
"pub const LONGEST_NAME: &str = {longest_name:?};\n\
341+
pub const LONGEST_NAME_LEN: usize = LONGEST_NAME.len();\n"
342342
);
343343
ctxt.write_plain_string("LEXICON", &lexicon_string);
344344
ctxt.write_debugs("LEXICON_OFFSETS", "u32", &lexicon_offsets);
@@ -407,6 +407,11 @@ pub fn generate_phf(
407407
) {
408408
let (codepoint_names, _) = get_truncated_table_data(unicode_data, truncate);
409409

410+
let codepoint_names: Vec<_> = codepoint_names
411+
.into_iter()
412+
.map(|(c, s)| (c, normalise_name(s, c)))
413+
.collect();
414+
410415
let mut ctxt = make_context(path);
411416
let (n, disps, data) = phf::create_phf(&codepoint_names, lambda, tries);
412417

@@ -420,6 +425,41 @@ pub fn generate_phf(
420425
}
421426
}
422427

428+
/// Convert a Unicode name to a form that can be used for loose matching, as per
/// [UAX#44](https://www.unicode.org/reports/tr44/tr44-34.html#Matching_Names)
///
/// This function matches `unicode_names2::normalise_name` in implementation, thus the result of one
/// can be used to query a PHF generated from the other.
///
/// Normalisation drops whitespace and underscores, drops *medial* hyphens (a hyphen with an
/// alphanumeric on both sides), and folds the remaining ASCII characters to upper case (case
/// folding to either case satisfies UAX44-LM2; upper case matches the lexicon's storage form).
///
/// # Panics
///
/// Panics if `s` contains a byte that is not ASCII alphanumeric, whitespace, `_`, or `-` —
/// such a string cannot be a Unicode character name, so this indicates corrupt input data.
fn normalise_name(s: &str, codepoint: char) -> String {
    let mut normalised = String::new();
    let bytes = s.as_bytes();
    for (i, c) in bytes.iter().map(u8::to_ascii_uppercase).enumerate() {
        // "Ignore case, whitespace, underscore ('_'), [...]"
        if c.is_ascii_whitespace() || c == b'_' {
            continue;
        }

        // "[...] and all medial hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E."
        // `checked_sub` avoids the usize underflow (debug-build panic) that `i - 1` would
        // hit if a (malformed) name started with '-': such a hyphen is not medial and is
        // kept, subject to the assert below.
        if codepoint != '\u{1180}' // HANGUL JUNGSEONG O-E
            && c == b'-'
            && i.checked_sub(1)
                .and_then(|prev| bytes.get(prev))
                .map_or(false, u8::is_ascii_alphanumeric)
            && bytes.get(i + 1).map_or(false, u8::is_ascii_alphanumeric)
        {
            continue;
        }

        // Everything that survives filtering must be a legal Unicode-name character.
        assert!(
            c.is_ascii_alphanumeric() || c == b'-',
            "U+{:04X} contains an invalid character for a Unicode name: {:?}",
            codepoint as u32,
            s
        );

        normalised.push(c as char);
    }

    normalised
}
462+
423463
pub fn generate(unicode_data: &'static str, path: Option<&Path>, truncate: Option<usize>) {
424464
let (codepoint_names, cjk) = get_truncated_table_data(unicode_data, truncate);
425465
let mut ctxt = make_context(path);

generator/src/phf.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,14 @@ struct Hash {
4040

4141
#[allow(clippy::type_complexity)]
4242
fn try_phf_table(
43-
values: &[(char, &str)],
43+
values: &[(char, String)],
4444
lambda: usize,
4545
seed: u64,
4646
rng: &mut StdRng,
4747
) -> Option<(Vec<(u32, u32)>, Vec<char>)> {
4848
let hashes: Vec<_> = values
4949
.iter()
50-
.map(|&(n, s)| (split(hash(s, seed)), n))
50+
.map(|(n, s)| (split(hash(s, seed)), *n))
5151
.collect();
5252

5353
let table_len = hashes.len();
@@ -140,7 +140,7 @@ fn try_phf_table(
140140
}
141141

142142
pub fn create_phf(
143-
data: &[(char, &str)],
143+
data: &[(char, String)],
144144
lambda: usize,
145145
max_tries: usize,
146146
) -> (u64, Vec<(u32, u32)>, Vec<char>) {

0 commit comments

Comments (0)