Skip to content

Commit 9865ad4

Browse files
committed
Implement UAX#44-LM2
1 parent 598db07 commit 9865ad4

File tree

3 files changed

+118
-23
lines changed

3 files changed

+118
-23
lines changed

generator/src/lib.rs

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ fn create_lexicon_and_offsets(
156156
// insert the suffixes of this word which saves about
157157
// 10KB (we could theoretically insert all substrings,
158158
// up to a certain length, but this only saves ~300
159-
// bytes or so and is noticably slower).
159+
// bytes or so and is noticeably slower).
160160
for i in 1..n.len() {
161161
if t.insert(n[i..].bytes(), Some(offset + i), true).0 {
162162
// once we've found a string that's already
@@ -308,8 +308,10 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
308308
// using the binning, below.
309309
let mut phrasebook_offsets = repeat(0).take(0x10FFFF + 1).collect::<Vec<_>>();
310310
let mut longest_name = 0;
311+
let mut longest_normalised_name = 0;
311312
for &(cp, name) in codepoint_names.iter() {
312313
longest_name = cmp::max(name.len(), longest_name);
314+
longest_normalised_name = cmp::max(normalise_name(name, cp).len(), longest_normalised_name);
313315

314316
let start = phrasebook.len() as u32;
315317
phrasebook_offsets[cp as usize] = start;
@@ -337,8 +339,8 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
337339

338340
w!(
339341
ctxt,
340-
"pub const MAX_NAME_LENGTH: usize = {};\n",
341-
longest_name
342+
"pub const MAX_NAME_LENGTH: usize = {longest_name};\n\
343+
pub const MAX_NORMALISED_NAME_LENGTH: usize = {longest_normalised_name};\n",
342344
);
343345
ctxt.write_plain_string("LEXICON", &lexicon_string);
344346
ctxt.write_debugs("LEXICON_OFFSETS", "u32", &lexicon_offsets);
@@ -407,6 +409,11 @@ pub fn generate_phf(
407409
) {
408410
let (codepoint_names, _) = get_truncated_table_data(unicode_data, truncate);
409411

412+
let codepoint_names: Vec<_> = codepoint_names
413+
.into_iter()
414+
.map(|(c, s)| (c, normalise_name(s, c)))
415+
.collect();
416+
410417
let mut ctxt = make_context(path);
411418
let (n, disps, data) = phf::create_phf(&codepoint_names, lambda, tries);
412419

@@ -420,6 +427,31 @@ pub fn generate_phf(
420427
}
421428
}
422429

430+
/// Normalises a Unicode character name for loose matching (UAX #44, rule LM2).
///
/// ASCII whitespace and underscores are dropped, letters are upper-cased and
/// medial hyphens (a `-` with an ASCII alphanumeric byte directly on both
/// sides) are removed. The hyphen of U+1180 HANGUL JUNGSEONG O-E is the one
/// exception: it is kept when `codepoint` is U+1180.
///
/// # Panics
///
/// Panics if `s` contains a byte that is not ASCII alphanumeric, ASCII
/// whitespace, `_` or `-`.
fn normalise_name(s: &str, codepoint: char) -> String {
    let bytes = s.as_bytes();
    // The normalised form is never longer than the input.
    let mut normalised = String::with_capacity(bytes.len());

    // `idx` is `None` when the neighbour would lie before the start of the
    // name; `get` covers the position past the end. Using checked arithmetic
    // here avoids the `i - 1` underflow for a hyphen in the first position.
    let is_alnum_at = |idx: Option<usize>| {
        idx.and_then(|j| bytes.get(j))
            .is_some_and(u8::is_ascii_alphanumeric)
    };

    for (i, &c) in bytes.iter().enumerate() {
        if c.is_ascii_whitespace() || c == b'_' {
            continue;
        }
        if codepoint != '\u{1180}' // HANGUL JUNGSEONG O-E
            && c == b'-'
            && is_alnum_at(i.checked_sub(1))
            && is_alnum_at(i.checked_add(1))
        {
            // Medial hyphen: ignored by loose matching.
            continue;
        }
        assert!(
            c.is_ascii_alphanumeric() || c == b'-',
            "{:?} isn't a valid character for a Unicode name",
            c as char
        );
        normalised.push(c.to_ascii_uppercase() as char);
    }

    normalised
}
454+
423455
pub fn generate(unicode_data: &'static str, path: Option<&Path>, truncate: Option<usize>) {
424456
let (codepoint_names, cjk) = get_truncated_table_data(unicode_data, truncate);
425457
let mut ctxt = make_context(path);

generator/src/phf.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,14 @@ struct Hash {
4040

4141
#[allow(clippy::type_complexity)]
4242
fn try_phf_table(
43-
values: &[(char, &str)],
43+
values: &[(char, String)],
4444
lambda: usize,
4545
seed: u64,
4646
rng: &mut StdRng,
4747
) -> Option<(Vec<(u32, u32)>, Vec<char>)> {
4848
let hashes: Vec<_> = values
4949
.iter()
50-
.map(|&(n, s)| (split(hash(s, seed)), n))
50+
.map(|(n, s)| (split(hash(&s, seed)), *n))
5151
.collect();
5252

5353
let table_len = hashes.len();
@@ -140,7 +140,7 @@ fn try_phf_table(
140140
}
141141

142142
pub fn create_phf(
143-
data: &[(char, &str)],
143+
data: &[(char, String)],
144144
lambda: usize,
145145
max_tries: usize,
146146
) -> (u64, Vec<(u32, u32)>, Vec<char>) {

src/lib.rs

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ extern crate std;
6868

6969
use core::{char, fmt};
7070
use generated::{
71-
MAX_NAME_LENGTH, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
71+
MAX_NORMALISED_NAME_LENGTH, PHRASEBOOK_OFFSETS1, PHRASEBOOK_OFFSETS2, PHRASEBOOK_OFFSET_SHIFT,
7272
};
7373

7474
#[allow(dead_code)]
@@ -95,7 +95,9 @@ static ALIASES: phf::Map<&'static [u8], char> =
9595
mod iter_str;
9696

9797
static HANGUL_SYLLABLE_PREFIX: &str = "HANGUL SYLLABLE ";
98+
static NORMALISED_HANGUL_SYLLABLE_PREFIX: &str = "HANGULSYLLABLE";
9899
static CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJK UNIFIED IDEOGRAPH-";
100+
static NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX: &str = "CJKUNIFIEDIDEOGRAPH";
99101

100102
fn is_cjk_unified_ideograph(ch: char) -> bool {
101103
generated::CJK_IDEOGRAPH_RANGES
@@ -336,17 +338,16 @@ fn character_by_alias(name: &[u8]) -> Option<char> {
336338
/// assert_eq!(unicode_names2::character("nonsense"), None);
337339
/// ```
338340
pub fn character(search_name: &str) -> Option<char> {
341+
let original_name = search_name;
339342
// + 1 so that we properly handle the case when `name` has a
340343
// prefix of the longest name, but isn't exactly equal.
341-
let mut buf = [0; MAX_NAME_LENGTH + 1];
342-
for (place, byte) in buf.iter_mut().zip(search_name.bytes()) {
343-
*place = byte.to_ascii_uppercase();
344-
}
345-
let search_name = buf.get(..search_name.len())?;
344+
let mut buf = [0; MAX_NORMALISED_NAME_LENGTH + 1];
345+
let len = normalise(search_name, &mut buf[..]);
346+
let search_name = &buf[..len];
346347

347348
// try `HANGUL SYLLABLE <choseong><jungseong><jongseong>`
348-
if search_name.starts_with(HANGUL_SYLLABLE_PREFIX.as_bytes()) {
349-
let remaining = &search_name[HANGUL_SYLLABLE_PREFIX.len()..];
349+
if search_name.starts_with(NORMALISED_HANGUL_SYLLABLE_PREFIX.as_bytes()) {
350+
let remaining = &search_name[NORMALISED_HANGUL_SYLLABLE_PREFIX.len()..];
350351
let (choseong, remaining) = jamo::slice_shift_choseong(remaining);
351352
let (jungseong, remaining) = jamo::slice_shift_jungseong(remaining);
352353
let (jongseong, remaining) = jamo::slice_shift_jongseong(remaining);
@@ -364,8 +365,8 @@ pub fn character(search_name: &str) -> Option<char> {
364365
}
365366

366367
// try `CJK UNIFIED IDEOGRAPH-<digits>`
367-
if search_name.starts_with(CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
368-
let remaining = &search_name[CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
368+
if search_name.starts_with(NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.as_bytes()) {
369+
let remaining = &search_name[NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX.len()..];
369370
if remaining.len() > 5 {
370371
return None;
371372
} // avoid overflow
@@ -411,7 +412,7 @@ pub fn character(search_name: &str) -> Option<char> {
411412
let maybe_name = match name(codepoint) {
412413
None => {
413414
if true {
414-
debug_assert!(false)
415+
debug_assert!(false) // what?
415416
}
416417
return character_by_alias(search_name);
417418
}
@@ -420,19 +421,60 @@ pub fn character(search_name: &str) -> Option<char> {
420421

421422
// run through the parts of the name, matching them against the
422423
// parts of the input.
423-
let mut passed_name = search_name;
424+
let mut cmp_name = search_name;
424425
for part in maybe_name {
425-
let part = part.as_bytes();
426-
let part_l = part.len();
427-
if passed_name.len() < part_l || &passed_name[..part_l] != part {
426+
let part = match part {
427+
"" => "-", // An empty word only appears before or after a non-medial hyphen
428+
" " => continue,
429+
"-" if codepoint != '\u{1180}' => continue,
430+
part => part,
431+
};
432+
433+
if let Some(rest) = cmp_name.strip_prefix(part.as_bytes()) {
434+
cmp_name = rest;
435+
} else {
428436
return character_by_alias(search_name);
429437
}
430-
passed_name = &passed_name[part_l..]
438+
}
439+
440+
// HANGUL JUNGSEONG OE is ambiguous with HANGUL JUNGSEONG O-E
441+
if codepoint == '\u{116C}' && {
442+
let tmp = original_name.trim_ascii_end();
443+
tmp[tmp.len() - 3..].eq_ignore_ascii_case("O-E")
444+
} {
445+
return Some('\u{1180}');
431446
}
432447

433448
Some(codepoint)
434449
}
435450

451+
/// Writes the loose-matching (UAX #44-LM2) form of `search_name` into `buf`
/// and returns the number of bytes written.
///
/// ASCII whitespace and underscores are skipped, letters are upper-cased and
/// every medial hyphen (a `-` with an ASCII alphanumeric byte directly on both
/// sides) is dropped; no per-character exception is applied here.
///
/// Returns `0` if `search_name` contains any byte other than ASCII
/// alphanumerics, ASCII whitespace, `_` or `-`, or if the normalised form does
/// not fit in `buf`.
fn normalise(search_name: &str, buf: &mut [u8]) -> usize {
    let bytes = search_name.as_bytes();

    // `idx` is `None` when the neighbour would lie before the start of the
    // input; `get` covers the position past the end. Checked arithmetic avoids
    // the `i - 1` underflow (a panic in debug builds) for a leading hyphen.
    let is_alnum_at = |idx: Option<usize>| {
        idx.and_then(|j| bytes.get(j))
            .is_some_and(u8::is_ascii_alphanumeric)
    };

    let mut cursor = 0;
    for (i, &c) in bytes.iter().enumerate() {
        if c.is_ascii_whitespace() || c == b'_' {
            continue;
        }
        if c == b'-' && is_alnum_at(i.checked_sub(1)) && is_alnum_at(i.checked_add(1)) {
            // Medial hyphen: ignored by loose matching.
            continue;
        }
        if !c.is_ascii_alphanumeric() && c != b'-' {
            return 0;
        }
        // Input longer than any known name: report "no match" instead of
        // writing out of bounds.
        let Some(slot) = buf.get_mut(cursor) else {
            return 0;
        };
        *slot = c.to_ascii_uppercase();
        cursor += 1;
    }

    cursor
}
477+
436478
#[cfg(test)]
437479
mod tests {
438480
use super::*;
@@ -516,7 +558,7 @@ mod tests {
516558
#[test]
517559
fn character_negative() {
518560
let long_name = "x".repeat(100);
519-
assert!(long_name.len() > MAX_NAME_LENGTH); // Otherwise this test is pointless
561+
assert!(long_name.len() > generated::MAX_NAME_LENGTH); // Otherwise this test is pointless
520562
let names = ["", "x", "öäå", "SPAACE", &long_name];
521563
for &n in names.iter() {
522564
assert_eq!(character(n), None);
@@ -614,6 +656,27 @@ mod tests {
614656
assert_eq!(super::character_by_alias(b"NOT AN ALIAS"), None);
615657
}
616658

659+
// Exercises UAX #44-LM2 loose matching through `character`: differences in
// case, ASCII whitespace, underscores and medial hyphens in the query are
// expected to be ignored, while a non-medial hyphen (as in "-a") must still
// be matched.
#[test]
660+
fn test_uax44() {
661+
assert_eq!(character(" L_O_W l_i_n_e"), Some('_'));
662+
assert_eq!(character("space \x09\x0a\x0c\x0d"), Some(' '));
663+
assert_eq!(character("FULL S-T-O-P"), Some('.'));
664+
assert_eq!(character("tibetan letter -a"), Some('\u{F60}'));
665+
assert_eq!(character("tibetan letter- a"), Some('\u{F60}'));
666+
assert_eq!(character("tibetan letter - a"), Some('\u{F60}'));
667+
assert_eq!(character("tibetan letter_-_a"), Some('\u{F60}'));
668+
669+
// Test exceptions related to U+1180
// Queries that contain a hyphen between "O" and "E" are expected to resolve
// to U+1180; queries without one resolve to U+116C.
670+
let jungseong_oe = Some('\u{116C}');
671+
let jungseong_o_e = Some('\u{1180}');
672+
assert_eq!(character("HANGUL JUNGSEONG OE"), jungseong_oe);
673+
assert_eq!(character("HANGUL JUNGSEONG O-E"), jungseong_o_e);
674+
assert_eq!(character("HANGUL JUNGSEONG O E"), jungseong_oe);
675+
assert_eq!(character("HANGUL JUNGSEONG O- E"), jungseong_o_e);
676+
assert_eq!(character("HANGUL JUNGSEONG O -E"), jungseong_o_e);
677+
assert_eq!(character("HANGUL JUNGSEONG O_-_E"), jungseong_o_e);
678+
}
679+
617680
#[bench]
618681
fn name_basic(b: &mut Bencher) {
619682
b.iter(|| {

0 commit comments

Comments
 (0)