@@ -68,7 +68,7 @@ extern crate std;
68
68
69
69
use core:: { char, fmt} ;
70
70
use generated:: {
71
- MAX_NAME_LENGTH , PHRASEBOOK_OFFSETS1 , PHRASEBOOK_OFFSETS2 , PHRASEBOOK_OFFSET_SHIFT ,
71
+ MAX_NORMALISED_NAME_LENGTH , PHRASEBOOK_OFFSETS1 , PHRASEBOOK_OFFSETS2 , PHRASEBOOK_OFFSET_SHIFT ,
72
72
} ;
73
73
74
74
#[ allow( dead_code) ]
@@ -95,7 +95,9 @@ static ALIASES: phf::Map<&'static [u8], char> =
95
95
mod iter_str;
96
96
97
97
static HANGUL_SYLLABLE_PREFIX : & str = "HANGUL SYLLABLE " ;
98
+ static NORMALISED_HANGUL_SYLLABLE_PREFIX : & str = "HANGULSYLLABLE" ;
98
99
static CJK_UNIFIED_IDEOGRAPH_PREFIX : & str = "CJK UNIFIED IDEOGRAPH-" ;
100
+ static NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX : & str = "CJKUNIFIEDIDEOGRAPH" ;
99
101
100
102
fn is_cjk_unified_ideograph ( ch : char ) -> bool {
101
103
generated:: CJK_IDEOGRAPH_RANGES
@@ -336,17 +338,16 @@ fn character_by_alias(name: &[u8]) -> Option<char> {
336
338
/// assert_eq!(unicode_names2::character("nonsense"), None);
337
339
/// ```
338
340
pub fn character ( search_name : & str ) -> Option < char > {
341
+ let original_name = search_name;
339
342
// + 1 so that we properly handle the case when `name` has a
340
343
// prefix of the longest name, but isn't exactly equal.
341
- let mut buf = [ 0 ; MAX_NAME_LENGTH + 1 ] ;
342
- for ( place, byte) in buf. iter_mut ( ) . zip ( search_name. bytes ( ) ) {
343
- * place = byte. to_ascii_uppercase ( ) ;
344
- }
345
- let search_name = buf. get ( ..search_name. len ( ) ) ?;
344
+ let mut buf = [ 0 ; MAX_NORMALISED_NAME_LENGTH + 1 ] ;
345
+ let len = normalise ( search_name, & mut buf[ ..] ) ;
346
+ let search_name = & buf[ ..len] ;
346
347
347
348
// try `HANGUL SYLLABLE <choseong><jungseong><jongseong>`
348
- if search_name. starts_with ( HANGUL_SYLLABLE_PREFIX . as_bytes ( ) ) {
349
- let remaining = & search_name[ HANGUL_SYLLABLE_PREFIX . len ( ) ..] ;
349
+ if search_name. starts_with ( NORMALISED_HANGUL_SYLLABLE_PREFIX . as_bytes ( ) ) {
350
+ let remaining = & search_name[ NORMALISED_HANGUL_SYLLABLE_PREFIX . len ( ) ..] ;
350
351
let ( choseong, remaining) = jamo:: slice_shift_choseong ( remaining) ;
351
352
let ( jungseong, remaining) = jamo:: slice_shift_jungseong ( remaining) ;
352
353
let ( jongseong, remaining) = jamo:: slice_shift_jongseong ( remaining) ;
@@ -364,8 +365,8 @@ pub fn character(search_name: &str) -> Option<char> {
364
365
}
365
366
366
367
// try `CJK UNIFIED IDEOGRAPH-<digits>`
367
- if search_name. starts_with ( CJK_UNIFIED_IDEOGRAPH_PREFIX . as_bytes ( ) ) {
368
- let remaining = & search_name[ CJK_UNIFIED_IDEOGRAPH_PREFIX . len ( ) ..] ;
368
+ if search_name. starts_with ( NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX . as_bytes ( ) ) {
369
+ let remaining = & search_name[ NORMALISED_CJK_UNIFIED_IDEOGRAPH_PREFIX . len ( ) ..] ;
369
370
if remaining. len ( ) > 5 {
370
371
return None ;
371
372
} // avoid overflow
@@ -411,7 +412,7 @@ pub fn character(search_name: &str) -> Option<char> {
411
412
let maybe_name = match name ( codepoint) {
412
413
None => {
413
414
if true {
414
- debug_assert ! ( false )
415
+ debug_assert ! ( false ) // what?
415
416
}
416
417
return character_by_alias ( search_name) ;
417
418
}
@@ -420,19 +421,60 @@ pub fn character(search_name: &str) -> Option<char> {
420
421
421
422
// run through the parts of the name, matching them against the
422
423
// parts of the input.
423
- let mut passed_name = search_name;
424
+ let mut cmp_name = search_name;
424
425
for part in maybe_name {
425
- let part = part. as_bytes ( ) ;
426
- let part_l = part. len ( ) ;
427
- if passed_name. len ( ) < part_l || & passed_name[ ..part_l] != part {
426
+ let part = match part {
427
+ "" => "-" , // An empty word only appears before or after a non-medial hyphen
428
+ " " => continue ,
429
+ "-" if codepoint != '\u{1180}' => continue ,
430
+ part => part,
431
+ } ;
432
+
433
+ if let Some ( rest) = cmp_name. strip_prefix ( part. as_bytes ( ) ) {
434
+ cmp_name = rest;
435
+ } else {
428
436
return character_by_alias ( search_name) ;
429
437
}
430
- passed_name = & passed_name[ part_l..]
438
+ }
439
+
440
+ // HANGUL JUNGSEONG OE is ambiguous with HANGUL JUNGSEONG O-E
441
+ if codepoint == '\u{116C}' && {
442
+ let tmp = original_name. trim_ascii_end ( ) ;
443
+ tmp[ tmp. len ( ) - 3 ..] . eq_ignore_ascii_case ( "O-E" )
444
+ } {
445
+ return Some ( '\u{1180}' ) ;
431
446
}
432
447
433
448
Some ( codepoint)
434
449
}
435
450
451
/// Writes a loosely-normalised copy of `search_name` into `buf` and returns
/// the number of bytes written.
///
/// The normalisation follows the shape of Unicode loose name matching
/// (UAX #44, rule LM2):
///
/// * ASCII whitespace and underscores are dropped;
/// * a *medial* hyphen, i.e. one whose immediate neighbours in the original
///   input are both ASCII alphanumerics, is dropped;
/// * every other hyphen (leading, trailing, or next to whitespace or an
///   underscore) is kept;
/// * ASCII letters are upper-cased.
///
/// Returns `0`, which is the same as an empty normalised name, when the input
/// contains any byte that is not ASCII alphanumeric, ASCII whitespace, `_` or
/// `-`, or when the normalised form does not fit into `buf`. In that case the
/// contents of `buf` are unspecified.
fn normalise(search_name: &str, buf: &mut [u8]) -> usize {
    let bytes = search_name.as_bytes();

    // True when `idx` names an in-bounds ASCII alphanumeric byte of the input.
    // Taking an `Option` lets callers pass `checked_sub` / `checked_add`
    // results directly, so a hyphen at index 0 can never underflow `i - 1`.
    let is_alnum_at = |idx: Option<usize>| {
        idx.and_then(|idx| bytes.get(idx))
            .is_some_and(u8::is_ascii_alphanumeric)
    };

    let mut cursor = 0;
    for (i, &c) in bytes.iter().enumerate() {
        if c.is_ascii_whitespace() || c == b'_' {
            continue;
        }

        if c == b'-' {
            // Only medial hyphens are insignificant; all others are copied.
            if is_alnum_at(i.checked_sub(1)) && is_alnum_at(i.checked_add(1)) {
                continue;
            }
        } else if !c.is_ascii_alphanumeric() {
            // No character name contains anything else, so give up early.
            return 0;
        }

        // Refuse inputs whose normalised form is longer than the buffer.
        let Some(slot) = buf.get_mut(cursor) else {
            return 0;
        };
        *slot = c.to_ascii_uppercase();
        cursor += 1;
    }

    cursor
}
477
+
436
478
#[ cfg( test) ]
437
479
mod tests {
438
480
use super :: * ;
@@ -516,7 +558,7 @@ mod tests {
516
558
#[ test]
517
559
fn character_negative ( ) {
518
560
let long_name = "x" . repeat ( 100 ) ;
519
- assert ! ( long_name. len( ) > MAX_NAME_LENGTH ) ; // Otherwise this test is pointless
561
+ assert ! ( long_name. len( ) > generated :: MAX_NAME_LENGTH ) ; // Otherwise this test is pointless
520
562
let names = [ "" , "x" , "öäå" , "SPAACE" , & long_name] ;
521
563
for & n in names. iter ( ) {
522
564
assert_eq ! ( character( n) , None ) ;
@@ -614,6 +656,27 @@ mod tests {
614
656
assert_eq ! ( super :: character_by_alias( b"NOT AN ALIAS" ) , None ) ;
615
657
}
616
658
659
/// Checks that `character` tolerates case differences, whitespace,
/// underscores and medial hyphens in the looked-up name, and that the
/// U+116C / U+1180 pair is still told apart by its hyphen.
#[test]
fn test_uax44() {
    // Test exceptions related to U+1180
    let jungseong_oe = Some('\u{116C}');
    let jungseong_o_e = Some('\u{1180}');

    // (input name, expected lookup result), checked in order.
    let cases = [
        (" L_O_W l_i_n_e", Some('_')),
        ("space \x09 \x0a \x0c \x0d ", Some(' ')),
        ("FULL S-T-O-P", Some('.')),
        ("tibetan letter -a", Some('\u{F60}')),
        ("tibetan letter- a", Some('\u{F60}')),
        ("tibetan letter - a", Some('\u{F60}')),
        ("tibetan letter_-_a", Some('\u{F60}')),
        ("HANGUL JUNGSEONG OE", jungseong_oe),
        ("HANGUL JUNGSEONG O-E", jungseong_o_e),
        ("HANGUL JUNGSEONG O E", jungseong_oe),
        ("HANGUL JUNGSEONG O- E", jungseong_o_e),
        ("HANGUL JUNGSEONG O -E", jungseong_o_e),
        ("HANGUL JUNGSEONG O_-_E", jungseong_o_e),
    ];

    for (input, expected) in cases {
        assert_eq!(character(input), expected, "{input:?}");
    }
}
679
+
617
680
#[ bench]
618
681
fn name_basic ( b : & mut Bencher ) {
619
682
b. iter ( || {
0 commit comments