Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 45 additions & 5 deletions generator/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ fn create_lexicon_and_offsets(
// insert the suffixes of this word which saves about
// 10KB (we could theoretically insert all substrings,
// upto a certain length, but this only saves ~300
// bytes or so and is noticably slower).
// bytes or so and is noticeably slower).
for i in 1..n.len() {
if t.insert(n[i..].bytes(), Some(offset + i), true).0 {
// once we've found a string that's already
Expand Down Expand Up @@ -307,9 +307,9 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)
// currently huge, but it has a lot of 0's, so we compress it
// using the binning, below.
let mut phrasebook_offsets = repeat(0).take(0x10FFFF + 1).collect::<Vec<_>>();
let mut longest_name = 0;
let mut longest_name = String::new();
for &(cp, name) in codepoint_names.iter() {
longest_name = cmp::max(name.len(), longest_name);
longest_name = cmp::max_by_key(normalise_name(name, cp), longest_name, |s| s.len());

let start = phrasebook.len() as u32;
phrasebook_offsets[cp as usize] = start;
Expand Down Expand Up @@ -337,8 +337,8 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, &str)>)

w!(
ctxt,
"pub const MAX_NAME_LENGTH: usize = {};\n",
longest_name
"pub const LONGEST_NAME: &str = {longest_name:?};\n\
pub const LONGEST_NAME_LEN: usize = LONGEST_NAME.len();\n"
);
ctxt.write_plain_string("LEXICON", &lexicon_string);
ctxt.write_debugs("LEXICON_OFFSETS", "u32", &lexicon_offsets);
Expand Down Expand Up @@ -407,6 +407,11 @@ pub fn generate_phf(
) {
let (codepoint_names, _) = get_truncated_table_data(unicode_data, truncate);

let codepoint_names: Vec<_> = codepoint_names
.into_iter()
.map(|(c, s)| (c, normalise_name(s, c)))
.collect();

let mut ctxt = make_context(path);
let (n, disps, data) = phf::create_phf(&codepoint_names, lambda, tries);

Expand All @@ -420,6 +425,41 @@ pub fn generate_phf(
}
}

/// Convert a Unicode name to a form that can be used for loose matching, as per
/// [UAX#44](https://www.unicode.org/reports/tr44/tr44-34.html#Matching_Names)
///
/// This function matches `unicode_names2::normalise_name` in implementation, thus the result of one
/// can be used to query a PHF generated from the other.
///
/// The normalised form is produced by:
///
/// * upper-casing every ASCII letter,
/// * dropping ASCII whitespace and underscores,
/// * dropping *medial* hyphens (a `-` with an ASCII alphanumeric byte directly on both sides),
///   except for `codepoint == U+1180` (HANGUL JUNGSEONG O-E), whose hyphen is kept.
///
/// A hyphen at the very start or very end of `s` has no neighbour on one side, so it is never
/// medial and is always kept.
///
/// # Panics
///
/// Panics if `s` contains a byte that is neither whitespace, `_`, `-`, nor ASCII alphanumeric,
/// since such a byte cannot appear in a Unicode character name.
fn normalise_name(s: &str, codepoint: char) -> String {
    let bytes = s.as_bytes();
    // The output can never be longer than the input, so a single allocation suffices.
    let mut normalised = String::with_capacity(bytes.len());

    // `true` iff `idx` names an in-bounds byte of the *original* input that is ASCII
    // alphanumeric. Taking an `Option` lets callers pass `i.checked_sub(1)` directly, so a
    // hyphen at index 0 cannot underflow.
    let is_alnum_at = |idx: Option<usize>| {
        idx.and_then(|j| bytes.get(j))
            .map_or(false, u8::is_ascii_alphanumeric)
    };

    for (i, c) in bytes.iter().map(u8::to_ascii_uppercase).enumerate() {
        // "Ignore case, whitespace, underscore ('_'), [...]"
        if c.is_ascii_whitespace() || c == b'_' {
            continue;
        }

        // "[...] and all medial hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E."
        if codepoint != '\u{1180}' // HANGUL JUNGSEONG O-E
            && c == b'-'
            && is_alnum_at(i.checked_sub(1))
            && is_alnum_at(Some(i + 1))
        {
            continue;
        }

        assert!(
            c.is_ascii_alphanumeric() || c == b'-',
            "U+{:04X} contains an invalid character for a Unicode name: {:?}",
            codepoint as u32,
            s
        );

        normalised.push(c as char);
    }

    normalised
}

pub fn generate(unicode_data: &'static str, path: Option<&Path>, truncate: Option<usize>) {
let (codepoint_names, cjk) = get_truncated_table_data(unicode_data, truncate);
let mut ctxt = make_context(path);
Expand Down
6 changes: 3 additions & 3 deletions generator/src/phf.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Computes a perfect hash table using [the CHD
//! algorithm](http://cmph.sourceforge.net/papers/esa09.pdf).
//!
//! Strongly inspired by <https://github.com/sfackler/rust-phf>

Check warning on line 4 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test generator (nightly)

this URL is not a hyperlink

use rand::prelude::{Rng, SeedableRng, SliceRandom, StdRng};
use std::iter::repeat;
Expand Down Expand Up @@ -40,14 +40,14 @@

#[allow(clippy::type_complexity)]
fn try_phf_table(
values: &[(char, &str)],
values: &[(char, String)],
lambda: usize,
seed: u64,
rng: &mut StdRng,
) -> Option<(Vec<(u32, u32)>, Vec<char>)> {
let hashes: Vec<_> = values
.iter()
.map(|&(n, s)| (split(hash(s, seed)), n))
.map(|(n, s)| (split(hash(s, seed)), *n))
.collect();

let table_len = hashes.len();
Expand Down Expand Up @@ -140,17 +140,17 @@
}

pub fn create_phf(
data: &[(char, &str)],
data: &[(char, String)],
lambda: usize,
max_tries: usize,
) -> (u64, Vec<(u32, u32)>, Vec<char>) {
let mut rng = StdRng::seed_from_u64(0xf0f0f0f0);
#[cfg(feature = "timing")]
let start = time::Instant::now();

Check warning on line 149 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead

Check warning on line 149 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead

Check warning on line 149 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead

Check warning on line 149 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead

for i in 0..(max_tries) {
#[cfg(feature = "timing")]
let my_start = time::Instant::now();

Check warning on line 153 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead

Check warning on line 153 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead

Check warning on line 153 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead

Check warning on line 153 in generator/src/phf.rs

View workflow job for this annotation

GitHub Actions / Test on nightly (nightly, --all-features)

use of deprecated struct `time::Instant`: import `std::time::Instant` and `time::ext::InstantExt` instead
#[cfg(feature = "timing")]
println!("PHF #{}: starting {:.2}", i, my_start - start);
#[cfg(not(feature = "timing"))]
Expand Down
Loading
Loading