diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs
index d2073f86c01..c1eff3a36e6 100644
--- a/library/core/src/unicode/unicode_data.rs
+++ b/library/core/src/unicode/unicode_data.rs
@@ -544,18 +544,26 @@ pub fn lookup(c: char) -> bool {
 
 #[rustfmt::skip]
 pub mod white_space {
-    static SHORT_OFFSET_RUNS: [u32; 4] = [
-        5760, 18882560, 23080960, 40972289,
-    ];
-    static OFFSETS: [u8; 21] = [
-        9, 5, 18, 1, 100, 1, 26, 1, 0, 1, 0, 11, 29, 2, 5, 1, 47, 1, 0, 1, 0,
+    static WHITESPACE_MAP: [u8; 256] = [
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0,
     ];
+    #[inline]
     pub fn lookup(c: char) -> bool {
-        super::skip_search(
-            c as u32,
-            &SHORT_OFFSET_RUNS,
-            &OFFSETS,
-        )
+        match c as u32 >> 8 {
+            0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0,
+            22 => c as u32 == 0x1680,
+            32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0,
+            48 => c as u32 == 0x3000,
+            _ => false,
+        }
     }
 }
 
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs
new file mode 100644
index 00000000000..02c7542309a
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/cascading_map.rs
@@ -0,0 +1,78 @@
+use crate::fmt_list;
+use crate::raw_emitter::RawEmitter;
+use std::collections::HashMap;
+use std::fmt::Write as _;
+use std::ops::Range;
+
+impl RawEmitter {
+    pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
+        let mut map: [u8; 256] = [
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        ];
+
+        let points = ranges
+            .iter()
+            .flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>())
+            .collect::<Vec<u32>>();
+
+        println!("there are {} points", points.len());
+
+        // how many distinct ranges need to be counted?
+        let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
+        for point in points {
+            // assert that there is no whitespace over the 0x3000 range.
+            assert!(point <= 0x3000, "the highest unicode whitespace value has changed");
+            let high_bytes = point as usize >> 8;
+            let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new);
+            codepoints.push(point);
+        }
+
+        let mut bit_for_high_byte = 1u8;
+        let mut arms = Vec::<String>::new();
+
+        let mut high_bytes: Vec<usize> =
+            codepoints_by_high_bytes.keys().map(|k| k.clone()).collect();
+        high_bytes.sort();
+        for high_byte in high_bytes {
+            let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
+            if codepoints.len() == 1 {
+                let ch = codepoints.pop().unwrap();
+                arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch));
+                continue;
+            }
+            // more than 1 codepoint in this arm
+            for codepoint in codepoints {
+                map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
+            }
+            arms.push(format!(
+                "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0",
+                high_byte, bit_for_high_byte
+            ));
+            bit_for_high_byte <<= 1;
+        }
+
+        writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
+            .unwrap();
+        self.bytes_used += 256;
+
+        writeln!(&mut self.file, "#[inline]").unwrap();
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    match c as u32 >> 8 {{").unwrap();
+        for arm in arms {
+            writeln!(&mut self.file, "        {},", arm).unwrap();
+        }
+        writeln!(&mut self.file, "        _ => false,").unwrap();
+        writeln!(&mut self.file, "    }}").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
+
+        true
+    }
+}
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
index 4720ee7020f..a3327a3c2ff 100644
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -75,12 +75,13 @@
 use std::ops::Range;
 use ucd_parse::Codepoints;
 
+mod cascading_map;
 mod case_mapping;
 mod raw_emitter;
 mod skiplist;
 mod unicode_download;
 
-use raw_emitter::{emit_codepoints, RawEmitter};
+use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter};
 
 static PROPERTIES: &[&str] = &[
     "Alphabetic",
@@ -241,8 +242,13 @@ fn main() {
     let mut modules = Vec::new();
     for (property, ranges) in ranges_by_property {
         let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
+
         let mut emitter = RawEmitter::new();
-        emit_codepoints(&mut emitter, &ranges);
+        if property == &"White_Space" {
+            emit_whitespace(&mut emitter, &ranges);
+        } else {
+            emit_codepoints(&mut emitter, &ranges);
+        }
         modules.push((property.to_lowercase().to_string(), emitter.file));
 
         println!(
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
index ab8eaee9541..5aca86ba089 100644
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -170,6 +170,15 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
     }
 }
 
+pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
+    emitter.blank_line();
+
+    let mut cascading = emitter.clone();
+    cascading.emit_cascading_map(&ranges);
+    *emitter = cascading;
+    emitter.desc = String::from("cascading");
+}
+
 struct Canonicalized {
     canonical_words: Vec<u64>,
     canonicalized_words: Vec<(u8, u8)>,