From 9c1ceece20e14e2a746c382b639f9288746e493c Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Wed, 25 Mar 2020 21:00:01 -0400 Subject: [PATCH] Add skip list based implementation for smaller encoding This arranges for the sparser sets (everything except lower and uppercase) to be encoded in a significantly smaller context. However, it is also a performance trade-off (roughly 3x slower than the bitset encoding). The 40% size reduction is deemed to be sufficiently important to merit this performance loss, particularly as it is unlikely that this code is hot anywhere (and if it is, paying the memory cost for a bitset that directly represents the data seems worthwhile). Alphabetic : 1599 bytes (- 937 bytes) Case_Ignorable : 949 bytes (- 822 bytes) Cased : 359 bytes (- 429 bytes) Cc : 9 bytes (- 15 bytes) Grapheme_Extend: 813 bytes (- 675 bytes) Lowercase : 863 bytes N : 419 bytes (- 619 bytes) Uppercase : 776 bytes White_Space : 37 bytes (- 46 bytes) Total table sizes: 5824 bytes (-3543 bytes) --- src/libcore/unicode/unicode_data.rs | 1021 ++++------------- src/tools/unicode-table-generator/src/main.rs | 42 +- .../src/range_search.rs | 51 +- .../src/raw_emitter.rs | 73 +- .../unicode-table-generator/src/skiplist.rs | 98 ++ 5 files changed, 466 insertions(+), 819 deletions(-) create mode 100644 src/tools/unicode-table-generator/src/skiplist.rs diff --git a/src/libcore/unicode/unicode_data.rs b/src/libcore/unicode/unicode_data.rs index 660b91b6025..72ea8ce0381 100644 --- a/src/libcore/unicode/unicode_data.rs +++ b/src/libcore/unicode/unicode_data.rs @@ -1,7 +1,7 @@ ///! This file is generated by src/tools/unicode-table-generator; do not edit manually! 
#[inline(always)] -fn range_search< +fn bitset_search< const N: usize, const CHUNK_SIZE: usize, const N1: usize, @@ -50,720 +50,267 @@ fn range_search< (word & (1 << (needle % 64) as u64)) != 0 } +fn decode_prefix_sum(short_offset_run_header: u32) -> u32 { + short_offset_run_header & ((1 << 21) - 1) +} + +fn decode_length(short_offset_run_header: u32) -> usize { + (short_offset_run_header >> 21) as usize +} + +#[inline(always)] +fn skip_search( + needle: u32, + short_offset_runs: &[u32; SOR], + offsets: &[u8; OFFSETS], +) -> bool { + // Note that this *cannot* be past the end of the array, as the last + // element is greater than std::char::MAX (the largest possible needle). + // + // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct + // location cannot be past it, so Err(idx) != length either. + // + // This means that we can avoid bounds checking for the accesses below, too. + let last_idx = + match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) { + Ok(idx) => idx + 1, + Err(idx) => idx, + }; + + let mut offset_idx = decode_length(short_offset_runs[last_idx]); + let length = if let Some(next) = short_offset_runs.get(last_idx + 1) { + decode_length(*next) - offset_idx + } else { + offsets.len() - offset_idx + }; + let prev = + last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0); + + let total = needle - prev; + let mut prefix_sum = 0; + for _ in 0..(length - 1) { + let offset = offsets[offset_idx]; + prefix_sum += offset as u32; + if prefix_sum > total { + break; + } + offset_idx += 1; + } + offset_idx % 2 == 1 +} + pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0); #[rustfmt::skip] pub mod alphabetic { - const BITSET_LAST_CHUNK_MAP: u16 = 393; - static BITSET_CHUNKS_MAP: [u8; 394] = [ - 11, 35, 32, 14, 25, 18, 17, 74, 16, 29, 12, 61, 15, 73, 66, 36, 9, 0, 6, 0, 0, 0, 70, 64, - 22, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 
39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 42, 39, 39, 53, 26, 28, 65, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 60, 48, 8, 19, 5, 34, 47, 20, 24, 57, 7, 55, 21, 31, 69, 67, 71, 13, 3, - 39, 43, 58, 0, 0, 0, 0, 0, 39, 39, 63, 0, 0, 0, 0, 0, 0, 0, 39, 56, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 39, 68, 0, 10, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 41, 39, - 39, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 52, 0, 0, 0, 0, 30, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 59, 54, 0, 0, 0, 0, 27, 4, 0, 0, 49, 0, 0, 23, 2, 0, 0, 0, 0, 0, 0, - 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 51, 39, 39, 39, 39, 39, 39, 39, - 46, 72, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 33, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, - 39, 39, 39, 40, 0, 0, 0, 0, 0, 0, 39, 62, 0, 0, 39, 39, 39, 39, 39, 39, 39, 39, 39, 44, + static SHORT_OFFSET_RUNS: [u32; 52] = [ + 706, 33559113, 868226669, 947920662, 1157637302, 1306536960, 1310732293, 1398813696, + 1449151936, 1451270141, 1455465613, 1459660301, 1468061604, 1648425216, 1658911342, + 1661009214, 1707147904, 1793132343, 1853951616, 1994464256, 2330009312, 2418090906, + 2428579840, 2439066671, 2441167872, 2443265607, 2445371392, 2447469113, 2449567296, + 2476836856, 2508295382, 2512498688, 2518790431, 2520888060, 2533473280, 2535576576, + 2556548774, 2634145792, 2682380992, 2715936768, 2720132608, 2736910640, 2875326464, + 2887952094, 2890053429, 2894253730, 2902649825, 2906847232, 2908944926, 2911043584, + 2913145675, 2916356939, ]; - static 
BITSET_INDEX_CHUNKS: [[u8; 8]; 75] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 172, 172, 172, 172], [0, 0, 0, 0, 243, 10, 180, 0], - [0, 0, 0, 124, 0, 0, 203, 0], [0, 0, 0, 199, 0, 0, 0, 0], - [0, 0, 24, 185, 242, 112, 231, 168], [0, 0, 55, 197, 0, 0, 0, 0], - [0, 0, 141, 0, 46, 177, 243, 123], [0, 54, 172, 214, 113, 34, 216, 163], - [0, 83, 241, 0, 62, 29, 179, 0], [0, 172, 0, 0, 172, 4, 159, 142], - [0, 249, 116, 3, 172, 172, 172, 172], [1, 172, 172, 172, 172, 172, 172, 172], - [14, 51, 125, 0, 79, 35, 166, 0], [26, 37, 172, 80, 6, 5, 204, 115], - [30, 211, 40, 208, 120, 132, 239, 180], [59, 6, 23, 60, 172, 172, 172, 172], - [67, 157, 68, 139, 66, 58, 99, 136], [75, 128, 69, 106, 71, 143, 74, 167], - [78, 254, 172, 212, 0, 207, 0, 0], [82, 122, 192, 130, 117, 0, 7, 0], - [94, 0, 44, 196, 70, 156, 0, 0], [105, 1, 31, 218, 48, 172, 28, 243], - [111, 93, 109, 0, 0, 0, 0, 0], [127, 102, 190, 154, 208, 137, 186, 0], - [147, 149, 53, 43, 217, 50, 72, 107], [148, 13, 172, 202, 32, 172, 233, 52], - [150, 0, 0, 0, 97, 183, 0, 0], [152, 206, 172, 64, 41, 101, 221, 89], - [172, 22, 88, 19, 20, 189, 244, 248], [172, 103, 161, 0, 0, 0, 0, 0], - [172, 158, 172, 171, 0, 0, 87, 245], [172, 172, 8, 172, 222, 27, 76, 138], - [172, 172, 11, 172, 172, 172, 172, 172], [172, 172, 12, 108, 247, 194, 0, 0], - [172, 172, 172, 145, 0, 77, 33, 219], [172, 172, 172, 172, 9, 96, 91, 104], - [172, 172, 172, 172, 172, 172, 47, 238], [172, 172, 172, 172, 172, 172, 172, 0], - [172, 172, 172, 172, 172, 172, 172, 172], [172, 172, 172, 172, 172, 172, 172, 193], - [172, 172, 172, 172, 172, 172, 172, 210], [172, 172, 172, 172, 172, 172, 172, 214], - [172, 172, 172, 172, 172, 172, 188, 0], [172, 172, 172, 172, 172, 181, 0, 0], - [172, 172, 172, 172, 192, 45, 172, 172], [172, 172, 172, 172, 207, 172, 172, 172], - [172, 172, 172, 172, 209, 153, 0, 0], [172, 172, 172, 172, 215, 6, 232, 110], - [172, 172, 172, 176, 172, 170, 0, 0], [172, 172, 172, 187, 179, 0, 0, 0], - [172, 172, 172, 191, 172, 172, 172, 
172], [172, 172, 172, 213, 0, 0, 0, 0], - [172, 172, 182, 251, 172, 172, 172, 172], [172, 172, 230, 61, 235, 236, 237, 234], - [172, 177, 118, 151, 205, 126, 172, 164], [172, 178, 0, 0, 0, 0, 0, 0], - [172, 179, 205, 205, 195, 0, 0, 0], [172, 200, 172, 172, 172, 175, 0, 0], - [172, 225, 63, 226, 90, 17, 172, 172], [172, 228, 172, 188, 92, 16, 204, 18], - [172, 229, 25, 119, 133, 134, 2, 165], [191, 0, 0, 0, 0, 0, 0, 0], - [200, 0, 0, 0, 0, 0, 0, 0], [201, 0, 0, 0, 0, 0, 0, 0], - [209, 56, 216, 129, 38, 42, 172, 198], [209, 95, 65, 114, 172, 172, 172, 250], - [211, 0, 30, 85, 81, 174, 36, 155], [211, 192, 0, 146, 202, 73, 184, 0], - [216, 252, 121, 0, 15, 0, 0, 0], [223, 224, 172, 135, 39, 144, 86, 21], - [227, 6, 162, 211, 0, 0, 0, 0], [231, 172, 172, 172, 172, 172, 172, 172], - [240, 131, 84, 173, 220, 253, 57, 140], [246, 169, 98, 160, 173, 49, 100, 0], + static OFFSETS: [u8; 1391] = [ + 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 0, 4, 12, 14, 5, 7, 1, 1, 1, 86, 1, 42, + 5, 1, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 2, 1, 6, 41, 39, + 14, 1, 1, 1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, 10, + 3, 2, 1, 16, 48, 13, 101, 24, 33, 9, 2, 4, 1, 5, 24, 2, 19, 19, 25, 7, 11, 53, 21, 1, 18, + 12, 12, 3, 7, 6, 76, 1, 16, 1, 3, 4, 15, 13, 19, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, 3, 4, 3, 8, + 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, 2, + 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, 1, 5, + 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, + 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, 2, 3, 3, 3, 12, + 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 4, 1, 8, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, 2, 1, 3, + 5, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 7, 1, 1, 4, 13, 2, 13, + 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 
6, 1, 3, 1, 18, 3, 24, 1, 9, 1, 1, + 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, 24, 1, 1, 1, 19, 1, + 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 17, 6, 16, 1, 36, 67, 55, 1, 1, 2, 5, + 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, 41, 1, 4, 2, 33, + 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, 6, 3, 0, 2, 17, 1, 26, + 5, 75, 3, 11, 7, 13, 1, 6, 12, 20, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, 19, 14, 1, 4, 1, + 67, 89, 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, 54, 28, 4, 63, 2, + 20, 50, 1, 23, 2, 63, 52, 1, 15, 1, 7, 52, 42, 2, 4, 10, 44, 1, 11, 14, 55, 22, 3, 10, 36, + 2, 9, 7, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 39, 14, 11, 0, 2, 6, 2, 38, 2, 6, 2, 8, + 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, + 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 11, 2, 4, 5, + 5, 4, 1, 17, 41, 0, 52, 0, 47, 1, 47, 1, 133, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 2, 56, 7, 1, + 16, 23, 9, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, 25, 9, 7, 5, 2, + 5, 4, 86, 6, 3, 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 3, 0, 67, 46, 2, 0, + 3, 16, 10, 2, 20, 47, 5, 8, 3, 113, 39, 9, 2, 103, 2, 53, 2, 9, 42, 17, 1, 33, 24, 52, 12, + 68, 1, 1, 44, 6, 3, 1, 1, 3, 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 5, 1, + 55, 9, 14, 18, 23, 3, 69, 1, 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, 1, + 43, 1, 14, 6, 123, 21, 0, 12, 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, 1, + 1, 1, 2, 1, 2, 1, 108, 33, 0, 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, 89, + 3, 6, 2, 6, 2, 6, 2, 3, 35, 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, 29, 3, + 49, 47, 32, 13, 30, 5, 43, 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, 8, 52, + 156, 0, 9, 22, 10, 8, 152, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, 
+ 10, 22, 10, 26, 70, 56, 6, 2, 64, 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, + 27, 54, 10, 22, 10, 19, 13, 18, 110, 73, 55, 51, 13, 51, 13, 40, 0, 42, 1, 2, 3, 2, 78, 29, + 10, 1, 8, 22, 106, 21, 27, 23, 9, 70, 60, 55, 23, 25, 23, 51, 17, 4, 8, 35, 3, 1, 9, 64, 1, + 4, 9, 2, 10, 1, 1, 1, 35, 18, 1, 34, 2, 1, 6, 1, 65, 7, 1, 1, 1, 4, 1, 15, 1, 10, 7, 57, 23, + 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, 2, 2, 3, 1, 6, 1, 5, 7, 156, 66, 1, 3, + 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, 25, 6, 34, 63, 1, 1, 3, 1, 59, 54, 2, 1, 71, + 27, 2, 14, 213, 57, 103, 64, 31, 8, 2, 1, 2, 8, 1, 2, 1, 30, 1, 2, 2, 2, 2, 4, 93, 8, 2, 46, + 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, 72, 5, 1, 34, 57, 0, 9, 1, 45, 1, 7, 1, 1, 49, 30, 2, + 22, 1, 14, 73, 7, 1, 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, 1, 2, 2, 24, 6, 1, 2, 1, 37, 1, 2, 1, 4, + 1, 1, 0, 23, 185, 1, 79, 0, 102, 111, 17, 196, 0, 0, 0, 0, 0, 0, 7, 31, 113, 30, 18, 48, 16, + 4, 31, 21, 5, 19, 0, 64, 128, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 2, 14, 0, 8, 0, 42, 9, 0, + 0, 49, 3, 17, 4, 8, 0, 0, 107, 5, 13, 3, 9, 7, 10, 4, 1, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, + 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, + 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 7, 1, 17, 2, 7, 1, + 2, 1, 5, 213, 45, 10, 7, 16, 1, 0, 44, 0, 197, 59, 68, 3, 1, 3, 1, 0, 4, 1, 27, 1, 2, 1, 1, + 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, 1, 10, 1, 17, 5, 3, 1, 5, 1, 17, 0, + 26, 6, 26, 6, 26, 0, 0, 34, 0, 11, 222, 2, 0, 14, 0, 0, 0, 0, 0, 0, ]; - static BITSET_CANONICAL: [u64; 172] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111111110, - 0b1111111111001111111111111111111111111111111111111111111111111111, - 0b1111111101111111111111111111111111111111011111111111111111111111, - 
0b1111111111111111111111111111111111111111111111111000011111111111, - 0b1111111111111111111111111111111111111111111111111110000000000000, - 0b1111111111111111111111111111111111111111111111110000000000000000, - 0b1100000011111111111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111110000000011, - 0b1111111111111111111111111111111100111111001111111111111111111111, - 0b1111111111111111000000111111111111111111111111110000001111111111, - 0b1111111111111111000000000000001111111111111111111111111111111111, - 0b1111111111111111000000000000000000111111111111111111111111111111, - 0b1000111111110000011111111111111111111111111111111111111111111111, - 0b0111111101111111111111111111111111111111111111111111110111111111, - 0b0000000000000000000001111111111111100111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111011011, - 0b1111111111111111111111111111111111111111111111011111110001011111, - 0b1111111111111111111111111111111111111111111110000000000000000000, - 0b1111111111111111111111111111111111111111011111111111111100111101, - 0b1111111111111111111111111111111111111111001111011111111111111111, - 0b1111111111111111111111111111111101111111011111110111111101111111, - 0b1111111111111111111111111111111100111101011111110011110111111111, - 0b1111111111111111111111111111111100111100000000001111111111111111, - 0b1111111111111111111111111111111100011111111111111111111111111111, - 0b1111111111111111111111111111111100000111111111111111111111111110, - 0b1111111111111111111111111111111100000111111111110000000000000000, - 0b1111111111111111111111111111111100000010011111111111111111111111, - 0b1111111111111111111111111111111100000000000000000111111111111111, - 0b1111111111111111111111111111111100000000000000000100001111100000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111111011100000011111111111111111111111, - 
0b1111111111111111111111111111110011111111100000000000000000000000, - 0b1111111111111111111111111111101111111111111111111101011101000000, - 0b1111111111111111111111111100000000000111111111111111111111111110, - 0b1111111111111111111111011011111100000000000000000000000011001011, - 0b1111111111111111111111001111111100000000000000000000000000000000, - 0b1111111111111111110000000000000011111110111111111111111111111111, - 0b1111111111111111011111110111111100000000011111100111111001111110, - 0b1111111111111111001000001011111111111111111111111111111111111111, - 0b1111111111111111000001111111111111111111111111111111111111111111, - 0b1111111111111111000001111111111111111111111111111111110000000000, - 0b1111111111111111000000111111111111110111111111111111111111111111, - 0b1111111111111111000000111111100011111111111100000000000011111111, - 0b1111111111111111000000011111111110111111111111111011110101111111, - 0b1111111111111111000000001111000000000000000001110000000000000000, - 0b1111111111111111000000001000000000011111111111111111111111111111, - 0b1111111111111111000000000000111111111111111111111111111111111111, - 0b1111111111111110111111111111111111111111111111111111111111100000, - 0b1111111111111110000111111111111111111111111111111111111011111111, - 0b1111111111111110000000000000111111111111111000011101111111111111, - 0b1111111111111100000000000000000000000000000000000000000000000001, - 0b1111111111100000000000000000000000000000000000000000011111111100, - 0b1111111111011111111111111111111100000000000000000000000000000000, - 0b1111111111011111000000000000000000000000000000000000000000000000, - 0b1111111111000000000000000000000000000000000000000000000000000000, - 0b1111110001111111111111111111111100000000000000000011111111111111, - 0b1111110000000000111100111111111111111111111111111111111111111111, - 0b1111110000000000000000000000111110000000111100000101110111011111, - 0b1111100101111111111111111111111111111111111111111111111111111111, - 
0b1111011111111111111111111111111111111111111111110010000010111111, - 0b1111011111111111111111111111111111110111111111111111111111111101, - 0b1111001111111111101111010101000000111110001011111111110010000100, - 0b1110101111111111110111100110010011011111111111111111111111111111, - 0b1110100011111100000000000000000000000000000000000000000000101111, - 0b1110011111111111111111111111111111111111111111110000000111111111, - 0b1110011111111111111111111111111111111111111111011101111111111111, - 0b1110001111111111111111011111111111111111111111011101111111101111, - 0b1110001111101111111111011111111111111111111111011101111111101111, - 0b1110001111101101111111011111111111111111111110111011111111101110, - 0b1110001111101101111111011111111111111111111110011001111111101111, - 0b1110001111101101111111011111111111111111111110011001111111101110, - 0b1110001111000101111111011111111111111111111110011001111111101111, - 0b1110000011111111111111111111100000000000000000000000000000001111, - 0b1100001111111111110001110001100011010110001111011100011111101100, - 0b1100001101101101111111011111111111111111111110011000011111101110, - 0b1011111111111111000000000000000000000000000000000000000111111111, - 0b1011110011011111000000000000000000000000000000000000000000100000, - 0b1011011111111111111111110111111111111111111111111110111111111111, - 0b1011010001111111111111111111111111111111111111111111101101111111, - 0b1001110000000000111000011111111000011111111011111111111111111111, - 0b1001100110111111111111111111111111111111011011111111001001111111, - 0b1001000110111111111111111111111111111111111111111111110100111111, - 0b1000000000000010000000000000000000000000000000000000000000000000, - 0b1000000000000000000000001000000000000000000000000000000000000000, - 0b1000000000000000000000000000000011111111111111111111111111111111, - 0b0111111101111111011111110111111100000000011111111111111111111111, - 0b0111111100111111111111111111111111111111111111111111111111111111, - 
0b0111111100111101111111111111111111111111111111110011110111111111, - 0b0111110000000000111111111111111100000000000000001000000000000000, - 0b0111101111111111111111111111111111011111110111111110011110111111, - 0b0101111111011111111111111111111111111111111111111111111111111111, - 0b0101111101111111111111011111111111100000111110000000000001111111, - 0b0101111011110111111101111001011010101010100101101110101010000100, - 0b0100000010011111111111111111111111111111111110111111111111111111, - 0b0011111111111111111111111111111111111100000000001110000000000000, - 0b0011111111111111111111111111111110101010111111110011111100111111, - 0b0011111110000000000111111111111111111111111111111111111111111111, - 0b0011101111111111111111111010111111111111111111111111011111010110, - 0b0010111111111011111111111111111111111100011111111111111111101110, - 0b0001111111111111111111111111111111111110111111111111111100000011, - 0b0001111111111111111111111111111100000000000001111111111111111111, - 0b0001111111111111111111111111111100000000000000000000000000000000, - 0b0001111111111111000001111111111111111111111111111111111111111111, - 0b0001111111011100000111111111111100001111110011110001111111011100, - 0b0001111100111110000000111111111000000000000000000000000011100000, - 0b0001111000000000000000000000111100000000000000010001101110111111, - 0b0001000000000011000000000000111110110000100000000101100110011111, - 0b0000111111111111111111111111111111111111000011111111111111111111, - 0b0000111111111111111110111110111000001111111111111111101111111111, - 0b0000111111111111000000000000000000000000000000000000000011111111, - 0b0000101011110111111111101001011011111111111111111111111111101111, - 0b0000011111111111111111111111111111111111111111110000011111111111, - 0b0000011111111111111111111111111000000000000000000000000000000000, - 0b0000010001101111110111100000000000000000000000000000000000000000, - 0b0000010000110000000001111111111111111111111111111111110000000000, - 
0b0000010000100000000001000000000000000000000000000000000000000000, - 0b0000001111111111111111111111111100000000001111111111111111111111, - 0b0000000111111111111111111111111111111111111111111111111111111100, - 0b0000000111111111110001111111111111111111111111111111111111111111, - 0b0000000111111111000011111111111101111111111111111111111111111111, - 0b0000000100111111111111111111111111111111111111111111111111111111, - 0b0000000001111111111111111111111100000000001111111111111111111111, - 0b0000000001111111111111111111111100000000000000000000000000011111, - 0b0000000001111111111111111111111100000000000000000000000000000000, - 0b0000000001111111111111101111111111111111111111001111111111111111, - 0b0000000001000111111111111111111111111111111111110000000011110000, - 0b0000000000111111111111111111111111111110111011111111000001101111, - 0b0000000000111111000000000000000001011110000000100001100110000111, - 0b0000000000111100111111111111111100111000000000000000000000000101, - 0b0000000000110111111111111111111100000000000000000000000000000000, - 0b0000000000011111111111111111111001111111111111111111111111111111, - 0b0000000000011111001111111111111111111111111111110000000000000000, - 0b0000000000001111111111111111111100000000000011111101111111111111, - 0b0000000000001101110111111111111100000000000011111111111111111111, - 0b0000000000001100011110000001111111111111111111111111111111111111, - 0b0000000000001100000000000000000011111111010111111000000001111111, - 0b0000000000000111111111111111111100000000001111111111111111111111, - 0b0000000000000111100001111111111111111111111111110000000010110110, - 0b0000000000000110000000000000111101000000011000000001110111011111, - 0b0000000000000011111111111011111111111111111111111111111111111111, - 0b0000000000000011000110111111111111111111111111111111111111111111, - 0b0000000000000011000000000000101100000000000000000000000000000000, - 0b0000000000000010000000000000111110110000110000000001100110011111, - 
0b0000000000000000100000001111111111111111111111111111111111111111, - 0b0000000000000000010100000001111100000000000000111111111111000011, - 0b0000000000000000001111111111111111111111111111110000000000000000, - 0b0000000000000000000111111111111111111100111111111111111111111111, - 0b0000000000000000000011000000000011111111111111110001111111111111, - 0b0000000000000000000001111111111100000001111111111111111111111111, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000000011111111111111111111111110000000000000000, - 0b0000000000000000000000001111111111111111111111111111111110111111, - 0b0000000000000000000000001111111100000000001111111111111111111111, - 0b0000000000000000000000000001111111111111111111111111111011111111, - 0b0000000000000000000000000001101011111100111111111111111111111111, - 0b0000000000000000000000000000111111100000100000010001100110011111, - 0b0000000000000000000000000000111100000111011000000001110111011111, - 0b0000000000000000000000000000001110000000000000000000011110111011, - 0b0000000000000000000000000000000011111111111111111000000011111111, - 0b0000000000000000000000000000000011110000000000000010000001011111, - 0b0000000000000000000000000000000001000011111111110000000111111111, - 0b0000000000000000000000000000000000100000111111111111111111111111, - 0b0000000000000000000000000000000000011100111111001111110011111100, - 0b0000000000000000000000000000000000010100000000001100000000011110, - 0b0000000000000000000000000000000000010000100000000000000111111111, - 0b0000000000000000000000000000000000000001011110110111111111111111, - 0b0000000000000000000000000000000000000000100000010001110111000111, - 0b0000000000000000000000000000000000000000001111101111111100001111, - 0b0000000000000000000000000000000000000000000000000010000001111111, - 0b0000000000000000000000000000000000000000000000000000100010001111, - 0b0000000000000000000000000000000000000000000000000000000010110011, - ]; - static BITSET_MAPPING: [(u8, u8); 
83] = [ - (0, 64), (1, 64), (1, 189), (1, 188), (1, 187), (1, 186), (1, 185), (1, 183), (1, 182), - (1, 181), (1, 179), (1, 78), (1, 176), (1, 175), (1, 174), (1, 170), (1, 166), (1, 165), - (1, 163), (1, 162), (1, 161), (1, 159), (1, 156), (1, 152), (1, 151), (1, 150), (1, 149), - (1, 148), (1, 145), (1, 111), (1, 144), (1, 112), (1, 142), (1, 141), (1, 140), (1, 139), - (1, 138), (1, 137), (1, 136), (1, 135), (1, 133), (1, 132), (1, 131), (1, 130), (1, 63), - (1, 60), (1, 59), (1, 54), (1, 52), (1, 51), (1, 48), (1, 47), (1, 31), (1, 21), (1, 4), - (2, 129), (2, 58), (2, 57), (2, 50), (2, 42), (2, 28), (2, 21), (3, 180), (3, 30), (3, 24), - (3, 18), (4, 132), (4, 33), (4, 17), (5, 80), (5, 32), (6, 112), (6, 16), (7, 96), (7, 3), - (8, 38), (9, 32), (10, 17), (11, 69), (12, 32), (13, 187), (14, 179), (15, 141), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod case_ignorable { - const BITSET_LAST_CHUNK_MAP: u16 = 1792; - static BITSET_CHUNKS_MAP: [u8; 251] = [ - 36, 19, 18, 44, 41, 33, 22, 35, 31, 6, 0, 7, 49, 45, 37, 3, 40, 0, 0, 0, 0, 0, 20, 48, 34, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 0, 10, 46, 42, - 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 39, 2, 11, 0, 0, 0, 29, 9, 17, 26, 32, 24, 28, 51, 30, - 27, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 4, 21, 0, 0, 0, 23, 0, 0, 43, 13, 0, 0, 15, 0, 0, 
0, 0, 1, 25, + static SHORT_OFFSET_RUNS: [u32; 32] = [ + 688, 44045149, 555751186, 559947709, 794831996, 866136069, 891330581, 916497656, 920692236, + 924908318, 1122041344, 1130430973, 1193347585, 1205931300, 1231097515, 1235294255, + 1445009723, 1453399088, 1512120051, 1575040048, 1579248368, 1583443791, 1596046493, + 1612829031, 1621219840, 1642192896, 1667359024, 1688330988, 1692526800, 1696723963, + 1705902081, 1711210992, ]; - static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 8], [0, 0, 0, 0, 0, 0, 0, 141], - [0, 0, 0, 0, 0, 0, 14, 42], [0, 0, 0, 0, 0, 27, 92, 0], [0, 0, 0, 0, 0, 133, 108, 101], - [0, 0, 0, 0, 0, 152, 0, 0], [0, 0, 0, 0, 79, 82, 47, 111], [0, 0, 0, 0, 135, 0, 5, 126], - [0, 0, 0, 0, 156, 0, 0, 0], [0, 0, 0, 17, 0, 0, 0, 0], [0, 0, 0, 136, 0, 168, 0, 0], - [0, 0, 0, 147, 0, 0, 0, 0], [0, 0, 0, 157, 0, 0, 0, 0], [0, 0, 0, 167, 9, 129, 0, 0], - [0, 0, 0, 170, 0, 161, 0, 0], [0, 0, 102, 0, 0, 0, 0, 0], [0, 0, 145, 0, 0, 171, 0, 0], - [0, 0, 169, 0, 0, 109, 12, 80], [0, 0, 174, 123, 123, 64, 176, 0], - [0, 49, 0, 153, 0, 16, 0, 23], [0, 149, 0, 0, 0, 0, 0, 0], - [2, 103, 15, 105, 54, 106, 125, 119], [4, 75, 88, 0, 0, 0, 0, 0], - [6, 110, 37, 181, 0, 0, 13, 182], [22, 123, 0, 0, 123, 123, 123, 11], - [28, 163, 50, 132, 76, 139, 7, 120], [32, 0, 18, 0, 33, 175, 118, 0], - [34, 124, 71, 0, 96, 0, 0, 0], [36, 0, 0, 144, 0, 0, 0, 0], [40, 115, 117, 0, 0, 0, 0, 0], - [41, 78, 112, 140, 0, 0, 0, 0], [44, 0, 0, 98, 54, 77, 0, 0], - [58, 74, 58, 29, 15, 104, 127, 122], [62, 0, 180, 3, 0, 0, 0, 0], - [63, 164, 53, 121, 67, 160, 52, 130], [65, 177, 68, 0, 0, 0, 0, 0], - [70, 17, 0, 66, 24, 69, 21, 1], [72, 57, 30, 73, 0, 97, 0, 94], - [87, 178, 0, 142, 46, 179, 143, 61], [89, 39, 113, 85, 0, 0, 0, 0], - [90, 151, 0, 20, 56, 84, 59, 45], [95, 0, 0, 38, 162, 172, 48, 100], - [99, 0, 0, 0, 159, 0, 0, 0], [114, 86, 0, 91, 26, 158, 10, 51], - [116, 35, 25, 124, 55, 81, 93, 83], [131, 31, 155, 146, 173, 138, 150, 148], 
- [134, 0, 0, 0, 0, 0, 0, 0], [137, 0, 0, 0, 0, 0, 0, 0], [154, 128, 19, 0, 60, 0, 0, 0], - [165, 0, 0, 0, 0, 0, 0, 0], [166, 0, 0, 0, 43, 128, 0, 107], + static OFFSETS: [u8; 821] = [ + 39, 1, 6, 1, 11, 1, 35, 1, 1, 1, 71, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, + 1, 1, 251, 7, 207, 1, 5, 1, 49, 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, + 1, 10, 21, 16, 1, 101, 8, 1, 10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, + 24, 43, 3, 119, 48, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1, 58, 1, 4, 4, 8, 1, + 20, 2, 26, 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, + 1, 20, 2, 22, 6, 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, 61, 1, 12, 1, 50, 1, 3, + 1, 57, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, 5, 2, 20, 2, 28, 2, 57, 2, 4, + 4, 8, 1, 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, 98, 1, 2, 9, 9, 1, 1, 6, 74, + 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, 102, 4, 1, 6, 1, 2, 2, 2, 25, + 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, 29, 3, 29, 2, 30, 2, 64, 2, 1, + 7, 8, 1, 2, 11, 3, 1, 5, 1, 45, 4, 52, 1, 65, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, 219, 2, 2, + 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 17, 63, 4, 48, 1, 1, 5, 1, 1, 5, 1, + 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, 64, 6, 82, 3, 1, 13, + 1, 7, 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 95, 1, 5, 0, 1, 1, 3, 11, 3, 13, 3, 13, 3, 13, 2, + 12, 5, 8, 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1, 16, 13, 51, 33, 0, 2, 113, 3, + 125, 1, 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, 93, 3, 0, 1, 0, 6, 0, 1, 98, + 4, 1, 10, 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 109, 2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, + 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, 2, 2, 17, 1, 21, 2, 66, 6, 2, 2, + 2, 2, 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, 1, 27, 1, 14, 2, 5, 2, 1, 1, + 100, 5, 9, 3, 121, 1, 2, 1, 4, 1, 
0, 1, 147, 16, 0, 16, 3, 1, 12, 16, 34, 1, 2, 1, 169, 1, + 7, 1, 6, 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, 0, 1, 226, 1, 149, 5, 0, 3, 1, + 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 0, 2, 153, 11, 176, 1, 54, 15, 56, 3, 49, 4, 2, 2, 2, 1, + 15, 1, 50, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 160, + 1, 3, 8, 21, 2, 57, 2, 3, 1, 37, 7, 3, 5, 195, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, 2, 1, + 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, 3, 2, 4, 1, + 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, + 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, + 72, 2, 3, 1, 1, 1, 0, 2, 0, 9, 0, 5, 59, 7, 9, 4, 0, 1, 63, 17, 64, 2, 1, 2, 0, 2, 1, 4, 0, + 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, + 1, 2, 1, 5, 0, 14, 0, 4, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0, ]; - static BITSET_CANONICAL: [u64; 123] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111101111111111111111111111111111111111111111111111111111111111, - 0b1100000000000000000000000000000000000000000000000000000000010001, - 0b0111000000000000000000000000000000000000000000000000000000000000, - 0b1111100001111111111111111111111111111111111111111111111111111111, - 0b1111111111111100000000000000000000000000000000000000000000000000, - 0b1111111100000000000000000000000000000000000000000000000000000000, - 0b0111111111000000000000000000000000000000000000000000000000000011, - 0b1111100000000000000000000000000000000000000000000000000000000000, - 0b0000000001111111000000000000000000000000000000000000000000000000, - 0b0000000000000001111111111100000000000000000000000000000000000000, - 0b0000000000000000111111111111111111111111111111111111111111111111, - 0b1011111111111111111111111111111111111111111111100000000000000000, - 0b1011000000111100000000000000000000000000000000000000000000000000, - 
0b1010000000000000000000000000000000000000000000000000000000000000, - 0b1001000000000000000000000000000000000000000000000000000000000010, - 0b1000000000000000100000000000000000000000000000000000000000000000, - 0b0011111100000000000000000000000000000000000000000000000000000000, - 0b0000000001101101111111001111111111111111111111000000000000000000, - 0b0000000000000000000000100000000000000000000000000000000001100000, - 0b1111111111111111111111111111111111111111111110000000000000000000, - 0b1111111111111111111111111111111111111000000000000000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000010, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111100000000000000000000000000000000000000000000, - 0b1111111111111111000000001000000000000000000000000000000000000000, - 0b1111111111111111000000000000000000000000000000101000000000000000, - 0b1111111111111000000000111000000000000000000000000000000000000000, - 0b1111111100000000000000000000000000000000000000000000000000000010, - 0b1111110000000000000000000000110000000000000000000010000110111110, - 0b1100000110011101000000000000000000000000000000000000000000000000, - 0b1011111111110111100000000000000000000000000000000000000000000000, - 0b1011111101111111000000000000000000000000000000000000000000000000, - 0b1011010001111110000000000000000000000000000000000000000000000000, - 0b1010011111111000000000000000000000000000000000000000000000000000, - 0b1001111111111000000111111110010101111111010000000000000000000000, - 0b1000011100000000000000000000000000000000000000001111000001101110, - 0b1000010111111000000000000000000000000000000000000000000000000000, - 0b1000000000000011111111111111111100000000000000000000000000110000, - 0b1000000000000010111111111101111100000000000000000000000000000000, - 0b0111100111111000000000000000000000000000000000000000011111111110, - 0b0110011011111101111000000000000000000000000000000000000000000000, - 
0b0110000000000000111000000000000011100000000000001110000000000011, - 0b0101100000000000000000000000000000000000000000000000000000000000, - 0b0100000011010011100000000000000000000000000000000000000000000000, - 0b0100000000000000000000000000110000000000000000000010000000011110, - 0b0100000000000000000000000000000000000100000000000100000010000000, - 0b0011111110110000000000000000000000000000000000000000000000000000, - 0b0011001111001000000000000000000000000000000000000000000000000111, - 0b0011000000000000000000000000000000000000000000000000000000000000, - 0b0010011001111000000000000000000000000000000000000000000000000011, - 0b0010010000111111111110000000000000000000000000000000000000000000, - 0b0001111111111111111111111111111111111110111111111110000011011111, - 0b0001111111110010000000000000000000000000000000000000000000000000, - 0b0001100000000000000000000000000000000000000000000000000000000011, - 0b0001011111010000000000000000000000000000000000000000000000001111, - 0b0001010000000000000000000000000000000000000000000000000000000111, - 0b0001000000000001000000000000000000000000000000000001000000001000, - 0b0001000000000000000000000000000000000000000000000000000000000110, - 0b0001000000000000000000000000000000000000000000000000000000000010, - 0b0000111000000100000000011000011100000000000000000000000000000000, - 0b0000111000000000000000000000100000000000000000000000000000000000, - 0b0000100000111110001111000000000000000000000000000000000000100000, - 0b0000011111110010000000000000000000000000000000000000000000000000, - 0b0000010000110000111111111111111111111111111111111111111111111111, - 0b0000010000000000010000001000000000000000000000000000000000000000, - 0b0000001100010000001000011111110111111111111101110000000000000000, - 0b0000001010100000000000000000000000000011000000000000000000000000, - 0b0000000110010000101000010000000000000000000000000000000000000000, - 0b0000000100000000000001111111111111111111111111111111111111111111, - 
0b0000000011001111111100000000000000000000000000000000000000000000, - 0b0000000010111111001010000000000000000000000000000000000000000000, - 0b0000000001100110011111100000000000000000000000000000000000000000, - 0b0000000001011000001100000000000000100000000000000000000000000010, - 0b0000000000100011000000000000000000000000000000100011100110000110, - 0b0000000000100000000111111111111111111111111111111111111111111111, - 0b0000000000011111111011111000000000000000000000000000000000000111, - 0b0000000000011111000111111100000000000000000000000000000000000001, - 0b0000000000011110000000000000000111000011000000000000000000000000, - 0b0000000000011100000000000000000000000000000111000000000000000000, - 0b0000000000010000000000000000000000000000000000000000000010110110, - 0b0000000000001111111110000000000000000000000000000000000000000100, - 0b0000000000001100000000000000000000000000000011000000000000000000, - 0b0000000000000011101000110100000000000000000000000000000000000000, - 0b0000000000000010000000000000110000000000111111100010000111111110, - 0b0000000000000001111111111111111111111111111111110000000000000000, - 0b0000000000000001000000000000000011111111111111111111100000000001, - 0b0000000000000000111111111111111100000000000010001111111111111111, - 0b0000000000000000111111111111111011111000000000000000000000010000, - 0b0000000000000000011111001001000000000011000000001111100000000000, - 0b0000000000000000001111111111111111111111110000000000000000000000, - 0b0000000000000000001111011111111110111111110000000000000000000000, - 0b0000000000000000001111000000000000000000000000000000111111100111, - 0b0000000000000000001110110011110000000000000000000000000000000011, - 0b0000000000000000001000010010000000000000000000000000000000000000, - 0b0000000000000000000100000110000000000000000000000000100001000100, - 0b0000000000000000000011111011110011100000000000000000000000000000, - 0b0000000000000000000011100000000011111000000000000000000000000000, - 
0b0000000000000000000001111111100010000000000000000000000000000000, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000000000110000000000000000000001000000000000000, - 0b0000000000000000000000000001101100000000000000000000000000000000, - 0b0000000000000000000000000000111101100000000000000000000000000000, - 0b0000000000000000000000000000110000000000011000000011110111000001, - 0b0000000000000000000000000000110000000000011000000010000000011110, - 0b0000000000000000000000000000110000000000000000000011000001000000, - 0b0000000000000000000000000000110000000000000000000010000000011110, - 0b0000000000000000000000000000000100001100111100000000000000000000, - 0b0000000000000000000000000000000011111111111111111000000000000000, - 0b0000000000000000000000000000000010000010000000000000000000000000, - 0b0000000000000000000000000000000001000000000000000000000001011100, - 0b0000000000000000000000000000000000100000100011111111111001000000, - 0b0000000000000000000000000000000000100000000000000010000001100100, - 0b0000000000000000000000000000000000011111111111110000000000000000, - 0b0000000000000000000000000000000000010111111111110000000000111111, - 0b0000000000000000000000000000000000001110011111100000000010000000, - 0b0000000000000000000000000000000000001001100000000000000000000000, - 0b0000000000000000000000000000000000000011011111111111110000000000, - 0b0000000000000000000000000000000000000000101000110000000000000000, - 0b0000000000000000000000000000000000000000010111000000010000000000, - 0b0000000000000000000000000000000000000000000000001001111000000000, - 0b0000000000000000000000000000000000000000000000000011111101000000, - 0b0000000000000000000000000000000000000000000000000010000000000001, - ]; - static BITSET_MAPPING: [(u8, u8); 60] = [ - (0, 64), (1, 70), (1, 71), (1, 190), (1, 72), (1, 73), (1, 188), (1, 76), (1, 82), (1, 83), - (1, 85), (1, 91), (1, 100), (1, 102), (1, 117), (1, 118), (1, 121), (1, 66), (1, 67), - (1, 69), (2, 160), 
(2, 153), (2, 147), (2, 142), (2, 139), (2, 134), (3, 6), (3, 12), - (3, 29), (3, 33), (3, 51), (4, 84), (4, 101), (4, 109), (4, 117), (5, 181), (5, 62), - (5, 63), (6, 12), (6, 46), (6, 7), (7, 176), (7, 134), (7, 57), (8, 53), (8, 59), (9, 19), - (9, 32), (10, 32), (10, 33), (11, 142), (11, 64), (12, 184), (13, 184), (14, 33), (15, 170), - (16, 1), (17, 33), (18, 179), (19, 23), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod cased { - const BITSET_LAST_CHUNK_MAP: u16 = 124; - static BITSET_CHUNKS_MAP: [u8; 124] = [ - 13, 15, 0, 0, 8, 0, 0, 11, 14, 10, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 3, 2, 0, 16, 0, 12, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, - 0, 0, 0, 7, 6, + static SHORT_OFFSET_RUNS: [u32; 19] = [ + 4256, 115348384, 136322176, 144711446, 163587254, 320875520, 325101120, 358656816, + 392231680, 404815649, 413205504, 421596288, 434182304, 442592832, 446813184, 451008166, + 528607488, 576844080, 582152586, ]; - static BITSET_INDEX_CHUNKS: [[u8; 16]; 19] = [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 15, 39, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 59, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 47, 44, 0, 13, 39, 8, 26], - [0, 0, 0, 0, 16, 60, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 39, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 12, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 61], - [0, 0, 12, 45, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 17, 
46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 36, 0, 39, 39, 39, 0, 39, 39, 39, 39, 4, 22, 21, 23], - [0, 0, 48, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 3, 27, 2, 39, 39, 49, 6, 39, 39, 28, 31, 0, 35, 14, 50], - [0, 34, 32, 0, 19, 11, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [39, 39, 7, 39, 51, 10, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [39, 43, 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [39, 54, 38, 1, 20, 9, 39, 39, 39, 39, 5, 18, 56, 57, 58, 55], - [52, 53, 39, 29, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + static OFFSETS: [u8; 283] = [ + 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, + 96, 1, 42, 4, 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, + 41, 0, 38, 1, 1, 5, 1, 2, 43, 2, 3, 0, 86, 2, 6, 0, 9, 7, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, + 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, + 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, + 1, 6, 4, 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 47, 1, 47, 1, 133, 6, 4, 3, 2, 12, 38, + 1, 1, 5, 1, 0, 46, 18, 30, 132, 102, 3, 4, 1, 48, 2, 9, 42, 2, 1, 3, 0, 43, 1, 13, 7, 80, 0, + 7, 12, 5, 0, 26, 6, 26, 0, 80, 96, 36, 4, 36, 0, 51, 13, 51, 0, 64, 0, 64, 0, 85, 1, 71, 1, + 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, + 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 68, + 0, 26, 6, 26, 6, 26, 0, ]; - static BITSET_CANONICAL: [u64; 39] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111111111111101111, - 0b1111111101111111111111111111111111111111011111111111111111111111, - 0b0000011111111111111111111111111000000111111111111111111111111110, - 0b1111111111111111111111111111111100111111001111111111111111111111, - 0b1111111111111111111111110011111111111111111111111111111111111111, - 
0b1111111111111111111111111111111111111111111111111111111111110000, - 0b1111111111111111111111111111111111111111111111111111110000000011, - 0b1111111111111111111111111111111111111111111111110111100011111111, - 0b1111111111111111111111111111111111111111111111011111110001011111, - 0b1111111111111111111111111111111100000000011111111111111111111111, - 0b1111111111111111111111111111111100000000000000000100001111100000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111110000000000000000000000000000000000, - 0b1111111111111111111111111111101111111111111111111101011101000000, - 0b1111111111111111000000011111111111110111111111111111111111111111, - 0b1111111111111111000000000000000000000000000000000000000000000000, - 0b1111111111000000000000000000000000000000000000000000000000000000, - 0b1111011111111111111111111111111111110111111111111111111111111101, - 0b1111001000011111101111010101000000111110001011111111110010000100, - 0b0111101111111111111111111111111111011111110111111110011110111111, - 0b0101111111011111111111111111111111111111111111111111111111111111, - 0b0011111111111111111111111111111110101010111111110011111100111111, - 0b0001111111011100000111111111111100001111110011110001111111011100, - 0b0000111111111111111111111111111111111111000011111111111111111111, - 0b0000011111111111111111111111111000000000000000000000000000000000, - 0b0000011101100000000000000000000000000000000000000000011111111100, - 0b0000010000100000000001000000000000000000000000000000000000000000, - 0b0000000111111111111111111111111111111111111011111111111111111111, - 0b0000000000001100011110000001111111111111111111111111111111111111, - 0b0000000000000000001000001011111111111111111111111111111111111111, - 0b0000000000000000000000000001111100000000000000000000000000000011, - 0b0000000000000000000000000000000000011111111111110000000000000000, - 0b0000000000000000000000000000000000000000111110000000000001111111, - 
0b1000000000000010000000000000000000000000000000000000000000000000, - 0b1011110011001111000000000000000000000000000000000000000000100000, - 0b1110011111111111111111111111111111111111111111110000000111111111, - 0b1110011111111111111111111111111111111111111111110010000010111111, - 0b1110101111111111110111100110010011011111111111111111111111111111, - ]; - static BITSET_MAPPING: [(u8, u8); 24] = [ - (0, 64), (1, 188), (1, 183), (1, 182), (1, 176), (1, 162), (1, 160), (1, 150), (1, 146), - (1, 141), (1, 55), (1, 50), (1, 44), (1, 43), (1, 27), (1, 17), (2, 180), (2, 30), (2, 24), - (2, 18), (3, 160), (3, 15), (4, 32), (5, 93), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod cc { - const BITSET_LAST_CHUNK_MAP: u16 = 2; - static BITSET_CHUNKS_MAP: [u8; 3] = [ - 0, 1, 0, + static SHORT_OFFSET_RUNS: [u32; 1] = [ + 1114272, ]; - static BITSET_INDEX_CHUNKS: [[u8; 1]; 3] = [ - [0], [1], [2], + static OFFSETS: [u8; 5] = [ + 0, 32, 95, 33, 0, ]; - static BITSET_CANONICAL: [u64; 2] = [ - 0b0000000000000000000000000000000011111111111111111111111111111111, - 0b1000000000000000000000000000000000000000000000000000000000000000, - ]; - static BITSET_MAPPING: [(u8, u8); 1] = [ - (0, 160), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } #[rustfmt::skip] pub mod grapheme_extend { - const BITSET_LAST_CHUNK_MAP: u16 = 1792; - static BITSET_CHUNKS_MAP: [u8; 246] = [ - 0, 6, 17, 44, 37, 31, 20, 32, 29, 4, 0, 5, 43, 40, 33, 0, 41, 0, 0, 0, 0, 0, 9, 0, 36, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 38, 34, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 7, 0, 35, 1, 10, 0, 0, 0, 27, 8, 16, 24, 30, 42, 26, 22, 28, 25, 11, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 14, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 19, 0, 0, - 0, 21, 0, 0, 39, 12, 0, 0, 13, 23, + static SHORT_OFFSET_RUNS: [u32; 31] = [ + 768, 2098307, 6292881, 10490717, 513808146, 518004748, 723528943, 731918378, 744531567, + 752920578, 769719070, 899743232, 903937950, 912327165, 916523521, 929107236, 954273451, + 958470191, 1180769328, 1252073203, 1315007216, 1319202639, 1327611037, 1340199269, + 1344395776, 1373757440, 1398923568, 1419895532, 1424091344, 1429078048, 1438581232, ]; - static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 135], [0, 0, 0, 0, 0, 18, 75, 0], - [0, 0, 0, 0, 0, 106, 129, 109], [0, 0, 0, 0, 0, 124, 0, 0], [0, 0, 0, 0, 3, 65, 40, 90], - [0, 0, 0, 0, 99, 139, 0, 0], [0, 0, 0, 0, 107, 0, 0, 0], [0, 0, 0, 0, 130, 0, 0, 0], - [0, 0, 0, 67, 0, 136, 0, 14], [0, 0, 0, 108, 0, 12, 0, 0], [0, 0, 0, 120, 0, 0, 0, 0], - [0, 0, 0, 131, 0, 0, 0, 0], [0, 0, 0, 134, 0, 133, 0, 0], [0, 0, 0, 142, 6, 0, 0, 0], - [0, 0, 115, 0, 0, 0, 0, 0], [0, 0, 118, 0, 0, 126, 0, 0], [0, 0, 132, 0, 0, 0, 11, 98], - [0, 39, 116, 119, 0, 0, 0, 0], [0, 121, 0, 0, 0, 0, 0, 0], [2, 85, 27, 86, 34, 84, 101, 88], - [5, 60, 71, 0, 0, 0, 0, 0], [8, 0, 0, 0, 33, 102, 0, 87], [14, 99, 0, 0, 99, 99, 99, 139], - [17, 1, 51, 0, 61, 112, 138, 97], [21, 0, 56, 0, 22, 141, 95, 0], - [24, 100, 55, 0, 79, 0, 0, 0], [28, 0, 0, 117, 0, 0, 0, 0], [31, 92, 94, 0, 0, 0, 0, 0], - [32, 63, 91, 0, 0, 0, 0, 0], [36, 0, 0, 80, 34, 62, 0, 0], [48, 59, 48, 10, 19, 83, 38, 96], - [50, 
105, 44, 140, 53, 30, 43, 103], [54, 0, 0, 52, 0, 0, 0, 7], - [57, 47, 20, 58, 0, 0, 0, 77], [70, 0, 0, 0, 0, 0, 116, 0], [72, 0, 114, 0, 0, 0, 0, 0], - [73, 123, 0, 15, 46, 82, 35, 37], [78, 0, 0, 29, 111, 127, 41, 110], - [81, 0, 0, 0, 6, 0, 0, 0], [93, 26, 9, 100, 45, 64, 76, 66], [104, 0, 0, 68, 0, 0, 0, 0], - [113, 89, 25, 137, 0, 0, 23, 143], [122, 0, 13, 0, 49, 0, 0, 0], - [128, 69, 0, 74, 16, 125, 4, 42], + static OFFSETS: [u8; 689] = [ + 0, 112, 0, 7, 0, 45, 1, 1, 1, 2, 1, 2, 1, 1, 72, 11, 48, 21, 16, 1, 101, 7, 2, 6, 2, 2, 1, + 4, 35, 1, 30, 27, 91, 11, 58, 9, 9, 1, 24, 4, 1, 9, 1, 3, 1, 5, 43, 3, 119, 15, 1, 32, 55, + 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 29, 1, 58, 1, 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 26, 1, 2, 2, + 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, 1, + 1, 58, 1, 1, 2, 1, 4, 8, 1, 7, 3, 10, 2, 30, 1, 59, 1, 1, 1, 12, 1, 9, 1, 40, 1, 3, 1, 57, + 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 2, 1, 3, 1, 5, 2, 7, 2, 11, 2, 28, 2, 57, 2, + 1, 1, 2, 4, 8, 1, 9, 1, 10, 2, 29, 1, 72, 1, 4, 1, 2, 3, 1, 1, 8, 1, 81, 1, 2, 7, 12, 8, 98, + 1, 2, 9, 11, 6, 74, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, 102, 4, 1, + 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 0, 3, 0, 3, 29, 3, 29, 2, 30, + 2, 64, 2, 1, 7, 8, 1, 2, 11, 9, 1, 45, 3, 119, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, 219, 2, + 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 48, 17, 63, 4, 48, 7, 1, 1, 5, 1, 40, 9, + 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, 152, 3, 1, 13, 1, 7, 4, 1, + 6, 1, 3, 2, 198, 58, 1, 5, 0, 1, 195, 33, 0, 3, 141, 1, 96, 32, 0, 6, 105, 2, 0, 4, 1, 10, + 32, 2, 80, 2, 0, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, + 48, 1, 2, 4, 2, 2, 39, 1, 67, 6, 2, 2, 2, 2, 12, 1, 8, 1, 47, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, + 1, 42, 2, 8, 1, 238, 1, 2, 1, 4, 1, 0, 1, 0, 16, 16, 16, 0, 2, 0, 1, 226, 1, 149, 5, 0, 3, + 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 0, 2, 153, 
11, 176, 1, 54, 15, 56, 3, 49, 4, 2, 2, + 69, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 160, 1, 3, 8, + 21, 2, 57, 2, 1, 1, 1, 1, 22, 1, 14, 7, 3, 5, 195, 8, 2, 3, 1, 1, 23, 1, 81, 1, 2, 6, 1, 1, + 2, 1, 1, 2, 1, 2, 235, 1, 2, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, + 1, 101, 3, 2, 4, 1, 5, 0, 9, 1, 2, 245, 1, 10, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, + 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, + 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2, 3, 1, 1, 1, 0, 2, 0, 5, 59, 7, 0, 1, 63, 4, 81, 1, 0, + 2, 0, 1, 1, 3, 4, 5, 8, 8, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, + 7, 1, 17, 2, 7, 1, 2, 1, 5, 0, 7, 0, 4, 0, 7, 109, 7, 0, 96, 128, 240, 0, ]; - static BITSET_CANONICAL: [u64; 99] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1000000000000000000000000000000000000000000000000000000001111111, - 0b1100000000000000000000000000000000000000000000000000000000010001, - 0b0000000000011100000000000000000000000000000111000000000000000000, - 0b0000000000000001111111111100000000000000000000000000000000000000, - 0b1111100001111111111111111111111111111111111111111111111111111111, - 0b0000000001111111000000000000000000000000000000000000000000000000, - 0b1111101111111111111111111111111111111111111111111111111111111111, - 0b0000011011111111100000000000000000000000000000000000000000000000, - 0b1111111111111111000000000000000000000000000000000000000000000000, - 0b1111110000000000000000000000110000000000000000000010000110111110, - 0b1011111111111111111111111111111111111111111111100000000000000000, - 0b0000011111000000000000000000000000000000000000000000000000000000, - 0b0000000000000000000000100000000000000000000000000000000001100000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111111111101111111111111110000000000000000000, - 
0b1111111111111111000000000000000000000000000000100000000000000000, - 0b1111111100000000000000000000000000000000000000000000000000000010, - 0b1111100000000111110000111010000000000000000000000000000000000000, - 0b1101000000000000000000000000000000000000000000000000000000000010, - 0b1100000110011101000000000000000000000000000000000000000000000000, - 0b1011111101111111000000000000000000000000000000000000000000000000, - 0b1011010001111110000000000000000000000000000000000000000000000000, - 0b1011000000111100100000000000000000000000000000000000000000000000, - 0b1010011111111000000000000000000000000000000000000000000000000000, - 0b1010010111111001000000000000000000000000000000000000000000000000, - 0b1001111111111000000111111110010101111111010000000000000000000000, - 0b1001000000000000000000000000000000000000000000000000000000000010, - 0b1000011100000000000000000000000000000000000000001111000001101110, - 0b1000000000000011111111111111111100000000000000000000000000110000, - 0b0111111111111110000000000000000000000000000000000000000000000000, - 0b0111100111111000000000000000000000000000000000000000011111111110, - 0b0110011011111101111000000000000000000000000000000000000000000000, - 0b0101100000000001000000000000000000000000000000000000000000000000, - 0b0101100000000000000000000000000000000000000000000000000000000011, - 0b0101000000000000000000000000000000000000000000000000000000000010, - 0b0100000011010011100000000000000000000000000000000000000000000000, - 0b0100000000000000000000000000110000000000100000000010000000011110, - 0b0100000000000000000000000000000000000000000000000000000000000100, - 0b0011111111110111100000000000000000000000000000000000000000000000, - 0b0011111110110000000000000000000000000000000000000000000000000000, - 0b0011001111001000000000000000000000000000000000000000000000000111, - 0b0010000000001111111110000000000000000000000000000000000000000000, - 0b0001111111111111111111111111111111111110111111111110000011011111, - 
0b0001111111110010000000000000000000000000000000000000000000000000, - 0b0001011111110000000000000000000000000000000000000000000000001111, - 0b0001010000000000000000000000000000000000000000000000000000000111, - 0b0001000000000000000000000000000000000000000000000001000000001000, - 0b0001000000000000000000000000000000000000000000000000000000000110, - 0b0000111000000100000000011000011100000000000000000000000000000000, - 0b0000011111110010000000000000000000000000000000000000000000000000, - 0b0000011001111000000000000000000000000000000000000000000000000011, - 0b0000001100010000001000011111110111111111111101110000000000000000, - 0b0000001010100000000000000000000000000011000000000000000000000000, - 0b0000000011001111111100000000000000000000000000000000000000000000, - 0b0000000010111111001010000000000000000000000000000000000000000000, - 0b0000000001101101111111001111111111111111111111000000000000000000, - 0b0000000001100110011111100000000000000000000000000000000000000000, - 0b0000000001000000001100000000000000000000000000000000000000000010, - 0b0000000000100011000000000000000000000000000000100011100110000110, - 0b0000000000100000000111111111111111111111111111111111111111111111, - 0b0000000000011111111011111000000000000000000000000000000000000111, - 0b0000000000011111000111111100000000000000100000000000000000000001, - 0b0000000000011110000000000000000111000011000000000000000000000000, - 0b0000000000001111111110000000000000000000000000000000000000000100, - 0b0000000000001100000000000000000000000000000011000000000000000000, - 0b0000000000000011101000110100000000000000000000000000000000000000, - 0b0000000000000011100000000000000000000000000000000000000000000000, - 0b0000000000000001111111111111111111111111111111110000000000000000, - 0b0000000000000001000000000000000011111111111111111111100000000000, - 0b0000000000000000111111111111111100000000000000001111111111111111, - 0b0000000000000000111111111111111011111000000000000000000000010000, - 
0b0000000000000000111111000000000000000000000000000000000000000000, - 0b0000000000000000001111101110111111111011110000000000000000000000, - 0b0000000000000000001111011001111110011111110000000000000000000000, - 0b0000000000000000001111000000000000000000000000000000111111100111, - 0b0000000000000000001110110011110000000000000000000000000000000011, - 0b0000000000000000001000010010000000000000000000000000000000000000, - 0b0000000000000000000100000110000000000000000000000000100001000100, - 0b0000000000000000000011111011110011100000000000000000000000000000, - 0b0000000000000000000001111111100010000000000000000000000000000000, - 0b0000000000000000000001111101101111111001111111111111111101111111, - 0b0000000000000000000000000000110000000000111111100010000111111110, - 0b0000000000000000000000000000110000000000111000000010000000011110, - 0b0000000000000000000000000000110000000000100000000010000000011110, - 0b0000000000000000000000000000110000000000011000000011110111000001, - 0b0000000000000000000000000000110000000000011000000011000001000100, - 0b0000000000000000000000000000000100001100111100000000000000000000, - 0b0000000000000000000000000000000010000000010111001000010000000000, - 0b0000000000000000000000000000000001000000000000000000000001011100, - 0b0000000000000000000000000000000000100000000011111111111001000000, - 0b0000000000000000000000000000000000100000000000000010000001100100, - 0b0000000000000000000000000000000000001110011111100000000010000000, - 0b0000000000000000000000000000000000001001100000000000000000000000, - 0b0000000000000000000000000000000000000011011111111111110000000000, - 0b0000000000000000000000000000000000000000101000110000000000000000, - 0b0000000000000000000000000000000000000000100000000010000000000001, - 0b0000000000000000000000000000000000000000000000001001111000000000, - 0b0000000000000000000000000000000000000000000000000000000010110110, - ]; - static BITSET_MAPPING: [(u8, u8); 45] = [ - (0, 64), (1, 191), (1, 190), (1, 188), (1, 185), (1, 179), 
(1, 8), (1, 176), (1, 161), - (1, 159), (1, 155), (1, 154), (1, 39), (1, 140), (1, 57), (2, 165), (2, 161), (2, 160), - (2, 153), (2, 147), (2, 142), (2, 139), (3, 176), (3, 167), (3, 153), (3, 149), (4, 26), - (4, 32), (4, 33), (4, 42), (5, 88), (5, 109), (5, 117), (6, 19), (6, 20), (6, 32), (7, 67), - (7, 69), (8, 183), (8, 7), (9, 64), (10, 178), (11, 184), (12, 58), (13, 23), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } @@ -858,7 +405,7 @@ pub mod lowercase { ]; pub fn lookup(c: char) -> bool { - super::range_search( + super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, @@ -871,97 +418,31 @@ pub fn lookup(c: char) -> bool { #[rustfmt::skip] pub mod n { - const BITSET_LAST_CHUNK_MAP: u16 = 253; - static BITSET_CHUNKS_MAP: [u8; 250] = [ - 45, 0, 0, 37, 7, 38, 26, 35, 31, 5, 0, 12, 42, 21, 39, 0, 28, 0, 22, 4, 0, 0, 13, 0, 40, - 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 46, - 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 8, 14, 20, 0, 24, 27, 11, 25, 29, 15, 34, 33, 17, 0, - 30, 2, 0, 0, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 16, 0, 1, 0, 0, 0, 0, 6, 15, 0, 0, 18, 0, 23, 0, 9, 3, + static SHORT_OFFSET_RUNS: [u32; 38] = [ + 1632, 18876774, 31461440, 102765417, 111154926, 115349830, 132128880, 165684320, 186656630, + 195046653, 199241735, 203436434, 216049184, 241215536, 249605104, 274792208, 
278987015, + 283181793, 295766104, 320933114, 383848032, 392238160, 434181712, 442570976, 455154768, + 463544256, 476128256, 480340576, 484535936, 497144544, 501340110, 509731136, 513925872, + 518121671, 522316913, 530706688, 551681008, 556989434, ]; - static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [ - [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 13], [0, 0, 0, 0, 0, 0, 0, 49], - [0, 0, 0, 0, 0, 0, 0, 59], [0, 0, 0, 0, 0, 2, 48, 0], [0, 0, 0, 0, 0, 8, 0, 0], - [0, 0, 0, 0, 0, 45, 0, 0], [0, 0, 0, 0, 0, 58, 0, 30], [0, 0, 0, 0, 10, 1, 60, 0], - [0, 0, 0, 0, 47, 0, 0, 0], [0, 0, 0, 0, 55, 0, 0, 0], [0, 0, 0, 4, 59, 0, 0, 0], - [0, 0, 0, 11, 0, 0, 0, 9], [0, 0, 0, 24, 0, 0, 0, 0], [0, 0, 0, 25, 6, 41, 0, 64], - [0, 0, 0, 59, 0, 0, 0, 0], [0, 0, 0, 69, 0, 68, 0, 0], [0, 0, 0, 71, 0, 55, 0, 0], - [0, 0, 0, 73, 0, 55, 0, 0], [0, 0, 50, 0, 0, 0, 0, 0], [0, 0, 57, 0, 0, 0, 0, 0], - [0, 0, 70, 0, 0, 55, 59, 0], [0, 15, 51, 16, 0, 0, 0, 0], [0, 17, 34, 0, 23, 0, 0, 0], - [0, 18, 12, 5, 62, 0, 61, 3], [0, 19, 0, 0, 56, 66, 0, 46], [0, 20, 0, 58, 0, 31, 0, 58], - [0, 22, 72, 65, 0, 43, 53, 0], [0, 29, 45, 0, 0, 14, 42, 0], [0, 36, 0, 59, 2, 0, 0, 33], - [0, 37, 0, 0, 0, 55, 57, 0], [0, 45, 55, 0, 0, 0, 0, 0], [0, 55, 0, 0, 0, 0, 0, 59], - [0, 55, 0, 45, 26, 0, 0, 0], [0, 55, 0, 55, 0, 0, 0, 0], [0, 55, 0, 55, 69, 0, 0, 0], - [0, 57, 0, 0, 0, 38, 0, 0], [0, 57, 0, 59, 0, 0, 0, 45], [0, 58, 0, 58, 0, 32, 0, 35], - [0, 70, 0, 0, 0, 0, 0, 0], [27, 0, 0, 0, 0, 0, 67, 0], [44, 52, 0, 0, 0, 0, 0, 0], - [55, 0, 0, 0, 0, 54, 0, 40], [57, 0, 0, 58, 0, 0, 0, 0], [57, 39, 7, 0, 0, 0, 0, 0], - [59, 0, 21, 0, 0, 0, 0, 0], [63, 0, 0, 55, 45, 0, 0, 28], + static OFFSETS: [u8; 267] = [ + 48, 10, 120, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, + 10, 118, 10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, + 70, 20, 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, + 182, 10, 86, 10, 134, 10, 6, 
10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, + 1, 0, 1, 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, + 154, 10, 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, + 29, 1, 8, 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 160, 2, 2, 16, 2, 46, 64, 9, + 52, 2, 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 0, 31, 158, 10, 42, 4, 112, 7, 134, + 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 102, 12, 0, + 19, 93, 10, 0, 29, 227, 10, 70, 10, 0, 21, 0, 111, 0, 10, 230, 10, 1, 7, 0, 23, 0, 20, 108, + 25, 0, 50, 0, 10, 0, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0, ]; - static BITSET_CANONICAL: [u64; 44] = [ - 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b0000000111111111111111111111111111111111111111111111111111111111, - 0b1111111111000000000000000000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111001111111111111111, - 0b1111110000000000000000000000000000000000000000000000000000000000, - 0b1111100000000000000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000111100000000000000000000000000000000, - 0b1111111111111110000000000000000000000000000000000000001111111111, - 0b0001111111111111111111100000000000000000000000000000000000000000, - 0b0000001111111111000000111111111100000000000000000000000000000000, - 0b0000000000001111111111111111111111111111111111111111111110000000, - 0b0000000000000001110000000000000000000000000000000000000000000000, - 0b0000000000000000111111111000000000000000000000000000000000000000, - 0b1111111111111111111111111111111111111111111111111100000000000000, - 0b1111111111111111111111111111111111111111111111110000000000000000, - 0b1111111111111111111111111111111100000000000000000000000000000000, - 0b1111111111111111111111000000000000000000000000000000000000000000, - 
0b1111111111111110000000000000000000000000000000000000000000000000, - 0b1111111000000000000000000000000011111111000000000000000000000000, - 0b0111111111111111111111111111111100000000000000000000000000000000, - 0b0111111100000000111111111100000000000000000000000000000000000000, - 0b0111001000001100000000000000000000000000000000000000000000000000, - 0b0110000000000000000000000000000000000000000000000000000111111111, - 0b0011111111111111101111111111111111111111111111111111111111111110, - 0b0010000000000000000000000000000000000000000000000000000000000000, - 0b0000111111111111111111111111111000000000000000000000000000000000, - 0b0000111111111111000000000000000000000000000000000000000000000000, - 0b0000011100000000000000111111111000000000000000000000000010000000, - 0b0000001111111111000000000000000000000011111111110000000000000000, - 0b0000001111110001000000000000000000000000000000000000000000000000, - 0b0000001111110000111111111100000000000000000000000000000000000000, - 0b0000000111111111111111111100000001111111000000000000000000000000, - 0b0000000011111100111111111100000000000000000000000000000000000000, - 0b0000000000011111111111111111111000000011111111110000000000000000, - 0b0000000000011110111011111111111111111111111111111111111111111111, - 0b0000000000000111111111111100000000000000000000000000000000000000, - 0b0000000000000000111111111111111111111111111111000000000000000000, - 0b0000000000000000000111111111111111111111111111110000000000000000, - 0b0000000000000000000000000000001111111011111111110000000000000000, - 0b0000000000000000000000000000000011111111111111101111111100000000, - 0b0000000000000000000000000000000000000111111111110000000000000000, - 0b0000000000000000000000000000000000000000000000000000010000000010, - 0b0000000000000000000000000000000000000000000000000000001111100111, - 0b1111111100000000000000000000000011111111000000000000000000000000, - ]; - static BITSET_MAPPING: [(u8, u8); 30] = [ - (0, 64), (1, 175), (1, 76), (1, 172), (1, 165), (1, 164), 
(1, 162), (1, 157), (1, 138), - (1, 112), (2, 16), (2, 26), (2, 39), (2, 42), (2, 48), (2, 58), (3, 122), (3, 108), (4, 28), - (4, 54), (5, 22), (5, 48), (6, 49), (6, 50), (7, 47), (8, 55), (9, 32), (10, 108), (11, 47), - (12, 32), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } @@ -1045,7 +526,7 @@ pub mod uppercase { ]; pub fn lookup(c: char) -> bool { - super::range_search( + super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, @@ -1058,31 +539,17 @@ pub fn lookup(c: char) -> bool { #[rustfmt::skip] pub mod white_space { - const BITSET_LAST_CHUNK_MAP: u16 = 32; - static BITSET_CHUNKS_MAP: [u8; 23] = [ - 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, + static SHORT_OFFSET_RUNS: [u32; 4] = [ + 5760, 18882560, 23080960, 40972289, ]; - static BITSET_INDEX_CHUNKS: [[u8; 6]; 4] = [ - [1, 4, 2, 4, 4, 4], [4, 4, 0, 3, 4, 4], [4, 4, 4, 4, 4, 4], [5, 4, 4, 4, 4, 4], + static OFFSETS: [u8; 21] = [ + 9, 5, 18, 1, 100, 1, 26, 1, 0, 1, 0, 11, 29, 2, 5, 1, 47, 1, 0, 1, 0, ]; - static BITSET_CANONICAL: [u64; 4] = [ - 0b0000000000000000100000110000000000000000000000000000011111111111, - 0b0000000000000000000000000000000100000000000000000011111000000000, - 0b0000000000000000000000000000000100000000000000000000000000100000, - 0b0000000000000000000000000000000010000000000000000000000000000000, - ]; - static BITSET_MAPPING: [(u8, u8); 2] = [ - (0, 176), (0, 175), - ]; - pub fn lookup(c: char) -> bool { - super::range_search( + super::skip_search( c as u32, - &BITSET_CHUNKS_MAP, - BITSET_LAST_CHUNK_MAP, - &BITSET_INDEX_CHUNKS, - &BITSET_CANONICAL, - &BITSET_MAPPING, + &SHORT_OFFSET_RUNS, + &OFFSETS, ) } } diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 04c72116e5f..053ed825018 
100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -4,6 +4,7 @@ mod case_mapping; mod raw_emitter; +mod skiplist; mod unicode_download; use raw_emitter::{emit_codepoints, RawEmitter}; @@ -172,13 +173,14 @@ fn main() { modules.push((property.to_lowercase().to_string(), emitter.file)); println!( - "{:15}: {} bytes, {} codepoints in {} ranges ({} - {})", + "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}", property, emitter.bytes_used, datapoints, ranges.len(), ranges.first().unwrap().start, - ranges.last().unwrap().end + ranges.last().unwrap().end, + emitter.desc, ); total_bytes += emitter.bytes_used; } @@ -259,6 +261,7 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String let mut s = String::new(); s.push_str("#![allow(incomplete_features, unused)]\n"); s.push_str("#![feature(const_generics)]\n\n"); + s.push_str("\n#[allow(unused)]\nuse std::hint;\n"); s.push_str(&format!("#[path = \"{}\"]\n", data_path)); s.push_str("mod unicode_data;\n\n"); @@ -267,7 +270,8 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String for (property, ranges) in ranges { s.push_str(&format!(r#" println!("Testing {}");"#, property)); s.push('\n'); - s.push_str(&format!(" {}();\n", property.to_lowercase())); + s.push_str(&format!(" {}_true();\n", property.to_lowercase())); + s.push_str(&format!(" {}_false();\n", property.to_lowercase())); let mut is_true = Vec::new(); let mut is_false = Vec::new(); for ch_num in 0..(std::char::MAX as u32) { @@ -281,8 +285,10 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String } } - s.push_str(&format!(" fn {}() {{\n", property.to_lowercase())); + s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase())); generate_asserts(&mut s, property, &is_true, true); + s.push_str(" }\n\n"); + s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase())); generate_asserts(&mut s, property, &is_false, false); s.push_str(" 
}\n\n"); } @@ -295,19 +301,19 @@ fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool for range in ranges_from_set(points) { if range.end == range.start + 1 { s.push_str(&format!( - " assert!({}unicode_data::{}::lookup(std::char::from_u32({}).unwrap()), \"{}\");\n", + " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n", if truthy { "" } else { "!" }, property.to_lowercase(), - range.start, std::char::from_u32(range.start).unwrap(), - )); + range.start, + )); } else { s.push_str(&format!(" for chn in {:?}u32 {{\n", range)); s.push_str(&format!( " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", if truthy { "" } else { "!" }, property.to_lowercase(), - )); + )); s.push_str(" }\n"); } } @@ -323,17 +329,25 @@ fn merge_ranges(ranges: &mut Vec>) { loop { let mut new_ranges = Vec::new(); let mut idx_iter = 0..(ranges.len() - 1); + let mut should_insert_last = true; while let Some(idx) = idx_iter.next() { let cur = ranges[idx].clone(); let next = ranges[idx + 1].clone(); if cur.end == next.start { - let _ = idx_iter.next(); // skip next as we're merging it in + if idx_iter.next().is_none() { + // We're merging the last element + should_insert_last = false; + } new_ranges.push(cur.start..next.end); } else { + // We're *not* merging the last element + should_insert_last = true; new_ranges.push(cur); } } - new_ranges.push(ranges.last().unwrap().clone()); + if should_insert_last { + new_ranges.push(ranges.last().unwrap().clone()); + } if new_ranges.len() == ranges.len() { *ranges = new_ranges; break; @@ -341,4 +355,12 @@ fn merge_ranges(ranges: &mut Vec>) { *ranges = new_ranges; } } + + let mut last_end = None; + for range in ranges { + if let Some(last) = last_end { + assert!(range.start > last, "{:?}", range); + } + last_end = Some(range.end); + } } diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs index b57fd2c1d86..49e65521c98 
100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -1,5 +1,5 @@ #[inline(always)] -fn range_search< +fn bitset_search< const N: usize, const CHUNK_SIZE: usize, const N1: usize, @@ -47,3 +47,52 @@ fn range_search< }; (word & (1 << (needle % 64) as u64)) != 0 } + +fn decode_prefix_sum(short_offset_run_header: u32) -> u32 { + short_offset_run_header & ((1 << 21) - 1) +} + +fn decode_length(short_offset_run_header: u32) -> usize { + (short_offset_run_header >> 21) as usize +} + +#[inline(always)] +fn skip_search<const SOR: usize, const OFFSETS: usize>( + needle: u32, + short_offset_runs: &[u32; SOR], + offsets: &[u8; OFFSETS], +) -> bool { + // Note that this *cannot* be past the end of the array, as the last + // element is greater than std::char::MAX (the largest possible needle). + // + // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct + // location cannot be past it, so Err(idx) != length either. + // + // This means that we can avoid bounds checking for the accesses below, too. 
+ let last_idx = + match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) { + Ok(idx) => idx + 1, + Err(idx) => idx, + }; + + let mut offset_idx = decode_length(short_offset_runs[last_idx]); + let length = if let Some(next) = short_offset_runs.get(last_idx + 1) { + decode_length(*next) - offset_idx + } else { + offsets.len() - offset_idx + }; + let prev = + last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0); + + let total = needle - prev; + let mut prefix_sum = 0; + for _ in 0..(length - 1) { + let offset = offsets[offset_idx]; + prefix_sum += offset as u32; + if prefix_sum > total { + break; + } + offset_idx += 1; + } + offset_idx % 2 == 1 +} diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index e5b15224795..db9d04b3fa9 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -46,12 +46,13 @@ #[derive(Clone)] pub struct RawEmitter { pub file: String, + pub desc: String, pub bytes_used: usize, } impl RawEmitter { pub fn new() -> RawEmitter { - RawEmitter { file: String::new(), bytes_used: 0 } + RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() } } fn blank_line(&mut self) { @@ -61,8 +62,21 @@ fn blank_line(&mut self) { writeln!(&mut self.file, "").unwrap(); } - fn emit_bitset(&mut self, words: &[u64]) { - let mut words = words.to_vec(); + fn emit_bitset(&mut self, ranges: &[Range<u32>]) { + let last_code_point = ranges.last().unwrap().end; + // bitset for every bit in the codepoint range + // + // + 2 to ensure an all zero word to use for padding + let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2]; + for range in ranges { + for codepoint in range.clone() { + let bucket = codepoint as usize / 64; + let bit = codepoint as u64 % 64; + buckets[bucket] |= 1 << bit; + } + } + + let mut words = buckets; // Ensure that there's a 
zero word in the dataset, used for padding and // such. words.push(0); @@ -118,6 +132,19 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // We only need it for the words that we removed by applying a shift and // flip to them. self.bytes_used += 2 * canonicalized.canonicalized_words.len(); + + self.blank_line(); + + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " super::bitset_search(",).unwrap(); + writeln!(&mut self.file, " c as u32,").unwrap(); + writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); + writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap(); + writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); + writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); + writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); + writeln!(&mut self.file, " )").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); } fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) { @@ -184,40 +211,24 @@ fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: .unwrap(); self.bytes_used += chunk_length * chunks.len(); } - - pub fn emit_lookup(&mut self) { - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " super::range_search(",).unwrap(); - writeln!(&mut self.file, " c as u32,").unwrap(); - writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); - writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap(); - writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); - writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); - writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); - writeln!(&mut self.file, " )").unwrap(); - writeln!(&mut self.file, "}}").unwrap(); - } } pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) { emitter.blank_line(); - let last_code_point = ranges.last().unwrap().end; - // bitset for every bit in the codepoint range - // - // + 2 to 
ensure an all zero word to use for padding - let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2]; - for range in ranges { - for codepoint in range.clone() { - let bucket = codepoint as usize / 64; - let bit = codepoint as u64 % 64; - buckets[bucket] |= 1 << bit; - } - } + let mut bitset = emitter.clone(); + bitset.emit_bitset(&ranges); - emitter.emit_bitset(&buckets); - emitter.blank_line(); - emitter.emit_lookup(); + let mut skiplist = emitter.clone(); + skiplist.emit_skiplist(&ranges); + + if bitset.bytes_used <= skiplist.bytes_used { + *emitter = bitset; + emitter.desc = format!("bitset"); + } else { + *emitter = skiplist; + emitter.desc = format!("skiplist"); + } } struct Canonicalized { diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs new file mode 100644 index 00000000000..6e439968c3b --- /dev/null +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -0,0 +1,98 @@ +use crate::fmt_list; +use crate::raw_emitter::RawEmitter; +use std::convert::TryInto; +use std::fmt::Write as _; +use std::ops::Range; + +/// This will get packed into a single u32 before inserting into the data set. +#[derive(Debug, PartialEq)] +struct ShortOffsetRunHeader { + /// Note, we only allow for 21 bits here. + prefix_sum: u32, + + /// Note, we actually only allow for 11 bits here. This should be enough -- + /// our largest sets are around ~1400 offsets long. 
+ start_idx: u16, +} + +impl ShortOffsetRunHeader { + fn pack(&self) -> u32 { + assert!(self.start_idx < (1 << 11)); + assert!(self.prefix_sum < (1 << 21)); + + (self.start_idx as u32) << 21 | self.prefix_sum + } +} + +impl RawEmitter { + pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) { + let mut offsets = Vec::<u32>::new(); + let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>(); + let mut offset = 0; + for pt in points { + let delta = pt - offset; + offsets.push(delta); + offset = pt; + } + // Guaranteed to terminate, as it's impossible to subtract a value this + // large from a valid char. + offsets.push(std::char::MAX as u32 + 1); + let mut coded_offsets: Vec<u8> = Vec::new(); + let mut short_offset_runs: Vec<ShortOffsetRunHeader> = vec![]; + let mut iter = offsets.iter().cloned(); + let mut prefix_sum = 0; + loop { + let mut any_elements = false; + let mut inserted = false; + let start = coded_offsets.len(); + for offset in iter.by_ref() { + any_elements = true; + prefix_sum += offset; + if let Ok(offset) = offset.try_into() { + coded_offsets.push(offset); + } else { + short_offset_runs.push(ShortOffsetRunHeader { + start_idx: start.try_into().unwrap(), + prefix_sum, + }); + // This is just needed to maintain indices even/odd + // correctly. + coded_offsets.push(0); + inserted = true; + break; + } + } + if !any_elements { + break; + } + // We always append the huge char::MAX offset to the end which + // should never be able to fit into the u8 offsets. 
+ assert!(inserted); + } + + writeln!( + &mut self.file, + "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];", + short_offset_runs.len(), + fmt_list(short_offset_runs.iter().map(|v| v.pack())) + ) + .unwrap(); + self.bytes_used += 4 * short_offset_runs.len(); + writeln!( + &mut self.file, + "static OFFSETS: [u8; {}] = [{}];", + coded_offsets.len(), + fmt_list(&coded_offsets) + ) + .unwrap(); + self.bytes_used += coded_offsets.len(); + + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " super::skip_search(",).unwrap(); + writeln!(&mut self.file, " c as u32,").unwrap(); + writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap(); + writeln!(&mut self.file, " &OFFSETS,").unwrap(); + writeln!(&mut self.file, " )").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + } +}