Add a lower bound check to unicode-table-generator
output
This adds a dedicated check for the lower bound (if it is outside the ASCII range) to the output of the `unicode-table-generator` tool. This generalizes the ASCII-only fast-path, but only for the `Grapheme_Extend` property for now, as that is the only one with a lower bound outside of ASCII.
This commit is contained in:
parent
ce3263e60e
commit
488598c183
@ -927,7 +927,7 @@ impl char {
|
|||||||
#[must_use]
|
#[must_use]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn is_grapheme_extended(self) -> bool {
|
pub(crate) fn is_grapheme_extended(self) -> bool {
|
||||||
self > '\x7f' && unicode::Grapheme_Extend(self)
|
unicode::Grapheme_Extend(self)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if this `char` has one of the general categories for numbers.
|
/// Returns `true` if this `char` has one of the general categories for numbers.
|
||||||
|
@ -315,7 +315,11 @@ pub mod grapheme_extend {
|
|||||||
15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, 4, 0, 4, 0, 7, 109, 7, 0, 96,
|
15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, 4, 0, 4, 0, 7, 109, 7, 0, 96,
|
||||||
128, 240, 0,
|
128, 240, 0,
|
||||||
];
|
];
|
||||||
|
#[inline]
|
||||||
pub fn lookup(c: char) -> bool {
|
pub fn lookup(c: char) -> bool {
|
||||||
|
(c as u32) >= 0x300 && lookup_slow(c)
|
||||||
|
}
|
||||||
|
fn lookup_slow(c: char) -> bool {
|
||||||
super::skip_search(
|
super::skip_search(
|
||||||
c as u32,
|
c as u32,
|
||||||
&SHORT_OFFSET_RUNS,
|
&SHORT_OFFSET_RUNS,
|
||||||
|
@ -23,6 +23,7 @@ impl RawEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
|
fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
|
||||||
|
let first_code_point = ranges.first().unwrap().start;
|
||||||
let last_code_point = ranges.last().unwrap().end;
|
let last_code_point = ranges.last().unwrap().end;
|
||||||
// bitset for every bit in the codepoint range
|
// bitset for every bit in the codepoint range
|
||||||
//
|
//
|
||||||
@ -101,7 +102,10 @@ impl RawEmitter {
|
|||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
|
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
|
||||||
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
|
if first_code_point > 0x7f {
|
||||||
|
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
|
||||||
|
}
|
||||||
|
writeln!(&mut self.file, " super::bitset_search(").unwrap();
|
||||||
writeln!(&mut self.file, " c as u32,").unwrap();
|
writeln!(&mut self.file, " c as u32,").unwrap();
|
||||||
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
|
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
|
||||||
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
|
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
|
||||||
|
@ -25,8 +25,9 @@ impl ShortOffsetRunHeader {
|
|||||||
|
|
||||||
impl RawEmitter {
|
impl RawEmitter {
|
||||||
pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
|
pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
|
||||||
|
let first_code_point = ranges.first().unwrap().start;
|
||||||
let mut offsets = Vec::<u32>::new();
|
let mut offsets = Vec::<u32>::new();
|
||||||
let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
|
let points = ranges.iter().flat_map(|r| [r.start, r.end]).collect::<Vec<u32>>();
|
||||||
let mut offset = 0;
|
let mut offset = 0;
|
||||||
for pt in points {
|
for pt in points {
|
||||||
let delta = pt - offset;
|
let delta = pt - offset;
|
||||||
@ -86,7 +87,26 @@ impl RawEmitter {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
self.bytes_used += coded_offsets.len();
|
self.bytes_used += coded_offsets.len();
|
||||||
|
|
||||||
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
|
// The inlining in this code works like the following:
|
||||||
|
//
|
||||||
|
// The `skip_search` function is always inlined into the parent `lookup` fn,
|
||||||
|
// thus the compiler can generate optimal code based on the referenced `static`s.
|
||||||
|
//
|
||||||
|
// In the case of ASCII optimization, the lower-bounds check is inlined into
|
||||||
|
// the caller, and slower-path `skip_search` is outlined into a separate `lookup_slow` fn.
|
||||||
|
//
|
||||||
|
// Thus, in both cases, the `skip_search` function is specialized for the `static`s,
|
||||||
|
// and outlined into the prebuilt `std`.
|
||||||
|
if first_code_point > 0x7f {
|
||||||
|
writeln!(&mut self.file, "#[inline]").unwrap();
|
||||||
|
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
|
||||||
|
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
|
||||||
|
.unwrap();
|
||||||
|
writeln!(&mut self.file, "}}").unwrap();
|
||||||
|
writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
|
||||||
|
} else {
|
||||||
|
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
|
||||||
|
}
|
||||||
writeln!(&mut self.file, " super::skip_search(",).unwrap();
|
writeln!(&mut self.file, " super::skip_search(",).unwrap();
|
||||||
writeln!(&mut self.file, " c as u32,").unwrap();
|
writeln!(&mut self.file, " c as u32,").unwrap();
|
||||||
writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();
|
writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user