Let unicode-table-generator fail gracefully for bitsets

The "Alphabetic" property in Unicode 14 grew too big for the bitset
representation, causing a panic: "cannot pack 264 into 8 bits". However, we
were already choosing the skiplist for that property anyway, so this doesn't
need to be a hard failure. That panic is now a returned `Err`, and
`emit_codepoints` automatically falls back to the skiplist when the bitset
cannot be emitted.
This commit is contained in:
Josh Stone 2021-10-06 17:35:49 -07:00
parent e159d42a9a
commit 6b0b417299

View File

@@ -23,7 +23,7 @@ fn blank_line(&mut self) {
writeln!(&mut self.file).unwrap(); writeln!(&mut self.file).unwrap();
} }
fn emit_bitset(&mut self, ranges: &[Range<u32>]) { fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
let last_code_point = ranges.last().unwrap().end; let last_code_point = ranges.last().unwrap().end;
// bitset for every bit in the codepoint range // bitset for every bit in the codepoint range
// //
@@ -44,7 +44,7 @@ fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
let unique_words = let unique_words =
words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>(); words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
if unique_words.len() > u8::MAX as usize { if unique_words.len() > u8::MAX as usize {
panic!("cannot pack {} into 8 bits", unique_words.len()); return Err(format!("cannot pack {} into 8 bits", unique_words.len()));
} }
// needed for the chunk mapping to work // needed for the chunk mapping to work
assert_eq!(unique_words[0], 0, "has a zero word"); assert_eq!(unique_words[0], 0, "has a zero word");
@@ -105,6 +105,8 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
writeln!(&mut self.file, " )").unwrap(); writeln!(&mut self.file, " )").unwrap();
writeln!(&mut self.file, "}}").unwrap(); writeln!(&mut self.file, "}}").unwrap();
Ok(())
} }
fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) { fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
@@ -154,12 +156,12 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
emitter.blank_line(); emitter.blank_line();
let mut bitset = emitter.clone(); let mut bitset = emitter.clone();
bitset.emit_bitset(&ranges); let bitset_ok = bitset.emit_bitset(&ranges).is_ok();
let mut skiplist = emitter.clone(); let mut skiplist = emitter.clone();
skiplist.emit_skiplist(&ranges); skiplist.emit_skiplist(&ranges);
if bitset.bytes_used <= skiplist.bytes_used { if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
*emitter = bitset; *emitter = bitset;
emitter.desc = String::from("bitset"); emitter.desc = String::from("bitset");
} else { } else {