Let unicode-table-generator fail gracefully for bitsets

The "Alphabetic" property in Unicode 14 grew too big for the bitset
representation, panicking with "cannot pack 264 into 8 bits". However,
we were already choosing the skiplist for that property anyway, so this
doesn't need to be a hard failure. That panic is now a returned `Err`,
and `emit_codepoints` automatically falls back to the skiplist.
This commit is contained in:
Josh Stone 2021-10-06 17:35:49 -07:00
parent e159d42a9a
commit 6b0b417299

View File

@ -23,7 +23,7 @@ fn blank_line(&mut self) {
writeln!(&mut self.file).unwrap();
}
fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
let last_code_point = ranges.last().unwrap().end;
// bitset for every bit in the codepoint range
//
@ -44,7 +44,7 @@ fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
let unique_words =
words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
if unique_words.len() > u8::MAX as usize {
panic!("cannot pack {} into 8 bits", unique_words.len());
return Err(format!("cannot pack {} into 8 bits", unique_words.len()));
}
// needed for the chunk mapping to work
assert_eq!(unique_words[0], 0, "has a zero word");
@ -105,6 +105,8 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
writeln!(&mut self.file, " )").unwrap();
writeln!(&mut self.file, "}}").unwrap();
Ok(())
}
fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
@ -154,12 +156,12 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
emitter.blank_line();
let mut bitset = emitter.clone();
bitset.emit_bitset(&ranges);
let bitset_ok = bitset.emit_bitset(&ranges).is_ok();
let mut skiplist = emitter.clone();
skiplist.emit_skiplist(&ranges);
if bitset.bytes_used <= skiplist.bytes_used {
if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
*emitter = bitset;
emitter.desc = String::from("bitset");
} else {