Add unicode table generator
This commit is contained in:
parent
8a87b945b2
commit
064f8885d5
9
.gitignore
vendored
9
.gitignore
vendored
@ -34,14 +34,7 @@ __pycache__/
|
|||||||
# Created by default with `src/ci/docker/run.sh`:
|
# Created by default with `src/ci/docker/run.sh`:
|
||||||
/obj/
|
/obj/
|
||||||
/rustllvm/
|
/rustllvm/
|
||||||
/src/libcore/unicode/DerivedCoreProperties.txt
|
/unicode-downloads
|
||||||
/src/libcore/unicode/DerivedNormalizationProps.txt
|
|
||||||
/src/libcore/unicode/PropList.txt
|
|
||||||
/src/libcore/unicode/ReadMe.txt
|
|
||||||
/src/libcore/unicode/Scripts.txt
|
|
||||||
/src/libcore/unicode/SpecialCasing.txt
|
|
||||||
/src/libcore/unicode/UnicodeData.txt
|
|
||||||
/src/libcore/unicode/downloaded
|
|
||||||
/target/
|
/target/
|
||||||
# Generated by compiletest for incremental:
|
# Generated by compiletest for incremental:
|
||||||
/tmp/
|
/tmp/
|
||||||
|
17
Cargo.lock
17
Cargo.lock
@ -4930,6 +4930,16 @@ version = "1.10.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169"
|
checksum = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ucd-parse"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ca6b52bf4da6512f0f07785a04769222e50d29639e7ecd016b7806fd2de306b4"
|
||||||
|
dependencies = [
|
||||||
|
"lazy_static 1.3.0",
|
||||||
|
"regex",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ucd-trie"
|
name = "ucd-trie"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@ -4951,6 +4961,13 @@ dependencies = [
|
|||||||
"version_check 0.1.5",
|
"version_check 0.1.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-bdd"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"ucd-parse",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-bidi"
|
name = "unicode-bidi"
|
||||||
version = "0.3.4"
|
version = "0.3.4"
|
||||||
|
@ -23,6 +23,7 @@ members = [
|
|||||||
"src/tools/rustfmt",
|
"src/tools/rustfmt",
|
||||||
"src/tools/miri",
|
"src/tools/miri",
|
||||||
"src/tools/rustdoc-themes",
|
"src/tools/rustdoc-themes",
|
||||||
|
"src/tools/unicode-table-generator",
|
||||||
]
|
]
|
||||||
exclude = [
|
exclude = [
|
||||||
"build",
|
"build",
|
||||||
|
10
src/tools/unicode-table-generator/Cargo.toml
Normal file
10
src/tools/unicode-table-generator/Cargo.toml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
[package]
|
||||||
|
name = "unicode-bdd"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Mark Rousskov <mark.simulacrum@gmail.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
ucd-parse = "0.1.3"
|
62
src/tools/unicode-table-generator/src/case_mapping.rs
Normal file
62
src/tools/unicode-table-generator/src/case_mapping.rs
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
use crate::{fmt_list, UnicodeData};
|
||||||
|
use std::fmt;
|
||||||
|
|
||||||
|
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
|
||||||
|
let mut file = String::new();
|
||||||
|
|
||||||
|
file.push_str(HEADER.trim_start());
|
||||||
|
|
||||||
|
let decl_type = "&[(char, [char; 3])]";
|
||||||
|
|
||||||
|
file.push_str(&format!(
|
||||||
|
"static LOWERCASE_TABLE: {} = &[{}];",
|
||||||
|
decl_type,
|
||||||
|
fmt_list(data.to_lower.iter().map(to_mapping))
|
||||||
|
));
|
||||||
|
file.push_str("\n\n");
|
||||||
|
file.push_str(&format!(
|
||||||
|
"static UPPERCASE_TABLE: {} = &[{}];",
|
||||||
|
decl_type,
|
||||||
|
fmt_list(data.to_upper.iter().map(to_mapping))
|
||||||
|
));
|
||||||
|
file
|
||||||
|
}
|
||||||
|
|
||||||
|
fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) {
|
||||||
|
(
|
||||||
|
CharEscape(std::char::from_u32(*key).unwrap()),
|
||||||
|
[
|
||||||
|
CharEscape(std::char::from_u32(*a).unwrap()),
|
||||||
|
CharEscape(std::char::from_u32(*b).unwrap()),
|
||||||
|
CharEscape(std::char::from_u32(*c).unwrap()),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Newtype whose `Debug` output is the wrapped char as a quoted, escaped Rust
/// character literal (e.g. `'\n'`), ready to be embedded in generated source.
struct CharEscape(char);

impl fmt::Debug for CharEscape {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // escape_default produces a source-compatible escape sequence.
        write!(f, "'{}'", self.0.escape_default())
    }
}
|
||||||
|
|
||||||
|
/// Skeleton of the generated `conversions` module: binary-search-backed
/// `to_lower`/`to_upper` over the `LOWERCASE_TABLE`/`UPPERCASE_TABLE` statics
/// appended by `generate_case_mapping`. The `\\0` escapes are intentional:
/// they must survive into the *generated* source as `'\0'` literals.
static HEADER: &str = "
pub fn to_lower(c: char) -> [char; 3] {
    match bsearch_case_table(c, LOWERCASE_TABLE) {
        None => [c, '\\0', '\\0'],
        Some(index) => LOWERCASE_TABLE[index].1,
    }
}

pub fn to_upper(c: char) -> [char; 3] {
    match bsearch_case_table(c, UPPERCASE_TABLE) {
        None => [c, '\\0', '\\0'],
        Some(index) => UPPERCASE_TABLE[index].1,
    }
}

fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
    table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
}
";
|
261
src/tools/unicode-table-generator/src/main.rs
Normal file
261
src/tools/unicode-table-generator/src/main.rs
Normal file
@ -0,0 +1,261 @@
|
|||||||
|
use std::collections::{BTreeMap, HashMap};
|
||||||
|
use std::ops::Range;
|
||||||
|
use ucd_parse::Codepoints;
|
||||||
|
|
||||||
|
mod case_mapping;
|
||||||
|
mod raw_emitter;
|
||||||
|
mod unicode_download;
|
||||||
|
|
||||||
|
use raw_emitter::{emit_codepoints, RawEmitter};
|
||||||
|
|
||||||
|
/// The character properties a lookup table is generated for. These names are
/// matched against property / general-category names parsed out of the UCD
/// files in `load_data` (note: "N" aggregates the Nd/Nl/No categories there).
static PROPERTIES: &[&str] = &[
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "Cased",
    "Case_Ignorable",
    "Grapheme_Extend",
    "White_Space",
    "Cc",
    "N",
];
|
||||||
|
|
||||||
|
/// Parsed Unicode data shared by the table emitters.
struct UnicodeData {
    // Property name -> sorted, merged codepoint ranges carrying that property
    // (sorted by name in `load_data` for deterministic output).
    ranges: Vec<(&'static str, Vec<Range<u32>>)>,
    // Codepoint -> up-to-three-codepoint uppercase mapping, zero-padded.
    to_upper: BTreeMap<u32, (u32, u32, u32)>,
    // Codepoint -> up-to-three-codepoint lowercase mapping, zero-padded.
    to_lower: BTreeMap<u32, (u32, u32, u32)>,
}
|
||||||
|
|
||||||
|
fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
|
||||||
|
let mut a = None;
|
||||||
|
let mut b = None;
|
||||||
|
let mut c = None;
|
||||||
|
|
||||||
|
for codepoint in codepoints {
|
||||||
|
if origin == codepoint.value() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
if a.is_none() {
|
||||||
|
a = Some(codepoint.value());
|
||||||
|
} else if b.is_none() {
|
||||||
|
b = Some(codepoint.value());
|
||||||
|
} else if c.is_none() {
|
||||||
|
c = Some(codepoint.value());
|
||||||
|
} else {
|
||||||
|
panic!("more than 3 mapped codepoints")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
|
||||||
|
}
|
||||||
|
|
||||||
|
static UNICODE_DIRECTORY: &str = "unicode-downloads";
|
||||||
|
|
||||||
|
fn load_data() -> UnicodeData {
|
||||||
|
unicode_download::fetch_latest();
|
||||||
|
|
||||||
|
let mut properties = HashMap::new();
|
||||||
|
for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
|
||||||
|
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
|
||||||
|
properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
|
||||||
|
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
|
||||||
|
properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut to_lower = BTreeMap::new();
|
||||||
|
let mut to_upper = BTreeMap::new();
|
||||||
|
for row in ucd_parse::UnicodeDataExpander::new(
|
||||||
|
ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
|
||||||
|
) {
|
||||||
|
let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
|
||||||
|
"N"
|
||||||
|
} else {
|
||||||
|
row.general_category.as_str()
|
||||||
|
};
|
||||||
|
if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
|
||||||
|
properties
|
||||||
|
.entry(*name)
|
||||||
|
.or_insert_with(Vec::new)
|
||||||
|
.push(Codepoints::Single(row.codepoint));
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(mapped) = row.simple_lowercase_mapping {
|
||||||
|
if mapped != row.codepoint {
|
||||||
|
to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(mapped) = row.simple_uppercase_mapping {
|
||||||
|
if mapped != row.codepoint {
|
||||||
|
to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
|
||||||
|
if !row.conditions.is_empty() {
|
||||||
|
// Skip conditional case mappings
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let key = row.codepoint.value();
|
||||||
|
if let Some(lower) = to_mapping(key, row.lowercase) {
|
||||||
|
to_lower.insert(key, lower);
|
||||||
|
}
|
||||||
|
if let Some(upper) = to_mapping(key, row.uppercase) {
|
||||||
|
to_upper.insert(key, upper);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
|
||||||
|
.into_iter()
|
||||||
|
.map(|(k, v)| {
|
||||||
|
(
|
||||||
|
k,
|
||||||
|
v.into_iter()
|
||||||
|
.flat_map(|codepoints| match codepoints {
|
||||||
|
Codepoints::Single(c) => c
|
||||||
|
.scalar()
|
||||||
|
.map(|ch| (ch as u32..ch as u32 + 1))
|
||||||
|
.into_iter()
|
||||||
|
.collect::<Vec<_>>(),
|
||||||
|
Codepoints::Range(c) => c
|
||||||
|
.into_iter()
|
||||||
|
.flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
|
||||||
|
.collect::<Vec<_>>(),
|
||||||
|
})
|
||||||
|
.collect::<Vec<Range<u32>>>(),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
for ranges in properties.values_mut() {
|
||||||
|
merge_ranges(ranges);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut properties = properties.into_iter().collect::<Vec<_>>();
|
||||||
|
properties.sort_by_key(|p| p.0);
|
||||||
|
UnicodeData { ranges: properties, to_lower, to_upper }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let write_location = std::env::args().nth(1).unwrap_or_else(|| {
|
||||||
|
eprintln!("Must provide path to write unicode tables to");
|
||||||
|
eprintln!(
|
||||||
|
"e.g. {} src/libcore/unicode/unicode_data.rs",
|
||||||
|
std::env::args().nth(0).unwrap_or_default()
|
||||||
|
);
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
let unicode_data = load_data();
|
||||||
|
let ranges_by_property = &unicode_data.ranges;
|
||||||
|
|
||||||
|
let mut total_bytes = 0;
|
||||||
|
let mut modules = Vec::new();
|
||||||
|
for (property, ranges) in ranges_by_property {
|
||||||
|
let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
|
||||||
|
let mut emitter = RawEmitter::new();
|
||||||
|
emit_codepoints(&mut emitter, &ranges);
|
||||||
|
|
||||||
|
modules.push((property.to_lowercase().to_string(), emitter.file));
|
||||||
|
println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
|
||||||
|
total_bytes += emitter.bytes_used;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut table_file = String::new();
|
||||||
|
|
||||||
|
table_file.push_str(
|
||||||
|
"///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
|
||||||
|
);
|
||||||
|
|
||||||
|
table_file.push_str("use super::range_search;\n\n");
|
||||||
|
|
||||||
|
table_file.push_str(&version());
|
||||||
|
|
||||||
|
table_file.push('\n');
|
||||||
|
|
||||||
|
modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
|
||||||
|
|
||||||
|
for (name, contents) in modules {
|
||||||
|
table_file.push_str("#[rustfmt::skip]\n");
|
||||||
|
table_file.push_str(&format!("pub mod {} {{\n", name));
|
||||||
|
for line in contents.lines() {
|
||||||
|
if !line.trim().is_empty() {
|
||||||
|
table_file.push_str(" ");
|
||||||
|
table_file.push_str(&line);
|
||||||
|
}
|
||||||
|
table_file.push('\n');
|
||||||
|
}
|
||||||
|
table_file.push_str("}\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
|
||||||
|
|
||||||
|
println!("Total table sizes: {} bytes", total_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn version() -> String {
|
||||||
|
let mut out = String::new();
|
||||||
|
out.push_str("pub const UNICODE_VERSION: (u32, u32, u32) = ");
|
||||||
|
|
||||||
|
let readme =
|
||||||
|
std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let prefix = "for Version ";
|
||||||
|
let start = readme.find(prefix).unwrap() + prefix.len();
|
||||||
|
let end = readme.find(" of the Unicode Standard.").unwrap();
|
||||||
|
let version =
|
||||||
|
readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
|
||||||
|
let [major, minor, micro] = [version[0], version[1], version[2]];
|
||||||
|
|
||||||
|
out.push_str(&format!("({}, {}, {});\n", major, minor, micro));
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Renders `values` as a comma-separated `Debug` list, wrapped below roughly
/// 98 columns with a four-space continuation indent — the interior of a
/// generated `static` array literal.
fn fmt_list<V>(values: impl IntoIterator<Item = V>) -> String
where
    V: std::fmt::Debug,
{
    let mut out = String::new();
    let mut current = String::from("\n    ");
    for value in values {
        let piece = format!("{:?}, ", value);
        if current.len() + piece.len() < 98 {
            current.push_str(&piece);
        } else {
            // Line would grow too long: flush it and start a fresh one.
            out.push_str(current.trim_end());
            out.push('\n');
            current = format!("    {}", piece);
        }
    }
    out.push_str(current.trim_end());
    out.push('\n');
    out
}
|
||||||
|
|
||||||
|
/// Collapses adjacent ranges (`a..b` followed by `b..c`) into single ranges
/// (`a..c`), iterating to a fixed point. Input is expected sorted by start.
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    // Nothing to merge for zero or one range; this also prevents the
    // `ranges.len() - 1` underflow panic on an empty vector.
    if ranges.len() < 2 {
        return;
    }
    loop {
        let mut new_ranges = Vec::new();
        let mut idx_iter = 0..(ranges.len() - 1);
        // Whether the final range was merged into its predecessor; if so it
        // must NOT be appended again after the loop (doing so duplicated the
        // last range whenever the final two ranges were adjacent).
        let mut last_merged = false;
        while let Some(idx) = idx_iter.next() {
            let cur = ranges[idx].clone();
            let next = ranges[idx + 1].clone();
            if cur.end == next.start {
                // Merge `next` into `cur` and skip it in the iteration.
                if idx_iter.next().is_none() {
                    last_merged = true;
                }
                new_ranges.push(cur.start..next.end);
            } else {
                new_ranges.push(cur);
            }
        }
        if !last_merged {
            new_ranges.push(ranges.last().unwrap().clone());
        }
        // A fixed point is reached once a full pass merges nothing.
        let done = new_ranges.len() == ranges.len();
        *ranges = new_ranges;
        if done {
            break;
        }
    }
}
|
170
src/tools/unicode-table-generator/src/raw_emitter.rs
Normal file
170
src/tools/unicode-table-generator/src/raw_emitter.rs
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
//! This implements the core logic of the compression scheme used to compactly
|
||||||
|
//! encode the Unicode character classes.
|
||||||
|
//!
|
||||||
|
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
|
||||||
|
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
|
||||||
|
//! over 17 kilobytes of data per character set -- way too much for our
|
||||||
|
//! purposes.
|
||||||
|
//!
|
||||||
|
//! We have two primary goals with the encoding: we want to be compact, because
|
||||||
|
//! these tables often end up in ~every Rust program (especially the
|
||||||
|
//! grapheme_extend table, used for str debugging), including those for embedded
|
||||||
|
//! targets (where space is important). We also want to be relatively fast,
|
||||||
|
//! though this is more of a nice to have rather than a key design constraint.
|
||||||
|
//! In practice, due to modern processor design these two are closely related.
|
||||||
|
//!
|
||||||
|
//! The encoding scheme here compresses the bitset by first deduplicating the
|
||||||
|
//! "words" (64 bits on all platforms). In practice very few words are present
|
||||||
|
//! in most data sets.
|
||||||
|
//!
|
||||||
|
//! This gives us an array that maps `u8 -> word` (if we ever went beyond 256
|
||||||
|
//! words, we could go to u16 -> word or have some dual compression scheme
|
||||||
|
//! mapping into two separate sets; currently this is not dealt with).
|
||||||
|
//!
|
||||||
|
//! With that scheme, we now have a single byte for every 64 codepoints. We
|
||||||
|
//! further group these by 16 (arbitrarily chosen), and again deduplicate and
|
||||||
|
//! store in an array (u8 -> [u8; 16]).
|
||||||
|
//!
|
||||||
|
//! The indices into this array represent ranges of 64*16 = 1024 codepoints.
|
||||||
|
//!
|
||||||
|
//! This already reduces the top-level array to at most 1,086 bytes, but in
|
||||||
|
//! practice we usually can encode in far fewer (the first couple Unicode planes
|
||||||
|
//! are dense).
|
||||||
|
//!
|
||||||
|
//! The last byte of this top-level array is pulled out to a separate static
|
||||||
|
//! and trailing zeros are dropped; this is simply because grapheme_extend and
|
||||||
|
//! case_ignorable have a single entry in the 896th entry, so this shrinks them
|
||||||
|
//! down considerably.
|
||||||
|
|
||||||
|
use crate::fmt_list;
|
||||||
|
use std::collections::{BTreeSet, HashMap};
|
||||||
|
use std::convert::TryFrom;
|
||||||
|
use std::fmt::Write;
|
||||||
|
use std::ops::Range;
|
||||||
|
|
||||||
|
/// Accumulates the generated source for one property module (`file`) together
/// with the number of static-table bytes that source will occupy (`bytes_used`).
pub struct RawEmitter {
    pub file: String,
    pub bytes_used: usize,
}
|
||||||
|
|
||||||
|
impl RawEmitter {
|
||||||
|
pub fn new() -> RawEmitter {
|
||||||
|
RawEmitter { file: String::new(), bytes_used: 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn blank_line(&mut self) {
|
||||||
|
if self.file.is_empty() || self.file.ends_with("\n\n") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
writeln!(&mut self.file, "").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn emit_bitset(&mut self, words: &[u64]) {
|
||||||
|
let unique_words =
|
||||||
|
words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
|
||||||
|
if unique_words.len() > u8::max_value() as usize {
|
||||||
|
panic!("cannot pack {} into 8 bits", unique_words.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
let word_indices = unique_words
|
||||||
|
.iter()
|
||||||
|
.cloned()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
|
||||||
|
.collect::<HashMap<_, _>>();
|
||||||
|
|
||||||
|
let mut idx = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
|
||||||
|
let chunk_length = 16;
|
||||||
|
for _ in 0..(chunk_length - (idx.len() % chunk_length)) {
|
||||||
|
assert_eq!(unique_words[0], 0, "first word is all zeros");
|
||||||
|
// pad out bitset index with zero words so we have all chunks of 16
|
||||||
|
idx.push(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut chunks = BTreeSet::new();
|
||||||
|
for chunk in idx.chunks(chunk_length) {
|
||||||
|
chunks.insert(chunk);
|
||||||
|
}
|
||||||
|
let chunk_map = chunks
|
||||||
|
.clone()
|
||||||
|
.into_iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(idx, chunk)| (chunk, idx))
|
||||||
|
.collect::<HashMap<_, _>>();
|
||||||
|
let mut chunk_indices = Vec::new();
|
||||||
|
for chunk in idx.chunks(chunk_length) {
|
||||||
|
chunk_indices.push(chunk_map[chunk]);
|
||||||
|
}
|
||||||
|
writeln!(
|
||||||
|
&mut self.file,
|
||||||
|
"static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
|
||||||
|
chunk_indices.len() - 1,
|
||||||
|
chunk_indices.pop().unwrap(),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
self.bytes_used += 3;
|
||||||
|
// Strip out the empty pieces, presuming our above pop() made us now
|
||||||
|
// have some trailing zeros.
|
||||||
|
assert_eq!(unique_words[0], 0, "first word is all zeros");
|
||||||
|
while let Some(0) = chunk_indices.last() {
|
||||||
|
chunk_indices.pop();
|
||||||
|
}
|
||||||
|
writeln!(
|
||||||
|
&mut self.file,
|
||||||
|
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
|
||||||
|
chunk_indices.len(),
|
||||||
|
fmt_list(&chunk_indices),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
self.bytes_used += chunk_indices.len();
|
||||||
|
writeln!(
|
||||||
|
&mut self.file,
|
||||||
|
"static BITSET_INDEX_CHUNKS: [[u8; 16]; {}] = [{}];",
|
||||||
|
chunks.len(),
|
||||||
|
fmt_list(chunks.iter()),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
self.bytes_used += 16 * chunks.len();
|
||||||
|
writeln!(
|
||||||
|
&mut self.file,
|
||||||
|
"static BITSET: [u64; {}] = [{}];",
|
||||||
|
unique_words.len(),
|
||||||
|
fmt_list(&unique_words),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
self.bytes_used += 8 * unique_words.len();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn emit_lookup(&mut self) {
|
||||||
|
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
|
||||||
|
writeln!(&mut self.file, " super::range_search(",).unwrap();
|
||||||
|
writeln!(&mut self.file, " c as u32,").unwrap();
|
||||||
|
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
|
||||||
|
writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap();
|
||||||
|
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
|
||||||
|
writeln!(&mut self.file, " &BITSET,").unwrap();
|
||||||
|
writeln!(&mut self.file, " )").unwrap();
|
||||||
|
writeln!(&mut self.file, "}}").unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
|
||||||
|
emitter.blank_line();
|
||||||
|
|
||||||
|
let last_code_point = ranges.last().unwrap().end;
|
||||||
|
// bitset for every bit in the codepoint range
|
||||||
|
//
|
||||||
|
// + 2 to ensure an all zero word to use for padding
|
||||||
|
let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
|
||||||
|
for range in ranges {
|
||||||
|
for codepoint in range.clone() {
|
||||||
|
let bucket = codepoint as usize / 64;
|
||||||
|
let bit = codepoint as u64 % 64;
|
||||||
|
buckets[bucket] |= 1 << bit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
emitter.emit_bitset(&buckets);
|
||||||
|
emitter.blank_line();
|
||||||
|
emitter.emit_lookup();
|
||||||
|
}
|
42
src/tools/unicode-table-generator/src/unicode_download.rs
Normal file
42
src/tools/unicode-table-generator/src/unicode_download.rs
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
use crate::UNICODE_DIRECTORY;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/";
|
||||||
|
|
||||||
|
static README: &str = "ReadMe.txt";
|
||||||
|
|
||||||
|
static RESOURCES: &[&str] =
|
||||||
|
&["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"];
|
||||||
|
|
||||||
|
pub fn fetch_latest() {
|
||||||
|
let directory = Path::new(UNICODE_DIRECTORY);
|
||||||
|
if let Err(e) = std::fs::create_dir_all(directory) {
|
||||||
|
if e.kind() != std::io::ErrorKind::AlreadyExists {
|
||||||
|
panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap();
|
||||||
|
if !output.status.success() {
|
||||||
|
panic!(
|
||||||
|
"Failed to run curl to fetch readme: stderr: {}",
|
||||||
|
String::from_utf8_lossy(&output.stderr)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
let current = std::fs::read_to_string(directory.join(README)).unwrap_or_default();
|
||||||
|
if current.as_bytes() != &output.stdout[..] {
|
||||||
|
std::fs::write(directory.join(README), output.stdout).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
for resource in RESOURCES {
|
||||||
|
let output = Command::new("curl").arg(URL_PREFIX.to_owned() + resource).output().unwrap();
|
||||||
|
if !output.status.success() {
|
||||||
|
panic!(
|
||||||
|
"Failed to run curl to fetch {}: stderr: {}",
|
||||||
|
resource,
|
||||||
|
String::from_utf8_lossy(&output.stderr)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
std::fs::write(directory.join(resource), output.stdout).unwrap();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user