e7d16580f5
Use the same procedure as Python to determine whether a character is printable, as described in [PEP 3138]. In particular, this means that the following character classes are escaped:

- Cc (Other, Control)
- Cf (Other, Format)
- Cs (Other, Surrogate), even though they can't appear in Rust strings
- Co (Other, Private Use)
- Cn (Other, Not Assigned)
- Zl (Separator, Line)
- Zp (Separator, Paragraph)
- Zs (Separator, Space), except for the ASCII space `' '` (`0x20`)

This allows for user-friendly inspection of strings that are not English (e.g. compare `"\u{e9}\u{e8}\u{ea}"` to `"éèê"`).

Fixes #34318.

[PEP 3138]: https://www.python.org/dev/peps/pep-3138/
155 lines
4.4 KiB
Python
155 lines
4.4 KiB
Python
#!/usr/bin/env python
|
|
#
|
|
# Copyright 2011-2016 The Rust Project Developers. See the COPYRIGHT
|
|
# file at the top-level directory of this distribution and at
|
|
# http://rust-lang.org/COPYRIGHT.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
# option. This file may not be copied, modified, or distributed
|
|
# except according to those terms.
|
|
|
|
# This script uses the following Unicode tables:
|
|
# - Categories.txt
|
|
|
|
import os
|
|
import subprocess
|
|
|
|
def to_ranges(iter):
    """Group a stream of integers into half-open ``(start, end)`` runs.

    A value extends the run in progress only when it equals that run's
    exclusive end.  The values 0x10000 and 0x20000 always open a new run,
    so a run never continues across either of those boundaries.

    Yields one ``(start, end)`` tuple per run, in input order; yields
    nothing for an empty input.
    """
    forced_breaks = (0x10000, 0x20000)
    start = None
    end = None
    for value in iter:
        continues_run = (
            start is not None
            and value == end
            and value not in forced_breaks
        )
        if continues_run:
            end += 1
            continue
        # Flush the finished run (if any) before opening a new one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    # Emit the trailing run that the loop left open.
    if start is not None:
        yield (start, end)
|
|
|
|
def get_escaped(dictionary):
    """Yield, in ascending order, every code point that must be escaped.

    ``dictionary`` maps a code point (int) to its category string; code
    points missing from it are treated as "Cn".  A code point is yielded
    when its category is one of Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs, with the
    single exception of the ASCII space character, which is never yielded.
    """
    escaped_categories = "Cc Cf Cs Co Cn Zl Zp Zs".split()
    ascii_space = ord(' ')
    for code_point in range(0x110000):
        if code_point == ascii_space:
            # The plain space is in an escaped category but is exempt.
            continue
        category = dictionary.get(code_point, "Cn")
        if category in escaped_categories:
            yield code_point
|
|
|
|
def get_file(f):
    """Return an open text-mode handle for the file named by URL ``f``.

    The file is looked up in the current working directory under the last
    path component of ``f``.  If it is not there yet, it is downloaded by
    invoking ``curl -O`` on ``f`` and then opened.

    Raises ``subprocess.CalledProcessError`` when curl exits with a
    non-zero status (``check=True``).  The caller owns the returned handle.
    """
    local_name = os.path.basename(f)
    try:
        handle = open(local_name)
    except FileNotFoundError:
        # Not cached locally: fetch it, then open the fresh copy.
        subprocess.run(["curl", "-O", f], check=True)
        handle = open(local_name)
    return handle
|
|
|
|
def main():
    """Print the generated Rust source for ``is_printable`` to stdout.

    Reads ``Categories.txt`` through ``get_file``; on each line the first
    whitespace-separated field is a hexadecimal code point and the second
    is its category string.  The escaped code points produced by
    ``get_escaped`` are grouped into runs by ``to_ranges`` and sorted into:

    - ``SINGLETONS0`` / ``SINGLETONS1``: individual code points from runs
      of length 1 or 2 below 0x10000 / in 0x10000..0x1ffff;
    - ``NORMAL0`` / ``NORMAL1``: ``(start, length)`` pairs for longer runs
      in the same two areas;
    - explicit ``if`` range checks for every run starting at 0x20000 or
      above.

    Values destined for the ``*1`` tables have bit 16 cleared so that they
    fit the ``u16`` tables of the generated code.
    """
    CUTOFF = 0x10000

    # Parse the category table and release the file handle right away.
    with get_file("http://www.unicode.org/notes/tn36/Categories.txt") as file:
        dictionary = {}
        for line in file:
            fields = line.split()
            if not fields:
                # A blank line carries no entry; skip it instead of
                # failing with IndexError.
                continue
            dictionary[int(fields[0], 16)] = fields[1]

    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    for a, b in to_ranges(get_escaped(dictionary)):
        length = b - a
        if a >= 2 * CUTOFF:
            # Everything from 0x20000 upwards is emitted as an explicit
            # range check.  The comparison must be inclusive: a short run
            # starting exactly at 0x20000 has bit 16 clear and would
            # otherwise land in SINGLETONS0 with a value too large for u16.
            extra.append((a, length))
        elif length in (1, 2):
            # Short runs are stored as individual code points.
            if a & CUTOFF:
                for code_point in range(a, b):
                    singletons1.append(code_point & ~CUTOFF)
            else:
                for code_point in range(a, b):
                    singletons0.append(code_point)
        elif a & CUTOFF:
            normal1.append((a & ~CUTOFF, length))
        else:
            normal0.append((a, length))

    # NOTE(review): the Rust text below is reproduced exactly as it appears
    # in the reviewed copy of this file, which shows no indentation inside
    # the template -- confirm against the checked-in script.
    print("""\
// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "src/etc/char_private.py",
// do not edit directly!

use slice::SliceExt;

fn check(x: u16, singletons: &[u16], normal: &[u16]) -> bool {
for &s in singletons {
if x == s {
return false;
} else if x < s {
break;
}
}
for w in normal.chunks(2) {
let start = w[0];
let len = w[1];
let difference = (x as i32) - (start as i32);
if 0 <= difference {
if difference < len as i32 {
return false;
}
} else {
break;
}
}
true
}

pub fn is_printable(x: char) -> bool {
let x = x as u32;
let lower = x as u16;
if x < 0x10000 {
check(lower, SINGLETONS0, NORMAL0)
} else if x < 0x20000 {
check(lower, SINGLETONS1, NORMAL1)
} else {\
""")
    # One explicit range test per run at or above 0x20000.
    for a, b in extra:
        print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
        print(" return false;")
        print(" }")
    print("""\
true
}
}\
""")
    print()
    print("const SINGLETONS0: &'static [u16] = &[")
    for s in singletons0:
        print(" 0x{:x},".format(s))
    print("];")
    print("const SINGLETONS1: &'static [u16] = &[")
    for s in singletons1:
        print(" 0x{:x},".format(s))
    print("];")
    print("const NORMAL0: &'static [u16] = &[")
    for a, b in normal0:
        print(" 0x{:x}, 0x{:x},".format(a, b))
    print("];")
    print("const NORMAL1: &'static [u16] = &[")
    for a, b in normal1:
        print(" 0x{:x}, 0x{:x},".format(a, b))
    print("];")
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|