rust/src/lib/ExtFmt.rs
2011-04-19 16:57:13 +02:00

557 lines
16 KiB
Rust

/* The 'fmt' extension is modeled on the posix printf system.
*
* A posix conversion ostensibly looks like this:
*
* %[parameter][flags][width][.precision][length]type
*
* Given the different numeric type bestiary we have, we omit the 'length'
* parameter and support slightly different conversions for 'type':
*
* %[parameter][flags][width][.precision]type
*
* we also only support translating-to-rust a tiny subset of the possible
* combinations at the moment.
*/
import option.none;
import option.some;
/*
* We have a CT (compile-time) module that parses format strings into a
* sequence of conversions. From those conversions AST fragments are built
* that call into properly-typed functions in the RT (run-time) module. Each
* of those run-time conversion functions accepts another conversion
* description that specifies how to format its output.
*
* The building of the AST is currently done in a module inside the compiler,
* but should migrate over here as the plugin interface is defined.
*/
// Functions used by the fmt extension at compile time
mod CT {
tag signedness {
signed;
unsigned;
}
tag caseness {
case_upper;
case_lower;
}
tag ty {
ty_bool;
ty_str;
ty_char;
ty_int(signedness);
ty_bits;
ty_hex(caseness);
// FIXME: More types
}
tag flag {
flag_left_justify;
flag_left_zero_pad;
flag_space_for_sign;
flag_sign_always;
flag_alternate;
}
tag count {
count_is(int);
count_is_param(int);
count_is_next_param;
count_implied;
}
// A formatted conversion from an expression to a string
type conv = rec(option.t[int] param,
vec[flag] flags,
count width,
count precision,
ty ty);
// A fragment of the output sequence
tag piece {
piece_string(str);
piece_conv(conv);
}
fn parse_fmt_string(str s) -> vec[piece] {
let vec[piece] pieces = vec();
auto lim = _str.byte_len(s);
auto buf = "";
fn flush_buf(str buf, &vec[piece] pieces) -> str {
if (_str.byte_len(buf) > 0u) {
auto piece = piece_string(buf);
pieces += vec(piece);
}
ret "";
}
auto i = 0u;
while (i < lim) {
auto curr = _str.substr(s, i, 1u);
if (_str.eq(curr, "%")) {
i += 1u;
if (i >= lim) {
log_err "unterminated conversion at end of string";
fail;
}
auto curr2 = _str.substr(s, i, 1u);
if (_str.eq(curr2, "%")) {
i += 1u;
} else {
buf = flush_buf(buf, pieces);
auto res = parse_conversion(s, i, lim);
pieces += vec(res._0);
i = res._1;
}
} else {
buf += curr;
i += 1u;
}
}
buf = flush_buf(buf, pieces);
ret pieces;
}
fn peek_num(str s, uint i, uint lim) -> option.t[tup(uint, uint)] {
if (i >= lim) {
ret none[tup(uint, uint)];
}
auto c = s.(i);
if (!('0' as u8 <= c && c <= '9' as u8)) {
ret option.none[tup(uint, uint)];
}
auto n = (c - ('0' as u8)) as uint;
alt (peek_num(s, i + 1u, lim)) {
case (none[tup(uint, uint)]) {
ret some[tup(uint, uint)](tup(n, i + 1u));
}
case (some[tup(uint, uint)](?next)) {
auto m = next._0;
auto j = next._1;
ret some[tup(uint, uint)](tup(n * 10u + m, j));
}
}
}
fn parse_conversion(str s, uint i, uint lim) -> tup(piece, uint) {
auto parm = parse_parameter(s, i, lim);
auto flags = parse_flags(s, parm._1, lim);
auto width = parse_count(s, flags._1, lim);
auto prec = parse_precision(s, width._1, lim);
auto ty = parse_type(s, prec._1, lim);
ret tup(piece_conv(rec(param = parm._0,
flags = flags._0,
width = width._0,
precision = prec._0,
ty = ty._0)),
ty._1);
}
fn parse_parameter(str s, uint i, uint lim) -> tup(option.t[int], uint) {
if (i >= lim) {
ret tup(none[int], i);
}
auto num = peek_num(s, i, lim);
alt (num) {
case (none[tup(uint, uint)]) {
ret tup(none[int], i);
}
case (some[tup(uint, uint)](?t)) {
auto n = t._0;
auto j = t._1;
if (j < lim && s.(j) == '$' as u8) {
ret tup(some[int](n as int), j + 1u);
}
else {
ret tup(none[int], i);
}
}
}
}
fn parse_flags(str s, uint i, uint lim) -> tup(vec[flag], uint) {
let vec[flag] noflags = vec();
if (i >= lim) {
ret tup(noflags, i);
}
fn more_(flag f, str s, uint i, uint lim) -> tup(vec[flag], uint) {
auto next = parse_flags(s, i + 1u, lim);
auto rest = next._0;
auto j = next._1;
let vec[flag] curr = vec(f);
ret tup(curr + rest, j);
}
auto more = bind more_(_, s, i, lim);
auto f = s.(i);
if (f == ('-' as u8)) {
ret more(flag_left_justify);
} else if (f == ('0' as u8)) {
ret more(flag_left_zero_pad);
} else if (f == (' ' as u8)) {
ret more(flag_space_for_sign);
} else if (f == ('+' as u8)) {
ret more(flag_sign_always);
} else if (f == ('#' as u8)) {
ret more(flag_alternate);
} else {
ret tup(noflags, i);
}
}
fn parse_count(str s, uint i, uint lim) -> tup(count, uint) {
if (i >= lim) {
ret tup(count_implied, i);
}
if (s.(i) == ('*' as u8)) {
auto param = parse_parameter(s, i + 1u, lim);
auto j = param._1;
alt (param._0) {
case (none[int]) {
ret tup(count_is_next_param, j);
}
case (some[int](?n)) {
ret tup(count_is_param(n), j);
}
}
} else {
auto num = peek_num(s, i, lim);
alt (num) {
case (none[tup(uint, uint)]) {
ret tup(count_implied, i);
}
case (some[tup(uint, uint)](?num)) {
ret tup(count_is(num._0 as int), num._1);
}
}
}
}
fn parse_precision(str s, uint i, uint lim) -> tup(count, uint) {
if (i >= lim) {
ret tup(count_implied, i);
}
if (s.(i) == '.' as u8) {
auto count = parse_count(s, i + 1u, lim);
// If there were no digits specified, i.e. the precision
// was ".", then the precision is 0
alt (count._0) {
case (count_implied) {
ret tup(count_is(0), count._1);
}
case (_) {
ret count;
}
}
} else {
ret tup(count_implied, i);
}
}
fn parse_type(str s, uint i, uint lim) -> tup(ty, uint) {
if (i >= lim) {
log_err "missing type in conversion";
fail;
}
auto t;
auto tstr = _str.substr(s, i, 1u);
if (_str.eq(tstr, "b")) {
t = ty_bool;
} else if (_str.eq(tstr, "s")) {
t = ty_str;
} else if (_str.eq(tstr, "c")) {
t = ty_char;
} else if (_str.eq(tstr, "d")
|| _str.eq(tstr, "i")) {
// TODO: Do we really want two signed types here?
// How important is it to be printf compatible?
t = ty_int(signed);
} else if (_str.eq(tstr, "u")) {
t = ty_int(unsigned);
} else if (_str.eq(tstr, "x")) {
t = ty_hex(case_lower);
} else if (_str.eq(tstr, "X")) {
t = ty_hex(case_upper);
} else if (_str.eq(tstr, "t")) {
t = ty_bits;
} else {
log_err "unknown type in conversion";
fail;
}
ret tup(t, i + 1u);
}
}
// Functions used by the fmt extension at runtime. For now there are a lot of
// decisions made a runtime. If it proves worthwhile then some of these
// conditions can be evaluated at compile-time. For now though it's cleaner to
// implement it this way, I think.
mod RT {
tag flag {
flag_left_justify;
flag_left_zero_pad;
flag_space_for_sign;
flag_sign_always;
flag_alternate;
// FIXME: This is a hack to avoid creating 0-length vec exprs,
// which have some difficulty typechecking currently. See
// comments in front.extfmt.make_flags
flag_none;
}
tag count {
count_is(int);
count_implied;
}
tag ty {
ty_default;
ty_bits;
ty_hex_upper;
ty_hex_lower;
}
// FIXME: May not want to use a vector here for flags;
// instead just use a bool per flag
type conv = rec(vec[flag] flags,
count width,
count precision,
ty ty);
fn conv_int(&conv cv, int i) -> str {
auto radix = 10u;
auto prec = get_int_precision(cv);
auto s = int_to_str_prec(i, radix, prec);
if (0 <= i) {
if (have_flag(cv.flags, flag_sign_always)) {
s = "+" + s;
} else if (have_flag(cv.flags, flag_space_for_sign)) {
s = " " + s;
}
}
ret pad(cv, s, pad_signed);
}
fn conv_uint(&conv cv, uint u) -> str {
auto prec = get_int_precision(cv);
auto res;
alt (cv.ty) {
case (ty_default) {
res = uint_to_str_prec(u, 10u, prec);
}
case (ty_hex_lower) {
res = uint_to_str_prec(u, 16u, prec);
}
case (ty_hex_upper) {
res = _str.to_upper(uint_to_str_prec(u, 16u, prec));
}
case (ty_bits) {
res = uint_to_str_prec(u, 2u, prec);
}
}
ret pad(cv, res, pad_unsigned);
}
fn conv_bool(&conv cv, bool b) -> str {
auto s;
if (b) {
s = "true";
} else {
s = "false";
}
// Run the boolean conversion through the string conversion logic,
// giving it the same rules for precision, etc.
ret conv_str(cv, s);
}
fn conv_char(&conv cv, char c) -> str {
ret conv_str(cv, _str.from_char(c));
}
fn conv_str(&conv cv, str s) -> str {
auto unpadded = s;
alt (cv.precision) {
case (count_implied) {
}
case (count_is(?max)) {
// For strings, precision is the maximum characters displayed
if (max as uint < _str.char_len(s)) {
// FIXME: substr works on bytes, not chars!
unpadded = _str.substr(s, 0u, max as uint);
}
}
}
ret pad(cv, unpadded, pad_nozero);
}
// Convert an int to string with minimum number of digits. If precision is
// 0 and num is 0 then the result is the empty string.
fn int_to_str_prec(int num, uint radix, uint prec) -> str {
if (num < 0) {
ret "-" + uint_to_str_prec((-num) as uint, radix, prec);
} else {
ret uint_to_str_prec(num as uint, radix, prec);
}
}
// Convert a uint to string with a minimum number of digits. If precision
// is 0 and num is 0 then the result is the empty string. Could move this
// to _uint, but it doesn't seem all that useful.
fn uint_to_str_prec(uint num, uint radix, uint prec) -> str {
auto s;
if (prec == 0u && num == 0u) {
s = "";
} else {
s = _uint.to_str(num, radix);
auto len = _str.char_len(s);
if (len < prec) {
auto diff = prec - len;
auto pad = str_init_elt('0', diff);
s = pad + s;
}
}
ret s;
}
fn get_int_precision(&conv cv) -> uint {
alt (cv.precision) {
case (count_is(?c)) {
ret c as uint;
}
case (count_implied) {
ret 1u;
}
}
}
// FIXME: This might be useful in _str, but needs to be utf8 safe first
fn str_init_elt(char c, uint n_elts) -> str {
auto svec = _vec.init_elt[u8](c as u8, n_elts);
// FIXME: Using unsafe_from_bytes because rustboot
// can't figure out the is_utf8 predicate on from_bytes?
ret _str.unsafe_from_bytes(svec);
}
tag pad_mode {
pad_signed;
pad_unsigned;
pad_nozero;
}
fn pad(&conv cv, str s, pad_mode mode) -> str {
auto uwidth;
alt (cv.width) {
case (count_implied) {
ret s;
}
case (count_is(?width)) {
// FIXME: Maybe width should be uint
uwidth = width as uint;
}
}
auto strlen = _str.char_len(s);
if (uwidth <= strlen) {
ret s;
}
auto padchar = ' ';
auto diff = uwidth - strlen;
if (have_flag(cv.flags, flag_left_justify)) {
auto padstr = str_init_elt(padchar, diff);
ret s + padstr;
}
auto might_zero_pad = false;
auto signed = false;
alt (mode) {
case (pad_nozero) {
// fallthrough
}
case (pad_signed) {
might_zero_pad = true;
signed = true;
}
case (pad_unsigned) {
might_zero_pad = true;
}
}
fn have_precision(&conv cv) -> bool {
alt (cv.precision) {
case (count_implied) {
ret false;
}
case (_) {
ret true;
}
}
}
auto zero_padding = false;
if (might_zero_pad
&& have_flag(cv.flags, flag_left_zero_pad)
&& !have_precision(cv)) {
padchar = '0';
zero_padding = true;
}
auto padstr = str_init_elt(padchar, diff);
// This is completely heinous. If we have a signed value then
// potentially rip apart the intermediate result and insert some
// zeros. It may make sense to convert zero padding to a precision
// instead.
if (signed
&& zero_padding
&& _str.byte_len(s) > 0u
&& s.(0) == '-' as u8) {
auto bytelen = _str.byte_len(s);
auto numpart = _str.substr(s, 1u, bytelen - 1u);
ret "-" + padstr + numpart;
} else {
ret padstr + s;
}
}
fn have_flag(vec[flag] flags, flag f) -> bool {
for (flag candidate in flags) {
if (candidate == f) {
ret true;
}
}
ret false;
}
}
// Local Variables:
// mode: rust;
// fill-column: 78;
// indent-tabs-mode: nil
// c-basic-offset: 4
// buffer-file-coding-system: utf-8-unix
// compile-command: "make -k -C .. 2>&1 | sed -e 's/\\/x\\//x:\\//g'";
// End: