rust/src/lib/ExtFmt.rs

/* The 'fmt' extension is modeled on the posix printf system.
 *
 * A posix conversion ostensibly looks like this:
 *
 * %[parameter][flags][width][.precision][length]type
 *
 * Given the different numeric type bestiary we have, we omit the 'length'
 * parameter and support slightly different conversions for 'type':
 *
 * %[parameter][flags][width][.precision]type
 *
 * we also only support translating-to-rust a tiny subset of the possible
 * combinations at the moment.
 */

import option.none;
import option.some;

/*
 * We have a CT (compile-time) module that parses format strings into a
 * sequence of conversions. From those conversions AST fragments are built
 * that call into properly-typed functions in the RT (run-time) module.  Each
 * of those run-time conversion functions accepts another conversion
 * description that specifies how to format its output.
 *
 * The building of the AST is currently done in a module inside the compiler,
 * but should migrate over here as the plugin interface is defined.
 */

// Functions used by the fmt extension at compile time
mod CT {
    // Signedness of an integer conversion; carried by ty_int. parse_type
    // produces `signed` for "d" and "i", and `unsigned` for "u".
    tag signedness {
        signed;
        unsigned;
    }

    // Letter case of a hexadecimal conversion; carried by ty_hex.
    // parse_type produces `case_lower` for "x" and `case_upper` for "X".
    tag caseness {
        case_upper;
        case_lower;
    }

    // The conversion type, i.e. the final character of a conversion
    // specification. As recognized by parse_type:
    //   "b" -> ty_bool, "s" -> ty_str, "c" -> ty_char,
    //   "d" / "i" -> ty_int(signed), "u" -> ty_int(unsigned),
    //   "t" -> ty_bits, "x" / "X" -> ty_hex(case_lower / case_upper)
    tag ty {
        ty_bool;
        ty_str;
        ty_char;
        ty_int(signedness);
        ty_bits;
        ty_hex(caseness);
        // FIXME: More types
    }

    // Formatting flags that may follow the optional parameter in a
    // conversion. As recognized by parse_flags:
    //   '-' -> flag_left_justify, '0' -> flag_left_zero_pad,
    //   ' ' -> flag_space_for_sign, '+' -> flag_sign_always,
    //   '#' -> flag_alternate
    tag flag {
        flag_left_justify;
        flag_left_zero_pad;
        flag_space_for_sign;
        flag_sign_always;
        flag_alternate;
    }

    // A width or precision as written in the format string. As produced by
    // parse_count / parse_precision:
    //   count_is(n)         - a literal number was given
    //   count_is_param(n)   - "*" followed by an explicit "n$" parameter
    //   count_is_next_param - a bare "*"
    //   count_implied       - nothing was specified
    tag count {
        count_is(int);
        count_is_param(int);
        count_is_next_param;
        count_implied;
    }

    // A formatted conversion from an expression to a string, as assembled
    // by parse_conversion: the optional explicit "N$" parameter index, the
    // flags in the order they appeared, the width, the precision and the
    // conversion type.
    type conv = rec(option.t[int] param,
                    vec[flag] flags,
                    count width,
                    count precision,
                    ty ty);

    // A fragment of the output sequence produced by parse_fmt_string:
    // either a run of literal text (piece_string) or a parsed conversion
    // specification (piece_conv).
    tag piece {
        piece_string(str);
        piece_conv(conv);
    }

    // Splits the format string s into a sequence of pieces: runs of literal
    // text (piece_string) and conversion specifications (piece_conv).
    //
    // A '%' introduces a conversion, which is handed to parse_conversion;
    // the sequence "%%" denotes a single literal '%'. Fails if the string
    // ends immediately after a '%'. Malformed conversions fail inside the
    // helpers called by parse_conversion.
    fn parse_fmt_string(str s) -> vec[piece] {
        let vec[piece] pieces = vec();
        auto lim = _str.byte_len(s);
        // Literal text accumulated since the last conversion.
        auto buf = "";

        // Appends buf to pieces as a piece_string if it is non-empty and
        // returns the fresh (empty) buffer for the caller to continue with.
        fn flush_buf(str buf, &vec[piece] pieces) -> str {
            if (_str.byte_len(buf) > 0u) {
                auto piece = piece_string(buf);
                pieces += vec(piece);
            }
            ret "";
        }

        auto i = 0u;
        while (i < lim) {
            auto curr = _str.substr(s, i, 1u);
            if (_str.eq(curr, "%")) {
                i += 1u;
                if (i >= lim) {
                    log "unterminated conversion at end of string";
                    fail;
                }
                auto curr2 = _str.substr(s, i, 1u);
                if (_str.eq(curr2, "%")) {
                    // "%%" is an escaped percent sign: keep it as literal
                    // text instead of silently dropping it.
                    buf += curr2;
                    i += 1u;
                } else {
                    // Emit any pending literal text before the conversion
                    // so that the pieces stay in source order.
                    buf = flush_buf(buf, pieces);
                    auto res = parse_conversion(s, i, lim);
                    pieces += vec(res._0);
                    i = res._1;
                }
            } else {
                buf += curr;
                i += 1u;
            }
        }
        buf = flush_buf(buf, pieces);
        ret pieces;
    }

    // Reads the run of ASCII decimal digits that starts at byte index i of
    // s, looking no further than lim.
    //
    // Returns some(tup(value, next)) where value is the number the digits
    // spell out and next is the index just past the last digit, or none if
    // there is no digit at i (including when i >= lim).
    //
    // The digits are accumulated left to right. Combining recursively as
    // "first digit * 10 + value of the rest" gives the wrong answer for
    // anything longer than two digits (e.g. "123" would come out as 33).
    //
    // NOTE(review): no overflow check is made on the accumulated value.
    fn peek_num(str s, uint i, uint lim) -> option.t[tup(uint, uint)] {
        auto j = i;
        auto accum = 0u;
        auto scanning = true;

        while (scanning && j < lim) {
            auto c = s.(j);
            if ('0' as u8 <= c && c <= '9' as u8) {
                accum = accum * 10u + ((c - ('0' as u8)) as uint);
                j += 1u;
            } else {
                scanning = false;
            }
        }

        // No progress means there was no digit at the starting position.
        if (j == i) {
            ret none[tup(uint, uint)];
        }

        ret some[tup(uint, uint)](tup(accum, j));
    }

    fn parse_conversion(str s, uint i, uint lim) -> tup(piece, uint) {
        auto parm = parse_parameter(s, i, lim);
        auto flags = parse_flags(s, parm._1, lim);
        auto width = parse_count(s, flags._1, lim);
        auto prec = parse_precision(s, width._1, lim);
        auto ty = parse_type(s, prec._1, lim);
        ret tup(piece_conv(rec(param = parm._0,
                               flags = flags._0,
                               width = width._0,
                               precision = prec._0,
                               ty = ty._0)),
                ty._1);
    }

    // Parses an optional explicit parameter reference of the form "N$"
    // starting at index i.
    //
    // Returns some(N) and the index just past the '$' when the reference is
    // present. Otherwise returns none and the unchanged index i, so that
    // any digits seen remain available to later stages (e.g. as a width).
    fn parse_parameter(str s, uint i, uint lim) -> tup(option.t[int], uint) {
        // Result for every "no parameter here" outcome: nothing consumed.
        auto absent = tup(none[int], i);

        if (i >= lim) {
            ret absent;
        }

        alt (peek_num(s, i, lim)) {
            case (some[tup(uint, uint)](?parsed)) {
                auto after_digits = parsed._1;
                // The digits only name a parameter if a '$' follows them.
                if (after_digits < lim && s.(after_digits) == '$' as u8) {
                    ret tup(some[int](parsed._0 as int), after_digits + 1u);
                }
                ret absent;
            }
            case (none[tup(uint, uint)]) {
                ret absent;
            }
        }
    }

    // Parses zero or more flag characters ('-', '0', ' ', '+', '#')
    // starting at index i.
    //
    // Returns the flags in the order in which they appear together with the
    // index of the first byte that is not a flag. Repeated flags are kept
    // as written; no de-duplication is performed.
    fn parse_flags(str s, uint i, uint lim) -> tup(vec[flag], uint) {
        let vec[flag] empty = vec();

        if (i >= lim) {
            ret tup(empty, i);
        }

        // Classify the current byte first, then recurse at most once.
        let option.t[flag] found = none[flag];
        auto c = s.(i);
        if (c == ('-' as u8)) {
            found = some[flag](flag_left_justify);
        } else if (c == ('0' as u8)) {
            found = some[flag](flag_left_zero_pad);
        } else if (c == (' ' as u8)) {
            found = some[flag](flag_space_for_sign);
        } else if (c == ('+' as u8)) {
            found = some[flag](flag_sign_always);
        } else if (c == ('#' as u8)) {
            found = some[flag](flag_alternate);
        }

        alt (found) {
            case (none[flag]) {
                // Not a flag character: stop here without consuming it.
                ret tup(empty, i);
            }
            case (some[flag](?f)) {
                // Prepend this flag to whatever flags follow it.
                auto rest = parse_flags(s, i + 1u, lim);
                let vec[flag] head = vec(f);
                ret tup(head + rest._0, rest._1);
            }
        }
    }

    // Parses a width-style count starting at index i and returns it along
    // with the index of the first byte after it.
    //
    //   digits  -> count_is(value)
    //   "*N$"   -> count_is_param(N)
    //   "*"     -> count_is_next_param
    //   nothing -> count_implied (index unchanged)
    fn parse_count(str s, uint i, uint lim) -> tup(count, uint) {
        if (i >= lim) {
            ret tup(count_implied, i);
        }

        if (s.(i) != ('*' as u8)) {
            // A literal count, if any digits are present at all.
            alt (peek_num(s, i, lim)) {
                case (some[tup(uint, uint)](?digits)) {
                    ret tup(count_is(digits._0 as int), digits._1);
                }
                case (none[tup(uint, uint)]) {
                    ret tup(count_implied, i);
                }
            }
        } else {
            // The count is supplied by an argument; see whether the format
            // string names which one.
            auto which = parse_parameter(s, i + 1u, lim);
            auto resume = which._1;
            alt (which._0) {
                case (some[int](?n)) {
                    ret tup(count_is_param(n), resume);
                }
                case (none[int]) {
                    ret tup(count_is_next_param, resume);
                }
            }
        }
    }

    fn parse_precision(str s, uint i, uint lim) -> tup(count, uint) {
        if (i >= lim) {
            ret tup(count_implied, i);
        }

        if (s.(i) == '.' as u8) {
            auto count = parse_count(s, i + 1u, lim);
            // If there were no digits specified, i.e. the precision
            // was ".", then the precision is 0
            alt (count._0) {
                case (count_implied) {
                    ret tup(count_is(0), count._1);
                }
                case (_) {
                    ret count;
                }
            }
        } else {
            ret tup(count_implied, i);
        }
    }

    // Parses the conversion type character at index i and returns the
    // corresponding ty along with the index just past it.
    //
    // Fails with a logged message if the string has ended (no type) or if
    // the character is not one of the supported types.
    fn parse_type(str s, uint i, uint lim) -> tup(ty, uint) {
        if (i >= lim) {
            log "missing type in conversion";
            fail;
        }

        auto spec = _str.substr(s, i, 1u);
        auto next = i + 1u;

        if (_str.eq(spec, "b")) {
            ret tup(ty_bool, next);
        } else if (_str.eq(spec, "s")) {
            ret tup(ty_str, next);
        } else if (_str.eq(spec, "c")) {
            ret tup(ty_char, next);
        } else if (_str.eq(spec, "d")
                   || _str.eq(spec, "i")) {
            // TODO: Do we really want two signed types here?
            // How important is it to be printf compatible?
            ret tup(ty_int(signed), next);
        } else if (_str.eq(spec, "u")) {
            ret tup(ty_int(unsigned), next);
        } else if (_str.eq(spec, "x")) {
            ret tup(ty_hex(case_lower), next);
        } else if (_str.eq(spec, "X")) {
            ret tup(ty_hex(case_upper), next);
        } else if (_str.eq(spec, "t")) {
            ret tup(ty_bits, next);
        } else {
            log "unknown type in conversion";
            fail;
        }
    }
}

// Functions used by the fmt extension at runtime. For now there are a lot of
// decisions made at runtime. If it proves worthwhile then some of these
// conditions can be evaluated at compile-time. For now though it's cleaner to
// implement it this way, I think.
mod RT {

    // Run-time formatting flags. The first five variants have the same
    // names as those of CT.flag; have_flag is used to test for them.
    tag flag {
        flag_left_justify;
        flag_left_zero_pad;
        flag_space_for_sign;
        flag_sign_always;
        flag_alternate;
        // FIXME: This is a hack to avoid creating 0-length vec exprs,
        // which have some difficulty typechecking currently. See
        // comments in front.extfmt.make_flags
        flag_none;
    }

    // A run-time width or precision: either an explicit value (count_is)
    // or unspecified (count_implied). Unlike CT.count there are no
    // parameter-referencing variants here.
    tag count {
        count_is(int);
        count_implied;
    }

    // Run-time conversion type. conv_uint uses it to choose the radix:
    // ty_default -> 10, ty_bits -> 2, ty_hex_lower / ty_hex_upper -> 16
    // (with the digits upper-cased for ty_hex_upper).
    tag ty {
        ty_default;
        ty_bits;
        ty_hex_upper;
        ty_hex_lower;
    }

    // Describes how a single value is to be formatted at run time: the
    // flags, the minimum field width, the precision and the conversion
    // type. Each conv_* function takes one of these by alias.
    //
    // FIXME: May not want to use a vector here for flags;
    // instead just use a bool per flag
    type conv = rec(vec[flag] flags,
                    count width,
                    count precision,
                    ty ty);

    // Formats the signed integer i in base 10 according to cv.
    //
    // Non-negative values are prefixed with "+" when flag_sign_always is
    // set, or otherwise with " " when flag_space_for_sign is set. The
    // result is then padded to the field width in pad_signed mode.
    fn conv_int(&conv cv, int i) -> str {
        auto digits = int_to_str_prec(i, 10u, get_int_precision(cv));

        // Sign prefix for non-negative values; negative values already
        // carry their "-" from int_to_str_prec.
        auto prefix = "";
        if (i >= 0) {
            if (have_flag(cv.flags, flag_sign_always)) {
                prefix = "+";
            } else if (have_flag(cv.flags, flag_space_for_sign)) {
                prefix = " ";
            }
        }

        ret pad(cv, prefix + digits, pad_signed);
    }

    fn conv_uint(&conv cv, uint u) -> str {
        auto prec = get_int_precision(cv);
        auto res;
        alt (cv.ty) {
            case (ty_default) {
                res = uint_to_str_prec(u, 10u, prec);
            }
            case (ty_hex_lower) {
                res = uint_to_str_prec(u, 16u, prec);
            }
            case (ty_hex_upper) {
                res = _str.to_upper(uint_to_str_prec(u, 16u, prec));
            }
            case (ty_bits) {
                res = uint_to_str_prec(u, 2u, prec);
            }
        }
        ret pad(cv, res, pad_unsigned);
    }

    // Formats b as the text "true" or "false". The text goes through
    // conv_str, so booleans get the same precision and padding rules as
    // strings.
    fn conv_bool(&conv cv, bool b) -> str {
        if (b) {
            ret conv_str(cv, "true");
        }
        ret conv_str(cv, "false");
    }

    fn conv_char(&conv cv, char c) -> str {
        ret conv_str(cv, _str.from_char(c));
    }

    // Formats the string s according to cv. An explicit precision limits
    // how much of s is shown; the result is then padded to the field width
    // in pad_nozero mode.
    fn conv_str(&conv cv, str s) -> str {
        alt (cv.precision) {
            case (count_is(?max)) {
                // For strings, precision is the maximum characters displayed
                auto limit = max as uint;
                if (limit < _str.char_len(s)) {
                    // FIXME: substr works on bytes, not chars!
                    ret pad(cv, _str.substr(s, 0u, limit), pad_nozero);
                }
            }
            case (count_implied) {
            }
        }
        // No truncation needed: pad the string as it is.
        ret pad(cv, s, pad_nozero);
    }

    // Convert an int to string with minimum number of digits. If precision is
    // 0 and num is 0 then the result is the empty string.
    fn int_to_str_prec(int num, uint radix, uint prec) -> str {
        if (num < 0) {
            ret "-" + uint_to_str_prec((-num) as uint, radix, prec);
        } else {
            ret uint_to_str_prec(num as uint, radix, prec);
        }
    }

    // Convert a uint to string with a minimum number of digits.  If precision
    // is 0 and num is 0 then the result is the empty string. Could move this
    // to _uint, but it doesn't seem all that useful.
    fn uint_to_str_prec(uint num, uint radix, uint prec) -> str {
        auto s;

        if (prec == 0u && num == 0u) {
            s = "";
        } else {
            s = _uint.to_str(num, radix);
            auto len = _str.char_len(s);
            if (len < prec) {
                auto diff = prec - len;
                auto pad = str_init_elt('0', diff);
                s = pad + s;
            }
        }

        ret s;
    }

    fn get_int_precision(&conv cv) -> uint {
        alt (cv.precision) {
            case (count_is(?c)) {
                ret c as uint;
            }
            case (count_implied) {
                ret 1u;
            }
        }
    }

    // Builds a string made of n_elts copies of c, where c is narrowed to a
    // single byte.
    //
    // FIXME: This might be useful in _str, but needs to be utf8 safe first
    fn str_init_elt(char c, uint n_elts) -> str {
        // FIXME: Using unsafe_from_bytes because rustboot
        // can't figure out the is_utf8 predicate on from_bytes?
        ret _str.unsafe_from_bytes(_vec.init_elt[u8](c as u8, n_elts));
    }

    // Tells pad what kind of value is being padded:
    //   pad_signed   - used by conv_int; zero padding is permitted and the
    //                  value is treated as signed
    //   pad_unsigned - used by conv_uint; zero padding is permitted
    //   pad_nozero   - used by conv_str; only ever padded with spaces
    tag pad_mode {
        pad_signed;
        pad_unsigned;
        pad_nozero;
    }

    // Pads s to the field width requested by cv and returns the result.
    //
    //  * With no width (count_implied), or when s already has at least
    //    `width` characters, s is returned unchanged.
    //  * With flag_left_justify the padding is spaces appended on the
    //    right; zero padding is never used in that case.
    //  * Otherwise the padding goes on the left. It consists of '0's when
    //    the mode permits it (pad_signed / pad_unsigned), flag_left_zero_pad
    //    is set and no precision was given; in all other cases it is spaces.
    //  * When zero-padding a signed value, a leading sign character
    //    ('-', '+' or ' ') stays in front of the zeros, i.e. "-0005" and
    //    "+0005" rather than "000-5" and "000+5".
    fn pad(&conv cv, str s, pad_mode mode) -> str {
        auto uwidth;
        alt (cv.width) {
            case (count_implied) {
                ret s;
            }
            case (count_is(?width)) {
                // FIXME: Maybe width should be uint
                uwidth = width as uint;
            }
        }

        auto strlen = _str.char_len(s);
        if (uwidth <= strlen) {
            ret s;
        }

        auto padchar = ' ';
        auto diff = uwidth - strlen;
        if (have_flag(cv.flags, flag_left_justify)) {
            auto padstr = str_init_elt(padchar, diff);
            ret s + padstr;
        }

        auto might_zero_pad = false;
        auto signed = false;

        alt (mode) {
            case (pad_nozero) {
                // fallthrough
            }
            case (pad_signed) {
                might_zero_pad = true;
                signed = true;
            }
            case (pad_unsigned) {
                might_zero_pad = true;
            }
        }

        // True if the conversion carries an explicit precision.
        fn have_precision(&conv cv) -> bool {
            alt (cv.precision) {
                case (count_implied) {
                    ret false;
                }
                case (_) {
                    ret true;
                }
            }
        }

        // An explicit precision overrides the zero-pad flag.
        auto zero_padding = false;
        if (might_zero_pad
            && have_flag(cv.flags, flag_left_zero_pad)
            && !have_precision(cv)) {

            padchar = '0';
            zero_padding = true;
        }

        auto padstr = str_init_elt(padchar, diff);

        // This is completely heinous. If we have a signed value then
        // potentially rip apart the intermediate result and insert some
        // zeros. It may make sense to convert zero padding to a precision
        // instead.
        if (signed
            && zero_padding
            && _str.byte_len(s) > 0u) {

            // The sign may be the '-' of a negative number or the '+' / ' '
            // added for the sign-always / space-for-sign flags; in every
            // case the zeros belong between the sign and the digits.
            auto head = s.(0);
            if (head == '-' as u8
                || head == '+' as u8
                || head == ' ' as u8) {

                auto bytelen = _str.byte_len(s);
                auto headstr = _str.substr(s, 0u, 1u);
                auto numpart = _str.substr(s, 1u, bytelen - 1u);
                ret headstr + padstr + numpart;
            }
        }

        ret padstr + s;
    }

    fn have_flag(vec[flag] flags, flag f) -> bool {
        for (flag candidate in flags) {
            if (candidate == f) {
                ret true;
            }
        }
        ret false;
    }
}

// Local Variables:
// mode: rust;
// fill-column: 78;
// indent-tabs-mode: nil
// c-basic-offset: 4
// buffer-file-coding-system: utf-8-unix
// compile-command: "make -k -C .. 2>&1 | sed -e 's/\\/x\\//x:\\//g'";
// End: