rust/src/unicode.rs

use rustc::lint::*;
use rustc_front::hir::*;
use syntax::codemap::Span;

use syntax::ast::Lit_;

use unicode_normalization::UnicodeNormalization;

use utils::{snippet, span_help_and_lint};

/// **What it does:** This lint checks for the unicode zero-width space (U+200B) in string literals.
///
/// **Why is this bad?** Having an invisible character in the code makes for all sorts of April fools, but otherwise is very much frowned upon.
///
/// **Known problems:** None
///
/// **Example:** You don't see it, but there may be a zero-width space somewhere in this text.
declare_lint! {
    pub ZERO_WIDTH_SPACE, Deny,
    "using a zero-width space in a string literal, which is confusing"
}

/// **What it does:** This lint checks for non-ASCII characters in string literals and suggests replacing them with `\u{..}` escapes.
///
/// **Why is this bad?** Yeah, we know, the 90's called and wanted their charset back. Even so, there still are editors and other programs out there that don't work well with unicode. So if the code is meant to be used internationally, on multiple operating systems, or has other portability requirements, activating this lint could be useful.
///
/// **Known problems:** None
///
/// **Example:** `let x = "Hä?"`
declare_lint! {
    pub NON_ASCII_LITERAL, Allow,
    "using any literal non-ASCII chars in a string literal; suggests \
     using the \\u escape instead"
}

/// **What it does:** This lint checks for string literals that contain unicode in a form that is not equal to its [NFC-recomposition](http://www.unicode.org/reports/tr15/#Norm_Forms).
///
/// **Why is this bad?** If such a string is compared to another, the results may be surprising.
///
/// **Known problems:** None
///
/// **Example:** You may not see it, but "à" and "à" aren't the same string. The former when escaped is actually "a\u{300}" while the latter is "\u{e0}".
declare_lint! {
    pub UNICODE_NOT_NFC, Allow,
    "using a unicode literal not in NFC normal form (see \
     http://www.unicode.org/reports/tr15/ for further information)"
}


/// Lint pass that provides the `ZERO_WIDTH_SPACE`, `NON_ASCII_LITERAL` and
/// `UNICODE_NOT_NFC` lints. It carries no state of its own.
#[derive(Copy, Clone)]
pub struct Unicode;

impl LintPass for Unicode {
    /// Returns the lints this pass can emit: `ZERO_WIDTH_SPACE`,
    /// `NON_ASCII_LITERAL` and `UNICODE_NOT_NFC`.
    fn get_lints(&self) -> LintArray {
        lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL, UNICODE_NOT_NFC)
    }
}

impl LateLintPass for Unicode {
    /// Forwards the span of every string-literal expression to `check_str`.
    ///
    /// Expressions that are not literals, and literals that are not strings,
    /// are ignored.
    fn check_expr(&mut self, cx: &LateContext, expr: &Expr) {
        // Only literal expressions are of interest; bail out on anything else.
        let literal = match expr.node {
            ExprLit(ref literal) => literal,
            _ => return,
        };

        match literal.node {
            Lit_::LitStr(..) => check_str(cx, literal.span),
            _ => (),
        }
    }
}

/// Builds a `String` from the characters of `s`, replacing every character
/// above U+007F with its `\u{...}` escape sequence (as produced by
/// `char::escape_unicode`). ASCII characters are copied through unchanged.
fn escape<T: Iterator<Item = char>>(s: T) -> String {
    s.fold(String::new(), |mut escaped, ch| {
        if ch as u32 <= 0x7F {
            escaped.push(ch);
        } else {
            escaped.extend(ch.escape_unicode());
        }
        escaped
    })
}

/// Runs the three unicode lints of this module on the string literal at `span`.
///
/// The checks operate on the text `snippet` returns for `span`. Every lint
/// that fires carries a help message with a suggested replacement:
///
/// * `ZERO_WIDTH_SPACE` fires when the text contains U+200B.
/// * `NON_ASCII_LITERAL` fires when any character lies above U+007F. The
///   suggestion is built from the NFC form of the text unless
///   `UNICODE_NOT_NFC` is at `Level::Allow`.
/// * `UNICODE_NOT_NFC` is only considered while `NON_ASCII_LITERAL` is at
///   `Level::Allow`, and fires when the text and its NFC form, compared
///   character by character, differ at some position.
fn check_str(cx: &LateContext, span: Span) {
    let literal = snippet(cx, span, "");

    if literal.contains('\u{200B}') {
        let suggestion = literal.replace("\u{200B}", "\\u{200B}");
        let help = format!("Consider replacing the string with:\n\"{}\"", suggestion);
        span_help_and_lint(cx, ZERO_WIDTH_SPACE, span, "zero-width space detected", &help);
    }

    let has_non_ascii = literal.chars().any(|ch| ch as u32 > 0x7F);
    if has_non_ascii {
        // Normalize the suggestion as well when the NFC lint is active.
        let suggestion = if cx.current_level(UNICODE_NOT_NFC) == Level::Allow {
            escape(literal.chars())
        } else {
            escape(literal.nfc())
        };
        let help = format!("Consider replacing the string with:\n\"{}\"", suggestion);
        span_help_and_lint(cx,
                           NON_ASCII_LITERAL,
                           span,
                           "literal non-ASCII character detected",
                           &help);
    }

    // The NFC lint is only evaluated while the non-ASCII lint is allowed.
    let non_ascii_allowed = cx.current_level(NON_ASCII_LITERAL) == Level::Allow;
    if non_ascii_allowed && literal.chars().zip(literal.nfc()).any(|(raw, norm)| raw != norm) {
        let suggestion = literal.nfc().collect::<String>();
        let help = format!("Consider replacing the string with:\n\"{}\"", suggestion);
        span_help_and_lint(cx, UNICODE_NOT_NFC, span, "non-nfc unicode sequence detected", &help);
    }
}
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`use rustc::lint::*;`
Update rust to 0efb9dab8c7c07fa28e9df0eccc5c07ea3c17fbb (HIR+lints, Thu Sep 3 18:59:56 2015 +0530) fixes #294 2015-09-03 09:42:17 -05:00			`use rustc_front::hir::*;`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`use syntax::codemap::Span;`
all: organize imports * remove unused imports * separate external and internal imports * consistent import of rustc::lint * move #[allow(unused_imports)] to local impl 2015-08-16 01:54:43 -05:00
fallout 2016-02-03 08:39:22 -06:00			`use syntax::ast::Lit_;`
Rustup to rustc 1.5.0-nightly (fc4d566b4 2015-09-16) fixes #334 2015-09-16 19:01:41 -05:00
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`use unicode_normalization::UnicodeNormalization;`

fixed dogfood by using snippet instead of the (escaped) literal string 2015-09-04 07:24:49 -05:00			`use utils::{snippet, span_help_and_lint};`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00
Remove redundancy in lint documentation The default level is always given in the declare_lint! macro, no need to add it inconsistently in the documentation. 2016-02-05 17:41:54 -06:00			`/// What it does: This lint checks for the unicode zero-width space in the code.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
			`/// Why is this bad? Having an invisible character in the code makes for all sorts of April fools, but otherwise is very much frowned upon.`
			`///`
			`/// Known problems: None`
			`///`
			`/// Example: You don't see it, but there may be a zero-width space somewhere in this text.`
Fix util/update_wiki.py warnings and be consistent in declare_lint! invocations 2016-02-05 17:13:29 -06:00			`declare_lint! {`
			`pub ZERO_WIDTH_SPACE, Deny,`
			`"using a zero-width space in a string literal, which is confusing"`
			`}`

Remove redundancy in lint documentation The default level is always given in the declare_lint! macro, no need to add it inconsistently in the documentation. 2016-02-05 17:41:54 -06:00			`/// What it does: This lint checks for non-ascii characters in string literals.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
			`/// Why is this bad? Yeah, we know, the 90's called and wanted their charset back. Even so, there still are editors and other programs out there that don't work well with unicode. So if the code is meant to be used internationally, on multiple operating systems, or has other portability requirements, activating this lint could be useful.`
			`///`
			`/// Known problems: None`
			`///`
			/// Example: `let x = "Hä?"`
Fix util/update_wiki.py warnings and be consistent in declare_lint! invocations 2016-02-05 17:13:29 -06:00			`declare_lint! {`
			`pub NON_ASCII_LITERAL, Allow,`
			`"using any literal non-ASCII chars in a string literal; suggests \`
			`using the \\u escape instead"`
			`}`

Remove redundancy in lint documentation The default level is always given in the declare_lint! macro, no need to add it inconsistently in the documentation. 2016-02-05 17:41:54 -06:00			`/// What it does: This lint checks for string literals that contain unicode in a form that is not equal to its [NFC-recomposition](http://www.unicode.org/reports/tr15/#Norm_Forms).`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
			`/// Why is this bad? If such a string is compared to another, the results may be surprising.`
			`///`
			`/// Known problems None`
			`///`
			`/// Example: You may not see it, but "à" and "à" aren't the same string. The former when escaped is actually "a\u{300}" while the latter is "\u{e0}".`
Fix util/update_wiki.py warnings and be consistent in declare_lint! invocations 2016-02-05 17:13:29 -06:00			`declare_lint! {`
			`pub UNICODE_NOT_NFC, Allow,`
			`"using a unicode literal not in NFC normal form (see \`
			`http://www.unicode.org/reports/tr15/ for further information)"`
			`}`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00
			`#[derive(Copy, Clone)]`
			`pub struct Unicode;`

			`impl LintPass for Unicode {`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`fn get_lints(&self) -> LintArray {`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL, UNICODE_NOT_NFC)`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`
Upgrade Rust to rustc 1.5.0-nightly (cff041170 2015-09-17) LintPass was split and ExprParen was removed from the HIR Fixes #338 2015-09-18 21:53:04 -05:00			`}`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00
Upgrade Rust to rustc 1.5.0-nightly (cff041170 2015-09-17) LintPass was split and ExprParen was removed from the HIR Fixes #338 2015-09-18 21:53:04 -05:00			`impl LateLintPass for Unicode {`
			`fn check_expr(&mut self, cx: &LateContext, expr: &Expr) {`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`if let ExprLit(ref lit) = expr.node {`
fallout 2016-02-03 08:39:22 -06:00			`if let Lit_::LitStr(_, _) = lit.node {`
fixed dogfood by using snippet instead of the (escaped) literal string 2015-09-04 07:24:49 -05:00			`check_str(cx, lit.span)`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`}`
			`}`
			`}`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`

fmt clippy 2016-01-03 22:26:12 -06:00			`fn escape<T: Iterator<Item = char>>(s: T) -> String {`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`let mut result = String::new();`
			`for c in s {`
unicode: add lint against non-ascii chars in literals (Allow by default), #85 2015-08-12 13:36:10 -05:00			`if c as u32 > 0x7F {`
fmt clippy 2016-01-03 22:26:12 -06:00			`for d in c.escape_unicode() {`
			`result.push(d)`
			`}`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`} else {`
			`result.push(c);`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`}`
			`}`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`result`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`

Upgrade Rust to rustc 1.5.0-nightly (cff041170 2015-09-17) LintPass was split and ExprParen was removed from the HIR Fixes #338 2015-09-18 21:53:04 -05:00			`fn check_str(cx: &LateContext, span: Span) {`
fixed dogfood by using snippet instead of the (escaped) literal string 2015-09-04 07:24:49 -05:00			`let string = snippet(cx, span, "");`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`if string.contains('\u{200B}') {`
fmt clippy 2016-01-03 22:26:12 -06:00			`span_help_and_lint(cx,`
			`ZERO_WIDTH_SPACE,`
			`span,`
			`"zero-width space detected",`
			`&format!("Consider replacing the string with:\n\"{}\"",`
			`string.replace("\u{200B}", "\\u{200B}")));`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`}`
			`if string.chars().any(\|c\| c as u32 > 0x7F) {`
fmt clippy 2016-01-03 22:26:12 -06:00			`span_help_and_lint(cx,`
			`NON_ASCII_LITERAL,`
			`span,`
			`"literal non-ASCII character detected",`
			`&format!("Consider replacing the string with:\n\"{}\"",`
			`if cx.current_level(UNICODE_NOT_NFC) == Level::Allow {`
			`escape(string.chars())`
			`} else {`
			`escape(string.nfc())`
			`}));`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`}`
fmt clippy 2016-01-03 22:26:12 -06:00			`if cx.current_level(NON_ASCII_LITERAL) == Level::Allow && string.chars().zip(string.nfc()).any(\|(a, b)\| a != b) {`
			`span_help_and_lint(cx,`
			`UNICODE_NOT_NFC,`
			`span,`
			`"non-nfc unicode sequence detected",`
			`&format!("Consider replacing the string with:\n\"{}\"", string.nfc().collect::<String>()));`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`}`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`