rust/clippy_lints/src/unicode.rs

use rustc::lint::*;
use rustc::hir::*;
use syntax::ast::{LitKind, NodeId};
use syntax::codemap::Span;
use unicode_normalization::UnicodeNormalization;
use utils::{is_allowed, snippet, span_help_and_lint};

/// **What it does:** Checks for the Unicode zero-width space (U+200B) in
/// string literals.
///
/// **Why is this bad?** Having an invisible character in the code makes for all
/// sorts of April fools, but otherwise is very much frowned upon.
///
/// **Known problems:** None.
///
/// **Example:** You don't see it, but there may be a zero-width space
/// somewhere in this text.
declare_clippy_lint! {
    pub ZERO_WIDTH_SPACE,
    correctness,
    "using a zero-width space in a string literal, which is confusing"
}

/// **What it does:** Checks for non-ASCII characters in string literals.
///
/// **Why is this bad?** Yeah, we know, the 90's called and wanted their charset
/// back. Even so, there still are editors and other programs out there that
/// don't work well with Unicode. So if the code is meant to be used
/// internationally, on multiple operating systems, or has other portability
/// requirements, activating this lint could be useful.
///
/// **Known problems:** None.
///
/// **Example:**
/// ```rust
/// let x = "Hä?";
/// // the same string using a `\u` escape instead:
/// let x = "H\u{e4}?";
/// ```
declare_clippy_lint! {
    pub NON_ASCII_LITERAL,
    pedantic,
    "using any literal non-ASCII chars in a string literal instead of \
     using the `\\u` escape"
}

/// **What it does:** Checks for string literals that contain Unicode in a form
/// that is not equal to its
/// [NFC-recomposition](http://www.unicode.org/reports/tr15/#Norm_Forms).
///
/// **Why is this bad?** If such a string is compared to another, the results
/// may be surprising.
///
/// **Known problems:** None.
///
/// **Example:** You may not see it, but “à” and “à” aren't the same string. The
/// former when escaped is actually `"a\u{300}"` while the latter is `"\u{e0}"`.
declare_clippy_lint! {
    pub UNICODE_NOT_NFC,
    pedantic,
    "using a unicode literal not in NFC normal form (see \
     [unicode tr15](http://www.unicode.org/reports/tr15/) for further information)"
}


/// Lint pass that checks string literals for zero-width spaces, non-ASCII
/// characters, and text that is not in NFC form.
#[derive(Copy, Clone)]
pub struct Unicode;

impl LintPass for Unicode {
    /// Lists the three lints that this pass can emit.
    fn get_lints(&self) -> LintArray {
        lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL, UNICODE_NOT_NFC)
    }
}

impl<'a, 'tcx> LateLintPass<'a, 'tcx> for Unicode {
    /// Forwards every string-literal expression to `check_str`; all other
    /// expressions are ignored.
    fn check_expr(&mut self, cx: &LateContext<'a, 'tcx>, expr: &'tcx Expr) {
        // Only literal expressions are of interest.
        let literal = match expr.node {
            ExprLit(ref literal) => literal,
            _ => return,
        };
        // Of those, only string literals are checked.
        if let LitKind::Str(..) = literal.node {
            check_str(cx, literal.span, expr.id);
        }
    }
}

/// Builds a string from `s` in which every non-ASCII character is replaced by
/// its `\u{...}` escape; ASCII characters are copied through unchanged.
fn escape<T: Iterator<Item = char>>(s: T) -> String {
    s.fold(String::new(), |mut escaped, ch| {
        if ch.is_ascii() {
            escaped.push(ch);
        } else {
            // `escape_unicode` yields the characters of the `\u{NNNN}` form.
            escaped.extend(ch.escape_unicode());
        }
        escaped
    })
}

/// Runs the three Unicode lints against the text of one string literal.
///
/// The text is obtained with `snippet(cx, span, "")`; `id` is the node passed
/// to `is_allowed` when one lint's behaviour depends on another's level.
///
/// * `ZERO_WIDTH_SPACE` is reported when the text contains U+200B.
/// * `NON_ASCII_LITERAL` is reported when the text contains any non-ASCII
///   character. The suggested replacement is built from the NFC form of the
///   text unless `UNICODE_NOT_NFC` is allowed for `id`.
/// * `UNICODE_NOT_NFC` is only considered when `NON_ASCII_LITERAL` is allowed
///   for `id`, and is reported when the text and its NFC form differ at some
///   paired character position.
fn check_str(cx: &LateContext, span: Span, id: NodeId) {
    let text = snippet(cx, span, "");

    if text.contains('\u{200B}') {
        let escaped = text.replace("\u{200B}", "\\u{200B}");
        let help = format!("Consider replacing the string with:\n\"{}\"", escaped);
        span_help_and_lint(cx, ZERO_WIDTH_SPACE, span, "zero-width space detected", &help);
    }

    if !text.chars().all(|ch| ch.is_ascii()) {
        // Normalize the suggestion too when the NFC lint is active, so the
        // proposed replacement does not itself violate that lint.
        let escaped = if is_allowed(cx, UNICODE_NOT_NFC, id) {
            escape(text.chars())
        } else {
            escape(text.nfc())
        };
        let help = format!("Consider replacing the string with:\n\"{}\"", escaped);
        span_help_and_lint(
            cx,
            NON_ASCII_LITERAL,
            span,
            "literal non-ASCII character detected",
            &help,
        );
    }

    if is_allowed(cx, NON_ASCII_LITERAL, id) {
        // Walk the written text and its NFC form in lockstep, looking for the
        // first pair of characters that disagree.
        let differs_from_nfc = text
            .chars()
            .zip(text.nfc())
            .any(|(written, normalized)| written != normalized);
        if differs_from_nfc {
            let normalized = text.nfc().collect::<String>();
            let help = format!("Consider replacing the string with:\n\"{}\"", normalized);
            span_help_and_lint(cx, UNICODE_NOT_NFC, span, "non-nfc unicode sequence detected", &help);
        }
    }
}
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`use rustc::lint::*;`
Rustup to 1.9.0-nightly (bf5da36f1 2016-04-06) 2016-04-07 10:46:48 -05:00			`use rustc::hir::*;`
Rustup 2017-08-11 07:11:46 -05:00			`use syntax::ast::{LitKind, NodeId};`
Rustfmt and sort all `use` items 2016-02-24 10:38:57 -06:00			`use syntax::codemap::Span;`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`use unicode_normalization::UnicodeNormalization;`
Run nightly rustfmt 2017-09-05 04:33:04 -05:00			`use utils::{is_allowed, snippet, span_help_and_lint};`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// What it does: Checks for the Unicode zero-width space in the code.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// Why is this bad? Having an invisible character in the code makes for all`
			`/// sorts of April fools, but otherwise is very much frowned upon.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// Known problems: None.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Rustfmt 2017-08-09 02:30:56 -05:00			`/// Example: You don't see it, but there may be a zero-width space`
			`/// somewhere in this text.`
Categorize all the lints! 2018-03-28 08:24:26 -05:00			`declare_clippy_lint! {`
Make lint descriptions short and to the point; always fitting the column "triggers on". 2016-08-06 03:18:36 -05:00			`pub ZERO_WIDTH_SPACE,`
Categorize all the lints! 2018-03-28 08:24:26 -05:00			`correctness,`
Fix util/update_wiki.py warnings and be consistent in declare_lint! invocations 2016-02-05 17:13:29 -06:00			`"using a zero-width space in a string literal, which is confusing"`
			`}`

Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// What it does: Checks for non-ASCII characters in string literals.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// Why is this bad? Yeah, we know, the 90's called and wanted their charset`
			`/// back. Even so, there still are editors and other programs out there that`
			`/// don't work well with Unicode. So if the code is meant to be used`
			`/// internationally, on multiple operating systems, or has other portability`
			`/// requirements, activating this lint could be useful.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// Known problems: None.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Improve docs 2016-07-15 17:25:44 -05:00			`/// Example:`
			/// ```rust
			`/// let x = "Hä?"`
			/// ```
Categorize all the lints! 2018-03-28 08:24:26 -05:00			`declare_clippy_lint! {`
Make lint descriptions short and to the point; always fitting the column "triggers on". 2016-08-06 03:18:36 -05:00			`pub NON_ASCII_LITERAL,`
Categorize all the lints! 2018-03-28 08:24:26 -05:00			`pedantic,`
Make lint descriptions short and to the point; always fitting the column "triggers on". 2016-08-06 03:18:36 -05:00			`"using any literal non-ASCII chars in a string literal instead of \`
			using the `\\u` escape"
Fix util/update_wiki.py warnings and be consistent in declare_lint! invocations 2016-02-05 17:13:29 -06:00			`}`

Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// What it does: Checks for string literals that contain Unicode in a form`
			`/// that is not equal to its`
			`/// [NFC-recomposition](http://www.unicode.org/reports/tr15/#Norm_Forms).`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// Why is this bad? If such a string is compared to another, the results`
			`/// may be surprising.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// Known problems None.`
added wiki comments + wiki-generating python script 2015-12-10 18:22:27 -06:00			`///`
Make the lint docstrings more consistent. 2016-08-06 02:55:04 -05:00			`/// Example: You may not see it, but “à” and “à” aren't the same string. The`
			/// former when escaped is actually `"a\u{300}"` while the latter is `"\u{e0}"`.
Categorize all the lints! 2018-03-28 08:24:26 -05:00			`declare_clippy_lint! {`
Make lint descriptions short and to the point; always fitting the column "triggers on". 2016-08-06 03:18:36 -05:00			`pub UNICODE_NOT_NFC,`
Categorize all the lints! 2018-03-28 08:24:26 -05:00			`pedantic,`
Fix util/update_wiki.py warnings and be consistent in declare_lint! invocations 2016-02-05 17:13:29 -06:00			`"using a unicode literal not in NFC normal form (see \`
fix markdown generated from code 2016-05-06 09:09:05 -05:00			`[unicode tr15](http://www.unicode.org/reports/tr15/) for further information)"`
Fix util/update_wiki.py warnings and be consistent in declare_lint! invocations 2016-02-05 17:13:29 -06:00			`}`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00
			`#[derive(Copy, Clone)]`
			`pub struct Unicode;`

			`impl LintPass for Unicode {`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`fn get_lints(&self) -> LintArray {`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`lint_array!(ZERO_WIDTH_SPACE, NON_ASCII_LITERAL, UNICODE_NOT_NFC)`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`
Upgrade Rust to rustc 1.5.0-nightly (cff041170 2015-09-17) LintPass was split and ExprParen was removed from the HIR Fixes #338 2015-09-18 21:53:04 -05:00			`}`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00
update to the rust-PR that unblocks clippy 2016-12-07 06:13:40 -06:00			`impl<'a, 'tcx> LateLintPass<'a, 'tcx> for Unicode {`
			`fn check_expr(&mut self, cx: &LateContext<'a, 'tcx>, expr: &'tcx Expr) {`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`if let ExprLit(ref lit) = expr.node {`
fix nightly breakage 2016-02-12 11:35:44 -06:00			`if let LitKind::Str(_, _) = lit.node {`
Rustup 2017-08-11 07:11:46 -05:00			`check_str(cx, lit.span, expr.id)`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`}`
			`}`
			`}`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`

fmt clippy 2016-01-03 22:26:12 -06:00			`fn escape<T: Iterator<Item = char>>(s: T) -> String {`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`let mut result = String::new();`
			`for c in s {`
unicode: add lint against non-ascii chars in literals (Allow by default), #85 2015-08-12 13:36:10 -05:00			`if c as u32 > 0x7F {`
fmt clippy 2016-01-03 22:26:12 -06:00			`for d in c.escape_unicode() {`
			`result.push(d)`
			`}`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`} else {`
			`result.push(c);`
all: whitespace cleanup * 4-space indentation * no trailing whitespace * no tabs 2015-08-11 13:22:20 -05:00			`}`
			`}`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`result`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`

Rustup 2017-08-11 07:11:46 -05:00			`fn check_str(cx: &LateContext, span: Span, id: NodeId) {`
fixed dogfood by using snippet instead of the (escaped) literal string 2015-09-04 07:24:49 -05:00			`let string = snippet(cx, span, "");`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`if string.contains('\u{200B}') {`
Rustfmt 2017-08-09 02:30:56 -05:00			`span_help_and_lint(`
			`cx,`
			`ZERO_WIDTH_SPACE,`
			`span,`
			`"zero-width space detected",`
			`&format!(`
			`"Consider replacing the string with:\n\"{}\"",`
			`string.replace("\u{200B}", "\\u{200B}")`
			`),`
			`);`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`}`
			`if string.chars().any(\|c\| c as u32 > 0x7F) {`
Rustfmt 2017-08-09 02:30:56 -05:00			`span_help_and_lint(`
			`cx,`
			`NON_ASCII_LITERAL,`
			`span,`
			`"literal non-ASCII character detected",`
			`&format!(`
			`"Consider replacing the string with:\n\"{}\"",`
Rustup 2017-08-11 07:11:46 -05:00			`if is_allowed(cx, UNICODE_NOT_NFC, id) {`
Rustfmt 2017-08-09 02:30:56 -05:00			`escape(string.chars())`
			`} else {`
			`escape(string.nfc())`
			`}`
			`),`
			`);`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`}`
Rustup 2017-08-11 07:11:46 -05:00			`if is_allowed(cx, NON_ASCII_LITERAL, id) && string.chars().zip(string.nfc()).any(\|(a, b)\| a != b) {`
Rustfmt 2017-08-09 02:30:56 -05:00			`span_help_and_lint(`
			`cx,`
			`UNICODE_NOT_NFC,`
			`span,`
			`"non-nfc unicode sequence detected",`
			`&format!("Consider replacing the string with:\n\"{}\"", string.nfc().collect::<String>()),`
			`);`
Unicode lints, second attempt: Lint whole strings, help with replacement 2015-09-04 02:08:07 -05:00			`}`
first unicode lint: zero_width_space 2015-06-11 04:35:00 -05:00			`}`